# Example #1
# 0
# File: lift.py  Project: nicklein/kgtk
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        unmodified_row_file: KGTKFiles,
        matched_label_file: KGTKFiles,
        unmatched_label_file: KGTKFiles,

        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],

        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,

        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],

        default_value: str,

        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,

        clear_before_lift: bool = False,
        overwrite: bool = False,

        output_only_modified_rows: bool = False,

        languages: typing.Optional[typing.List[str]] = None,
        prioritize: bool = False,

        use_label_envar: bool = False,
        lift_all_columns: bool = False,
        require_label_file: bool = False,
        force_input_mode_none: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Resolve the command-line arguments and run a KgtkLift over them.

    The file arguments are converted to paths with KGTKArgumentParser, the
    reader and value option structures are built from ``kwargs``, and the
    options are optionally echoed before KgtkLift is constructed and its
    ``process()`` method is called.

    Parameters handled in this function:
        use_label_envar: when no label file was given, take the label file
            path from the KGTK_LABEL_FILE environment variable, if set.
        require_label_file: raise if no label file is known after the
            environment variable lookup.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted in this function.
        show_options: echo the resolved options to the error stream.
        kwargs: source for KgtkReaderOptions and KgtkValueOptions.

    All other parameters are forwarded unchanged to KgtkLift.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: when a label file is required but missing, or when
            constructing or running KgtkLift raises any Exception or
            SystemExit (the original exception is chained as the cause).
    """
    # import modules locally
    import os
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(label_file, who="KGTK label file")
    unmodified_row_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_row_file, who="KGTK unmodified row output file")
    matched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(matched_label_file, who="KGTK matched label output file")
    unmatched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmatched_label_file, who="KGTK unmatched label output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.  Only the per-file ("input"/"label")
    # reader options are built, since those are what KgtkLift receives.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    label_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="label", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if label_kgtk_file is not None:
            print("--label-file=%s" % label_kgtk_file, file=error_file, flush=True)
        if unmodified_row_kgtk_file is not None:
            print("--unmodified-row-output-file=%s" % unmodified_row_kgtk_file, file=error_file, flush=True)
        if matched_label_kgtk_file is not None:
            print("--matched-label-output-file=%s" % matched_label_kgtk_file, file=error_file, flush=True)
        if unmatched_label_kgtk_file is not None:
            print("--unmatched-label-output-file=%s" % unmatched_label_kgtk_file, file=error_file, flush=True)

        if input_select_column_name is not None:
            print("--input-select-column=%s" % input_select_column_name, file=error_file, flush=True)
        if input_select_column_value is not None:
            print("--input-select-value=%s" % input_select_column_value, file=error_file, flush=True)
        if input_lifting_column_names is not None and len(input_lifting_column_names) > 0:
            print("--columns-to-lift %s" % " ".join(input_lifting_column_names), file=error_file, flush=True)
        if output_lifted_column_names is not None and len(output_lifted_column_names) > 0:
            print("--columns-to-write %s" % " ".join(output_lifted_column_names), file=error_file, flush=True)

        print("--lift-suffix=%s" % output_lifted_column_suffix, file=error_file, flush=True)
        if output_select_column_value is not None:
            print("--update-select-value=%s" % output_select_column_value, file=error_file, flush=True)

        if label_select_column_name is not None:
            print("--label-select-column=%s" % label_select_column_name, file=error_file, flush=True)
        print("--label-select-value=%s" % label_select_column_value, file=error_file, flush=True)
        if label_match_column_name is not None:
            print("--label-match-column=%s" % label_match_column_name, file=error_file, flush=True)
        if label_value_column_name is not None:
            print("--label-value-column=%s" % label_value_column_name, file=error_file, flush=True)

        print("--default-value=%s" % repr(default_value), file=error_file, flush=True)
        print("--remove-label-records=%s" % repr(remove_label_records), file=error_file, flush=True)
        print("--sort-lifted-labels=%s" % repr(sort_lifted_labels), file=error_file, flush=True)
        print("--suppress-duplicate-labels=%s" % repr(suppress_duplicate_labels), file=error_file, flush=True)
        print("--suppress-empty-columns=%s" % repr(suppress_empty_columns), file=error_file, flush=True)
        print("--ok-if-no-labels=%s" % repr(ok_if_no_labels), file=error_file, flush=True)
        print("--prefilter-labels=%s" % repr(prefilter_labels), file=error_file, flush=True)
        print("--input-file-is-presorted=%s" % repr(input_is_presorted), file=error_file, flush=True)
        print("--label-file-is-presorted=%s" % repr(labels_are_presorted), file=error_file, flush=True)
        print("--clear-before-lift=%s" % repr(clear_before_lift), file=error_file, flush=True)
        print("--overwrite=%s" % repr(overwrite), file=error_file, flush=True)
        print("--output-only-modified-rows=%s" % repr(output_only_modified_rows), file=error_file, flush=True)

        if languages is not None:
            print("--languages %s" % " ".join(repr(l) for l in languages), file=error_file, flush=True)
        print("--prioritize=%s" % repr(prioritize), file=error_file, flush=True)

        print("--use-label-envar=%s" % repr(use_label_envar), file=error_file, flush=True)
        print("--lift-all-columns=%s" % repr(lift_all_columns), file=error_file, flush=True)
        print("--require-label-files=%s" % repr(require_label_file), file=error_file, flush=True)
        print("--force-input-mode-none=%s" % repr(force_input_mode_none), file=error_file, flush=True)
        input_reader_options.show(out=error_file, who="input")
        label_reader_options.show(out=error_file, who="label")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Should the following functionality be moved to KgtkLift?
    # An explicit label file always wins; the environment variable is only
    # consulted when none was given and the caller opted in.
    if label_kgtk_file is None and use_label_envar:
        label_file_envar: str = 'KGTK_LABEL_FILE' # TODO: Move this to a common file.
        label_file_envar_value: typing.Optional[str] = os.getenv(label_file_envar)
        if label_file_envar_value is not None:
            label_kgtk_file = Path(label_file_envar_value)
            if verbose:
                print("Using label file %s from envar %s" % (repr(label_file_envar_value), repr(label_file_envar)), file=error_file, flush=True)

    if require_label_file and label_kgtk_file is None:
        raise KGTKException("A label file must be specified using --label-file or KGTK_LABEL_FILE")

    try:
        kl: KgtkLift = KgtkLift(
            input_file_path=input_kgtk_file,
            label_file_path=label_kgtk_file,
            output_file_path=output_kgtk_file,
            unmodified_row_file_path=unmodified_row_kgtk_file,
            matched_label_file_path=matched_label_kgtk_file,
            unmatched_label_file_path=unmatched_label_kgtk_file,

            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,

            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,

            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,

            default_value=default_value,

            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,

            clear_before_lift=clear_before_lift,
            overwrite=overwrite,

            output_only_modified_rows=output_only_modified_rows,

            languages=languages,
            prioritize=prioritize,

            lift_all_columns=lift_all_columns,
            force_input_mode_none=force_input_mode_none,

            input_reader_options=input_reader_options,
            label_reader_options=label_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kl.process()

        return 0

    except SystemExit as e:
        # Report an exit request from the lifter as a KGTKException,
        # keeping the original exit as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Normalize every failure to KGTKException, chaining the original
        # exception so its traceback is preserved as the cause.
        raise KGTKException(str(e)) from e
# Example #2
# 0
def run(
        input_file: KGTKFiles,
        filter_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        matched_filter_file: KGTKFiles,
        unmatched_filter_file: KGTKFiles,
        join_file: KGTKFiles,
        input_keys: typing.Optional[typing.List[str]],
        filter_keys: typing.Optional[typing.List[str]],
        cache_input: bool = False,
        preserve_order: bool = False,
        presorted: bool = False,
        field_separator: typing.Optional[str] = None,
        left_join: bool = False,
        right_join: bool = False,
        input_prefix: typing.Optional[str] = None,
        filter_prefix: typing.Optional[str] = None,
        join_output: bool = False,
        right_first: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments and run a KgtkIfExists over them.

    The file arguments are converted to paths with KGTKArgumentParser, the
    reader and value option structures are built from ``kwargs``, and the
    options are optionally echoed before KgtkIfExists is constructed (with
    ``invert=False``) and its ``process()`` method is called.

    Parameters handled in this function:
        field_separator: when None, KgtkIfExists.FIELD_SEPARATOR_DEFAULT is
            used instead.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted in this function.
        show_options: echo the resolved options to the error stream.
        kwargs: source for KgtkReaderOptions and KgtkValueOptions.

    All other parameters are forwarded unchanged to KgtkIfExists.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: when both the input file and the filter file are
            "-", or when constructing or running KgtkIfExists raises any
            Exception or SystemExit (the original exception is chained as
            the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifexists import KgtkIfExists
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    filter_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        filter_file, who="KGTK filter file")
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")
    matched_filter_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            matched_filter_file, who="KGTK matched filter file")
    unmatched_filter_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            unmatched_filter_file, who="KGTK unmatched filter file")
    join_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            join_file, who="KGTK join file")

    # "-" is rejected when used for both files, since per the message below
    # it denotes stdin, which only one of them can consume.
    if (str(input_kgtk_file) == "-" and str(filter_kgtk_file) == "-"):
        raise KGTKException(
            "May not use stdin for both --input-file and --filter-on files.")

    field_separator = KgtkIfExists.FIELD_SEPARATOR_DEFAULT if field_separator is None else field_separator

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="filter", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--filter-file=%s" % str(filter_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        if matched_filter_kgtk_file is not None:
            print("--matched-filter-file=%s" % str(matched_filter_kgtk_file),
                  file=error_file)
        if unmatched_filter_kgtk_file is not None:
            print("--unmatched-filter-file=%s" %
                  str(unmatched_filter_kgtk_file),
                  file=error_file)
        if join_kgtk_file is not None:
            print("--join-file=%s" % str(join_kgtk_file), file=error_file)
        if input_keys is not None:
            print("--input-keys=%s" % " ".join(input_keys), file=error_file)
        if filter_keys is not None:
            print("--filter-keys=%s" % " ".join(filter_keys), file=error_file)
        print("--cache-input=%s" % str(cache_input), file=error_file)
        print("--preserve-order=%s" % str(preserve_order), file=error_file)
        print("--presorted=%s" % str(presorted), file=error_file)
        print("--field-separator=%s" % repr(field_separator), file=error_file)
        print("--left-join=%s" % str(left_join), file=error_file)
        print("--right-join=%s" % str(right_join), file=error_file)
        if input_prefix is not None:
            print("--input-prefix=%s" % repr(input_prefix), file=error_file)
        if filter_prefix is not None:
            print("--filter-prefix=%s" % repr(filter_prefix), file=error_file)
        print("--join-output=%s" % str(join_output), file=error_file)
        print("--right-join-first=%s" % str(right_first), file=error_file)
        input_reader_options.show(out=error_file, who="input")
        filter_reader_options.show(out=error_file, who="filter")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ie: KgtkIfExists = KgtkIfExists(
            input_file_path=input_kgtk_file,
            input_keys=input_keys,
            filter_file_path=filter_kgtk_file,
            filter_keys=filter_keys,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            matched_filter_file_path=matched_filter_kgtk_file,
            unmatched_filter_file_path=unmatched_filter_kgtk_file,
            join_file_path=join_kgtk_file,
            left_join=left_join,
            right_join=right_join,
            input_prefix=input_prefix,
            filter_prefix=filter_prefix,
            join_output=join_output,
            right_first=right_first,
            invert=False,
            cache_input=cache_input,
            preserve_order=preserve_order,
            presorted=presorted,
            field_separator=field_separator,
            input_reader_options=input_reader_options,
            filter_reader_options=filter_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ie.process()

        return 0

    except SystemExit as e:
        # Report an exit request from the processor as a KGTKException,
        # keeping the original exit as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Normalize every failure to KGTKException, chaining the original
        # exception so its traceback is preserved as the cause.
        raise KGTKException(str(e)) from e
# Example #3
# 0
# File: unreify_values.py  Project: yyht/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reified_file: KGTKFiles,
        unreified_file: KGTKFiles,
        uninvolved_file: KGTKFiles,
        trigger_label_value: str,
        trigger_node2_value: str,
        value_label_value: str,
        old_label_value: str,
        new_label_value: typing.Optional[str],
        allow_multiple_values: bool,
        allow_extra_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments and run a KgtkUnreifyValues.

    The file arguments are converted to paths with KGTKArgumentParser, the
    reader and value option structures are built from ``kwargs``, and the
    options are optionally echoed before KgtkUnreifyValues is constructed
    and its ``process()`` method is called.

    Parameters handled in this function:
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted in this function.
        show_options: echo the resolved options to the error stream.
        kwargs: source for KgtkReaderOptions and KgtkValueOptions.

    All other parameters are forwarded unchanged to KgtkUnreifyValues.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: when constructing or running KgtkUnreifyValues raises
            any Exception or SystemExit (the original exception is chained
            as the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.unreify.kgtkunreifyvalues import KgtkUnreifyValues
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reified_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reified_file, who="KGTK reified file")
    unreified_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            unreified_file, who="KGTK unreified file")
    uninvolved_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            uninvolved_file, who="KGTK uninvolved file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        # A single input file is resolved above, so it is echoed in the
        # same "--name=value" form as the other file options.
        print("--input-file=%s" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        if reified_kgtk_file is not None:
            print("--reified-file=%s" % str(reified_kgtk_file),
                  file=error_file,
                  flush=True)
        if unreified_kgtk_file is not None:
            print("--unreified-file=%s" % str(unreified_kgtk_file),
                  file=error_file,
                  flush=True)
        if uninvolved_kgtk_file is not None:
            print("--uninvolved-file=%s" % str(uninvolved_kgtk_file),
                  file=error_file,
                  flush=True)

        print("--trigger-label=%s" % trigger_label_value,
              file=error_file,
              flush=True)
        print("--trigger-node2=%s" % trigger_node2_value,
              file=error_file,
              flush=True)
        print("--value-label=%s" % value_label_value,
              file=error_file,
              flush=True)
        print("--old-label=%s" % old_label_value, file=error_file, flush=True)
        if new_label_value is not None:
            print("--new-label=%s" % new_label_value,
                  file=error_file,
                  flush=True)

        print("--allow-multiple-values=%s" % str(allow_multiple_values),
              file=error_file,
              flush=True)
        print("--allow-extra-columns=%s" % str(allow_extra_columns),
              file=error_file,
              flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kuv: KgtkUnreifyValues = KgtkUnreifyValues(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reified_file_path=reified_kgtk_file,
            unreified_file_path=unreified_kgtk_file,
            uninvolved_file_path=uninvolved_kgtk_file,
            trigger_label_value=trigger_label_value,
            trigger_node2_value=trigger_node2_value,
            value_label_value=value_label_value,
            old_label_value=old_label_value,
            new_label_value=new_label_value,
            allow_multiple_values=allow_multiple_values,
            allow_extra_columns=allow_extra_columns,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kuv.process()

        return 0

    except SystemExit as e:
        # Report an exit request from the processor as a KGTKException,
        # keeping the original exit as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Normalize every failure to KGTKException, chaining the original
        # exception so its traceback is preserved as the cause.
        raise KGTKException(str(e)) from e
# Example #4
# 0
# File: implode.py  Project: yyht/kgtk
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,

        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Resolve the command-line arguments and run a KgtkImplode over them.

    The file arguments are converted to paths with KGTKArgumentParser, the
    ID builder, reader and value option structures are built from
    ``kwargs``, and the options are optionally echoed before KgtkImplode is
    constructed and its ``process()`` method is called.

    Parameters handled in this function:
        without_fields: None is replaced by an empty list before being
            passed to KgtkImplode.
        show_data_types: when true, print each value of
            KgtkFormat.DataType.choices() to the error stream and return 0
            without constructing KgtkImplode.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted in this function.
        show_options: echo the resolved options to the error stream.
        kwargs: source for KgtkIdBuilderOptions, KgtkReaderOptions and
            KgtkValueOptions.

    All other parameters are forwarded unchanged to KgtkImplode.

    Returns:
        0 when processing completes or after listing the data types.

    Raises:
        KGTKException: when constructing or running KgtkImplode raises any
            Exception or SystemExit (the original exception is chained as
            the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file, flush=True)

        print("--column %s" % column_name, file=error_file, flush=True)
        print("--prefix %s" % prefix, file=error_file, flush=True)
        print("--overwrite %s" % str(overwrite_column), file=error_file, flush=True)
        print("--validate %s" % str(validate), file=error_file, flush=True)
        print("--escape-pipes %s" % str(escape_pipes), file=error_file, flush=True)
        print("--quantities-include-numbers %s" % str(quantities_include_numbers), file=error_file, flush=True)
        print("--general-strings %s" % str(general_strings), file=error_file, flush=True)
        print("--remove-prefixed-columns %s" % str(remove_prefixed_columns), file=error_file, flush=True)
        print("--ignore-unselected-types %s" % str(ignore_unselected_types), file=error_file, flush=True)
        print("--retain-unselected-types %s" % str(retain_unselected_types), file=error_file, flush=True)
        if type_names is not None:
            print("--types %s" % " ".join(type_names), file=error_file, flush=True)
        if without_fields is not None:
            print("--without %s" % " ".join(without_fields), file=error_file, flush=True)
        print("--show-data-types %s" % str(show_data_types), file=error_file, flush=True)
        print("--quiet %s" % str(quiet), file=error_file, flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Listing the data types is a standalone action: nothing is imploded.
    if show_data_types:
        data_type: str
        for data_type in KgtkFormat.DataType.choices():
            print("%s" % data_type, file=error_file, flush=True)
        return 0

    # KgtkImplode is always handed a list, never None.
    wf: typing.List[str] = without_fields if without_fields is not None else list()

    try:
        ex: KgtkImplode = KgtkImplode(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=wf,
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        ex.process()

        return 0

    except SystemExit as e:
        # Report an exit request from the processor as a KGTKException,
        # keeping the original exit as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Normalize every failure to KGTKException, chaining the original
        # exception so its traceback is preserved as the cause.
        raise KGTKException(str(e)) from e
# Example #5
# 0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        namespace_file: KGTKFiles,
        updated_namespace_file: KGTKFiles,
        namespace_id_prefix: str,
        namespace_id_use_uuid: bool,
        namespace_id_counter: int,
        namespace_id_zfill: int,
        output_only_used_namespaces: bool,
        allow_lax_uri: bool,
        local_namespace_prefix: str,
        local_namespace_use_uuid: bool,
        prefix_expansion_label: str,
        structured_value_label: str,
        structured_uri_label: str,
        newnode_prefix: str,
        newnode_use_uuid: bool,
        newnode_counter: int,
        newnode_zfill: int,
        build_id: bool,
        escape_pipes: bool,
        validate: bool,
        override_uuid: typing.Optional[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments and run a KgtkNtriples over them.

    The file arguments are converted to paths with KGTKArgumentParser
    (``input_file`` becomes a list of paths), the ID builder, reader and
    value option structures are built from ``kwargs``, and the options are
    optionally echoed before KgtkNtriples is constructed and its
    ``process()`` method is called.

    Parameters handled in this function:
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted in this function.
        show_options: echo the resolved options to the error stream.
        kwargs: source for KgtkIdBuilderOptions, KgtkReaderOptions and
            KgtkValueOptions.

    All other parameters are forwarded unchanged to KgtkNtriples.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: when constructing or running KgtkNtriples raises any
            Exception or SystemExit (the original exception is chained as
            the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the input, output, and optional reject file paths.
    input_file_paths: typing.List[
        Path] = KGTKArgumentParser.get_input_file_list(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_file_path: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Resolve the optional namespace input and updated-namespace output.
    namespace_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_input_file(
            namespace_file, who="KGTK namespace file")
    updated_namespace_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            updated_namespace_file, who="KGTK updated namespace file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files %s" %
              " ".join([str(path) for path in input_file_paths]),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        if reject_file_path is not None:
            print("--reject-file=%s" % str(reject_file_path),
                  file=error_file,
                  flush=True)
        if namespace_kgtk_file is not None:
            print("--namespace-file=%s" % str(namespace_kgtk_file),
                  file=error_file,
                  flush=True)
        if updated_namespace_kgtk_file is not None:
            print("--updated-namespace-file=%s" %
                  str(updated_namespace_kgtk_file),
                  file=error_file,
                  flush=True)

        print("--namespace-id-prefix %s" % namespace_id_prefix,
              file=error_file,
              flush=True)
        print("--namespace-id-use-uuid %s" % str(namespace_id_use_uuid),
              file=error_file,
              flush=True)
        print("--namespace-id-counter %s" % str(namespace_id_counter),
              file=error_file,
              flush=True)
        print("--namespace-id-zfill %s" % str(namespace_id_zfill),
              file=error_file,
              flush=True)
        print("--output-only-used-namespaces %s" %
              str(output_only_used_namespaces),
              file=error_file,
              flush=True)

        print("--allow-lax-uri %s" % str(allow_lax_uri),
              file=error_file,
              flush=True)

        print("--local-namespace-prefix %s" % local_namespace_prefix,
              file=error_file,
              flush=True)
        print("--local-namespace-use-uuid %s" % str(local_namespace_use_uuid),
              file=error_file,
              flush=True)

        print("--prefix-expansion-label %s" % prefix_expansion_label,
              file=error_file,
              flush=True)
        print("--structured-value-label %s" % structured_value_label,
              file=error_file,
              flush=True)
        print("--structured-uri-label %s" % structured_uri_label,
              file=error_file,
              flush=True)

        print("--newnode-prefix %s" % newnode_prefix,
              file=error_file,
              flush=True)
        print("--newnode-use-uuid %s" % str(newnode_use_uuid),
              file=error_file,
              flush=True)
        print("--newnode-counter %s" % str(newnode_counter),
              file=error_file,
              flush=True)
        print("--newnode-zfill %s" % str(newnode_zfill),
              file=error_file,
              flush=True)

        print("--build-id=%s" % str(build_id), file=error_file, flush=True)

        print("--escape-pipes=%s" % str(escape_pipes),
              file=error_file,
              flush=True)

        print("--validate=%s" % str(validate), file=error_file, flush=True)

        print("--override-uuid=%s" % str(override_uuid),
              file=error_file,
              flush=True)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kn: KgtkNtriples = KgtkNtriples(
            input_file_paths=input_file_paths,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_file_path,
            updated_namespace_file_path=updated_namespace_kgtk_file,
            namespace_file_path=namespace_kgtk_file,
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kn.process()

        return 0

    except SystemExit as e:
        # Report an exit request from the importer as a KGTKException,
        # keeping the original exit as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Normalize every failure to KGTKException, chaining the original
        # exception so its traceback is preserved as the cause.
        raise KGTKException(str(e)) from e
# Example #6
# 0
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate a KGTK input file against a file of property patterns.

    The pattern file is opened in edge mode and loaded into a
    PropertyPatterns object.  The input file is then opened and handed,
    together with the optional output and reject writers, to a
    PropertyPatternValidator.

    Args:
        input_file: The KGTK file to validate.
        pattern_file: The KGTK file holding the property patterns; it does
            not default to stdin.
        output_file: Optional output file (presumably receives the rows that
            pass validation -- see the "Writing good data" message).
        reject_file: Optional reject file (presumably receives the rows that
            fail validation -- see the "Writing rejected data" message).
        grouped_input, reject_node1_groups, no_complaints,
        complain_immediately, autovalidate: Passed through unchanged to
            PropertyPatternValidator.new().
        add_isa_column: When true and an output file was requested, make
            sure the output has a column named `isa_column_name`, appending
            it when the input does not already have one.
        isa_column_name: The name of the ISA column.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read
            here (stderr is used whenever errors_to_stdout is false).
        show_options: Print the effective options to the diagnostic stream.
        verbose, very_verbose: Feedback levels passed to the readers,
            writers, and validator.
        **kwargs: Consumed by KgtkReaderOptions.from_dict() and
            KgtkValueOptions.from_dict().

    Returns:
        0 on success.

    Raises:
        KGTKException: Carrying the message of any Exception raised while
            opening or processing the files.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the option dump never mixes with
    # data that may be written to stdout.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups), file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately), file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file),
              file=error_file,
              flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file),
                  file=error_file,
                  flush=True)

    try:
        # The pattern file is always read as an edge file.
        pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                          error_file=error_file,
                                          mode=KgtkReaderMode.EDGE,
                                          options=reader_options,
                                          value_options=value_options,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        pps: PropertyPatterns = PropertyPatterns.load(
            pkr,
            value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # The output columns are only computed when an output file was
        # requested.  isa_column_idx stays at -1 unless an ISA column is
        # wanted; it then points at the existing column or at a newly
        # appended one.
        output_column_names: typing.List[str] = []
        isa_column_idx: int = -1
        if output_kgtk_file is not None:
            output_column_names = kr.column_names.copy()
            if add_isa_column:
                if isa_column_name in output_column_names:
                    isa_column_idx = output_column_names.index(isa_column_name)
                else:
                    isa_column_idx = len(output_column_names)
                    output_column_names.append(isa_column_name)

        ppv: PropertyPatternValidator = PropertyPatternValidator.new(
            pps,
            kr,
            grouped_input=grouped_input,
            reject_node1_groups=reject_node1_groups,
            no_complaints=no_complaints,
            complain_immediately=complain_immediately,
            isa_column_idx=isa_column_idx,
            autovalidate=autovalidate,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kw: typing.Optional[KgtkWriter] = None
        if output_kgtk_file is not None:
            kw = KgtkWriter.open(output_column_names,
                                 output_kgtk_file,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        # NOTE: the reject writer shares output_column_names, which is
        # empty when no output file was requested.
        rkw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            rkw = KgtkWriter.open(output_column_names,
                                  reject_kgtk_file,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        ppv.process(kr, kw, rkw)

        if verbose:
            print("Read %d rows, %d valid" %
                  (ppv.input_row_count, ppv.valid_row_count),
                  file=error_file,
                  flush=True)
            if kw is not None:
                print("Wrote %d good rows" % ppv.output_row_count,
                      file=error_file,
                      flush=True)
            if rkw is not None:
                print("Wrote %d rejected rows" % ppv.reject_row_count,
                      file=error_file,
                      flush=True)

        if kw is not None:
            kw.close()
        if rkw is not None:
            rkw.close()

        return 0

    except Exception as e:
        # Pass the message text, matching the other commands in this file.
        raise KGTKException(str(e))
예제 #7
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],
        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Rewrite node1/label/node2 values of an input file using a mapping file.

    The mapping file is read first.  Each accepted mapping row becomes a
    rule "replace node1 with node2", stored either as an item rule (applied
    to the input node1 and node2 columns) or as a property rule (applied to
    the input label column).  The input file is then streamed and every row
    is rewritten according to those rules.

    Args:
        input_file: The KGTK file whose rows are rewritten.
        output_file: Receives rewritten rows, plus unmodified rows unless
            `split_output_mode` is true.
        mapping_file: The KGTK file holding the mapping rules.
        unmodified_edges_file: Optional file receiving rows that did not
            count as modified.
        activated_mapping_file: Optional file receiving the mapping rows
            that were applied at least once, in mapping file order.
        rejected_mapping_file: Optional file receiving the mapping rows
            excluded by the confidence threshold.
        confidence_column_name: Name of the mapping file column holding a
            confidence value.
        require_confidence: When true, a missing confidence column or an
            empty confidence value is an error.
        default_confidence_str: Optional default confidence, parsed as a
            float; used when a mapping row supplies no confidence value.
        confidence_threshold: Mapping rows with a confidence below this
            value are excluded.
        same_as_item_label: In "normal" mode, the mapping label that marks
            an item rule.
        same_as_property_label: In "normal" mode, the mapping label that
            marks a property rule.
        allow_exact_duplicates: When true, a repeated rule with the same
            node1 and node2 is skipped silently; otherwise any repeated
            node1 is an error.
        allow_idempotent_mapping: When false, rules with node1 == node2 are
            excluded.
        split_output_mode: When true, unmodified rows are not written to
            `output_file`.
        modified_pattern: Which combination of changed fields makes a row
            count as modified (e.g. "node1|label|node2", "node1&node2").
        node1_column_name, label_column_name, node2_column_name: Optional
            overrides for the input file column names.
        mapping_rule_mode: "normal" (rule kind chosen by the mapping label),
            "same-as-item", or "same-as-property".
        mapping_node1_column_name, mapping_label_column_name,
        mapping_node2_column_name: Optional overrides for the mapping file
            column names.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read
            here (stderr is used whenever errors_to_stdout is false).
        show_options: Print the effective options to the diagnostic stream.
        verbose, very_verbose: Feedback levels.
        **kwargs: Consumed by KgtkReaderOptions.from_dict() and
            KgtkValueOptions.from_dict().

    Returns:
        0 on success.

    Raises:
        KGTKException: For an invalid default confidence value, missing
            columns, errors in the mapping file, an empty rule set, an
            unrecognized `modified_pattern`, or any other Exception raised
            while processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        # NOTE(review): the "-column-=" labels below look like typos for
        # "-column=" -- confirm against the argument parser before changing.
        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)
        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError):
            # Only conversion failures are reported as a bad option value;
            # KeyboardInterrupt and friends are no longer swallowed here.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    # The rule mode never changes, so evaluate the mode tests once instead
    # of once per row.
    label_selects_rule: bool = mapping_rule_mode == "normal"
    apply_item_rules: bool = mapping_rule_mode in ["normal", "same-as-item"]
    apply_property_rules: bool = mapping_rule_mode in ["normal", "same-as-property"]

    try:

        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr:  KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                           options=mapping_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The mapping label is only consulted in "normal" mode.
        if mapping_label_idx < 0 and label_selects_rule:
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            mkr.close()
            raise KGTKException("Missing columns in the mapping file.")
        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            mkr.close()
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Mapping structures:
        #   *_map:      node1 value -> replacement (node2) value
        #   *_line_map: node1 value -> mapping file line that defined the rule
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        # Accepted mapping rows by line number, and the subset that was
        # actually applied to some input row.
        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if label_selects_rule else ""
            mapping_node2: str = mrow[mapping_node2_idx]

            # Start from the default; a value in the row overrides it.
            mapping_confidence: typing.Optional[float] = default_confidence_value
            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number, repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

            # Rows without any confidence (None) are never excluded here.
            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeat is tolerated only when it is an exact
                    # duplicate and exact duplicates are allowed.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue
                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue

        # Close the mapping file.
        mkr.close()
        if rmkw is not None:
            rmkw.close()

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr:  KgtkReader = KgtkReader.open(input_kgtk_file,
                                           options=input_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        # A column is only required when the rule mode will touch it.
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        if input_node1_idx < 0 and apply_item_rules:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and apply_property_rules:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and apply_item_rules:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            ikr.close()
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip, # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # The activated mapping file uses the mapping file's layout; only
        # the column names and mode of the (already closed) reader are read.
        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0
        row: typing.List[str]
        for row in ikr:
            input_count +=1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if apply_item_rules:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    # Activated rules are only tracked when they will be
                    # written out.
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if apply_property_rules:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # Decide whether this row counts as modified.  "|" patterns need
            # any listed field to have changed, "&" patterns need all of them.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                # A row that fails the pattern is emitted in its original
                # form, even when some of its fields matched a rule.
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                if not split_output_mode:
                    okw.write(row)

        # Done!
        ikr.close()
        okw.close()

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            uekw.close()

        if amkw is not None:
            # Emit the activated rules in mapping file order.
            activated_count: int = 0
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            amkw.close()

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
예제 #8
0
파일: filter.py 프로젝트: usbader/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy the rows of a KGTK file that match a subject;predicate;object pattern.

    `pattern` must contain exactly three sections separated by semicolons.
    Each section is a comma-separated list of accepted values for the
    subject, predicate, and object columns respectively; an empty section
    disables the filter for that column.

    By default a row is rejected when any active filter fails to match.
    With `or_pattern` a row is kept when at least one active filter matches.
    `invert` swaps the keep/reject decision.  Kept rows are written to the
    output file; rejected rows are written to the reject file when one was
    requested.

    Diagnostics go to stdout when `errors_to_stdout` is set, otherwise to
    stderr.  Returns 0 on success; on any exception the error is passed to
    kgtk_exception_auto_handler and 1 is returned.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject path must come from reject_file, not from output_file.
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one pattern section on commas into a set of non-empty, stripped values."""
        return {
            target.strip()
            for target in pattern.split(",") if target.strip()
        }

    try:

        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])
        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        if verbose and not (apply_subj_filter or apply_pred_filter
                            or apply_obj_filter):
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and apply_subj_filter:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and apply_pred_filter:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and apply_obj_filter:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # keep: at least one active filter matched.
            # reject: at least one active filter failed to match.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # "or" mode keeps a row when any filter matched; the default
            # mode rejects a row when any filter failed.  XOR with invert
            # flips the decision.
            rejected: bool
            if or_pattern:
                rejected = not (keep ^ invert)
            else:
                rejected = reject ^ invert

            if rejected:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        # Send the statistics to the diagnostic stream: stdout may be
        # carrying the KGTK output data.
        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

        kr.close()
        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
예제 #9
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        list_output_file: KGTKFiles,
        key_column_names: typing.List[str],
        keep_first_names: typing.List[str],
        compact_id: bool,
        deduplicate: bool,
        sorted_input: bool,
        verify_sort: bool,
        lists_in_input: bool,
        report_lists: bool,
        exclude_lists: bool,
        output_only_lists: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkCompact from the command options and run its process() step.

    The input, output, and optional list-output paths are resolved through
    KGTKArgumentParser; the ID builder, reader, and value option structures
    are built from `kwargs`.  When `show_options` is set, the effective
    options are echoed to the diagnostic stream (stdout if
    `errors_to_stdout`, otherwise stderr).

    Returns 0 on success.  Raises KGTKException when `exclude_lists` and
    `output_only_lists` are both set, when processing raises any Exception
    (the message is carried over), or when SystemExit is raised during
    processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    list_target_path: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            list_output_file, who="KGTK list output file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    diag: typing.TextIO
    if errors_to_stdout:
        diag = sys.stdout
    else:
        diag = sys.stderr

    # Option structures, all derived from the pass-through keyword arguments.
    id_opts: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def say(text: str, flush: bool = False) -> None:
        """Write one line to the diagnostic stream."""
        print(text, file=diag, flush=flush)

    # Echo the effective options for debugging and documentation.
    if show_options:
        say("--input-file=%s" % str(source_path))
        say("--output-file=%s" % str(target_path))
        if list_target_path is not None:
            say("--list-output-file=%s" % str(list_target_path), flush=True)
        say("--columns=%s" % " ".join(key_column_names))
        say("--keep-first=%s" % " ".join(keep_first_names))
        say("--compact-id=%s" % str(compact_id), flush=True)
        say("--deduplicate=%s" % str(deduplicate), flush=True)
        say("--presorted=%s" % str(sorted_input), flush=True)
        say("--verify-sort=%s" % str(verify_sort), flush=True)
        say("--lists-in-input=%s" % str(lists_in_input), flush=True)
        say("--report-lists=%s" % str(report_lists), flush=True)
        say("--exclude-lists=%s" % str(exclude_lists), flush=True)
        say("--output-only-lists=%s" % str(output_only_lists), flush=True)
        say("--build-id=%s" % str(build_id), flush=True)
        id_opts.show(out=diag)
        rd_opts.show(out=diag)
        val_opts.show(out=diag)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=diag)
        say("=======", flush=True)

    # The two list-handling modes are mutually exclusive.
    if exclude_lists and output_only_lists:
        raise KGTKException(
            "--exclude-lists and --output-only-lists may not be used together."
        )

    try:
        compactor: KgtkCompact = KgtkCompact(
            input_file_path=source_path,
            output_file_path=target_path,
            list_output_file_path=list_target_path,
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            idbuilder_options=id_opts,
            reader_options=rd_opts,
            value_options=val_opts,
            error_file=diag,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        compactor.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as failure:
        raise KGTKException(str(failure))

    return 0
예제 #10
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        filter_column_names: typing.List[str],
        all_are: bool = False,
        only_count: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkIfEmpty with notempty=True from the command options and run it.

    The input, output, and optional reject paths are resolved through
    KGTKArgumentParser; the reader and value option structures are built
    from `kwargs`.  When `show_options` is set, the effective options are
    echoed to the diagnostic stream (stdout if `errors_to_stdout`,
    otherwise stderr).

    Returns 0 on success.  Raises KGTKException when processing raises any
    Exception (the message is carried over) or when SystemExit is raised.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifempty import KgtkIfEmpty
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_path: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    diag: typing.TextIO
    if errors_to_stdout:
        diag = sys.stdout
    else:
        diag = sys.stderr

    # Option structures, both derived from the pass-through keyword arguments.
    rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def say(text: str, flush: bool = False) -> None:
        """Write one line to the diagnostic stream."""
        print(text, file=diag, flush=flush)

    # Echo the effective options for debugging and documentation.
    if show_options:
        say("--input-file=%s" % str(source_path))
        say("--output-file=%s" % str(target_path))
        if reject_path is not None:
            say("--reject-file=%s" % str(reject_path))
        say("--columns=%s" % " ".join(filter_column_names))
        say("--count=%s" % str(only_count))
        say("--all=%s" % str(all_are))
        rd_opts.show(out=diag)
        val_opts.show(out=diag)
        say("=======", flush=True)

    try:
        checker: KgtkIfEmpty = KgtkIfEmpty(
            input_file_path=source_path,
            filter_column_names=filter_column_names,
            output_file_path=target_path,
            reject_file_path=reject_path,
            all_are=all_are,
            notempty=True,
            only_count=only_count,
            reader_options=rd_opts,
            value_options=val_opts,
            error_file=diag,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        checker.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as failure:
        raise KGTKException(str(failure))

    return 0
예제 #11
0
파일: export_gt.py 프로젝트: nicklein/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file into a graph and optionally save the graph to a file.

    The input is opened with KgtkReader and must have node1, label, and
    node2 columns; each missing one is reported on the diagnostic stream
    (stdout if `errors_to_stdout`, otherwise stderr).  The graph is built
    by load_graph_from_kgtk using the node1 and node2 columns as the edge
    endpoints, directed unless `undirected` is set.  With `verbose`, the
    graph size and the relations reported by get_topN_relations are
    printed.  When an output file was given, the graph is saved there.

    Returns None.  Any Exception raised along the way, including the
    missing-column error, is re-raised as a KGTKException whose message is
    prefixed with 'Error: '.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    try:
        # Diagnostics go to stderr unless stdout was explicitly requested.
        diag: typing.TextIO
        if errors_to_stdout:
            diag = sys.stdout
        else:
            diag = sys.stderr

        # Option structures, both derived from the pass-through keyword arguments.
        rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        source_path: Path = KGTKArgumentParser.get_input_file(input_file)
        graph_path: typing.Optional[Path] = \
            KGTKArgumentParser.get_optional_output_file(output_file)

        if verbose:
            print('loading the KGTK input file...\n', file=diag, flush=True)
        kr: KgtkReader = KgtkReader.open(
            source_path,
            error_file=diag,
            options=rd_opts,
            value_options=val_opts,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Look up each required column in turn, reporting it immediately
        # when it is absent.
        column_probes = (
            (kr.get_node1_column_index, "Missing node1 (subject) column."),
            (kr.get_label_column_index, "Missing label (predicate) column."),
            (kr.get_node2_column_index, "Missing node2 (object) column"),
        )
        found: typing.List[int] = []
        for lookup, complaint in column_probes:
            position: int = lookup()
            if position < 0:
                print(complaint, file=diag, flush=True)
            found.append(position)
        node1_idx, label_idx, node2_idx = found

        if any(position < 0 for position in found):
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        graph = load_graph_from_kgtk(kr,
                                     directed=not undirected,
                                     ecols=(node1_idx, node2_idx),
                                     verbose=verbose,
                                     out=diag)

        if verbose:
            print('graph loaded! It has %d nodes and %d edges.' %
                  (graph.num_vertices(), graph.num_edges()),
                  file=diag,
                  flush=True)
            print('\n###Top relations:', file=diag, flush=True)
            top_relations = gtanalysis.get_topN_relations(
                graph, pred_property=kr.column_names[label_idx])
            for relation, count in top_relations:
                print('%s\t%d' % (relation, count), file=diag, flush=True)

        # Saving is optional: nothing more to do without an output path.
        if graph_path is None:
            return

        if verbose:
            print('\nNow saving the graph to %s' % str(graph_path),
                  file=diag,
                  flush=True)
        graph.save(str(graph_path))
        if verbose:
            print('Done saving the graph.', file=diag, flush=True)
    except Exception as failure:
        raise KGTKException('Error: ' + str(failure))
예제 #12
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_lower),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to the index of the base column that supplies its node1 value.
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # column_names and base_columns are paired. New records use label_value.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_lower), len(base_columns)))

            if len(label_value) == 0:
                raise KGTKException("The --label-value must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], label_value)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node12, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(new_edges_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the labels
        #  set(node1_value + KgtkFormat.SEPARATOR + node2_value)
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            column_idx: int
            for column_idx in lower_map.keys():
                node1_idx: int
                node1_idx, new_label_value = lower_map[column_idx]
                node1_value: str
                node1_value = row[node1_idx]
                if len(node1_value) == 0:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if len(item) == 0:
                    continue  # Ignore empty node2 values.

                # This item might be a KGTK list.  Let's split it, because
                # lists aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        continue  # Ignore empty node2 values.

                    if deduplicate_new_edges:
                        label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        node1_column_name: node1_value,
                        label_column_name: new_label_value,
                        node2_column_name: node2_value,
                    }
                    if lkw is None:
                        kw.writemap(output_map)
                        label_line_count += 1
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)
                        label_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
예제 #13
0
def run(input_file: KGTKFiles, output_file: KGTKFiles, directed, log_file):
    """Load a tab-separated edge file into graph-tool and log basic statistics.

    The header line of the input file is inspected to locate the subject and
    object columns (used as the edge endpoints) and the predicate column.
    All remaining columns are passed to graph-tool as edge property names.
    Progress messages, the vertex/edge counts and the relation frequencies
    reported by ``gtanalysis.get_topN_relations`` are written to ``log_file``.
    When an output file was given, the loaded graph is saved there.

    Args:
        input_file: the KGTK input file specification.
        output_file: the optional KGTK output file specification.
        directed: passed through to ``load_graph_from_csv(directed=...)``.
        log_file: path of the text log file to (over)write.

    Raises:
        KGTKException: wraps any exception raised while loading or analysing
            the graph; the original exception is attached as the cause.
    """
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=()):
        """Return the index in `h` of the first name from `options` found in
        `h`, or -1 when none of them is present."""
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        """Return the first name from `options` found in `h`, or '' when none
        of them is present."""
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from pathlib import Path
        import sys
        import typing
        from graph_tool import load_graph_from_csv
        # NOTE(review): `centrality` is not referenced in this function; the
        # import is kept unchanged in case it is relied upon -- confirm.
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        import csv
        csv.field_size_limit(sys.maxsize)

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        output: typing.Optional[
            Path] = KGTKArgumentParser.get_optional_output_file(output_file)

        with open(filename, 'r') as f:
            # Strip the line terminator before splitting; otherwise the last
            # column name would end in "\n" and could never match one of the
            # expected column names below.
            header = next(f).rstrip('\r\n').split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header,
                options=['relation', 'predicate', 'label', 'relationship'])

            # Every column that is not an edge endpoint becomes an edge
            # property, in header order.
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]: continue
                p.append(header_col)
        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(str(filename),
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})

            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(
                    G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if output:
                writer.write('now saving the graph to %s\n' % str(output))
                G2.save(str(output))
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise KGTKException('Error: ' + str(e)) from e
예제 #14
0
파일: filter.py 프로젝트: yyht/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        show_version: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy the rows of a KGTK file that match a subject/predicate/object pattern.

    `pattern` has three sections separated by semicolons
    ("subjects;predicates;objects"); each section is a comma-separated list
    of values and may be empty.  Rows that pass the filter are written to the
    output file; the other rows are written to the reject file when one was
    supplied.

    Args:
        input_file: the KGTK input file specification.
        output_file: the KGTK output file specification.
        reject_file: the optional KGTK reject file specification.
        pattern: the three-section filter pattern.
        subj_col: optional name of the subject column.
        pred_col: optional name of the predicate column.
        obj_col: optional name of the object column.
        or_pattern: combine the active sections with OR instead of AND
            (general filter only).
        invert: invert the result of the filter.
        show_version: print the filter version string.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: print the effective options.
        verbose: print progress and statistics messages.
        very_verbose: passed through to the reader and writers.
        **kwargs: options for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success, 1 when an exception was handled by
        kgtk_exception_auto_handler.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    UPDATE_VERSION: str = "2020-08-06T17:06:06.829542+00:00#Mu9vz3KEPh+beQeSwZ8qGMKrTJzHWZFfZFXY6UrYXJAnNpPSin+5NvkSfxKLMkyJtGyeavgGAz8+74bup7eYaQ=="
    if show_version or verbose:
        print("kgtk filter version: %s" % UPDATE_VERSION,
              file=error_file,
              flush=True)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one comma-separated pattern section into a set of stripped,
        non-empty target values."""
        filt: typing.Set[str] = set()
        pattern = pattern.strip()
        if len(pattern) == 0:
            return filt

        target: str
        for target in pattern.split(","):
            target = target.strip()
            if len(target) > 0:
                filt.add(target)

        return filt

    # The four single-value filters below are deliberately separate so that
    # each per-row loop contains a single comparison and no option tests.

    def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Write rows whose predicate equals the single filter value; send the
        other rows to the reject writer, if any."""
        if verbose:
            print("Applying a single predicate filter",
                  file=error_file,
                  flush=True)

        pred_filter_value: str = next(iter(pred_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] == pred_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Statistics are diagnostics: keep them off the data stream.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_predicate_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Write rows whose predicate differs from the single filter value;
        send the other rows to the reject writer, if any."""
        if verbose:
            print("Applying a single predicate filter inverted",
                  file=error_file,
                  flush=True)

        pred_filter_value: str = next(iter(pred_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] != pred_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_object_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Write rows whose object equals the single filter value; send the
        other rows to the reject writer, if any."""
        if verbose:
            print("Applying a single object filter",
                  file=error_file,
                  flush=True)

        obj_filter_value: str = next(iter(obj_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] == obj_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Write rows whose object differs from the single filter value; send
        the other rows to the reject writer, if any."""
        if verbose:
            print("Applying a single object filter inverted",
                  file=error_file,
                  flush=True)

        obj_filter_value: str = next(iter(obj_filter))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] != obj_filter_value:
                kw.write(row)
                output_line_count += 1

            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def general_filter(kr: KgtkReader, kw: KgtkWriter,
                       rw: typing.Optional[KgtkWriter], subj_idx: int,
                       subj_filter: typing.Set[str], pred_idx: int,
                       pred_filter: typing.Set[str], obj_idx: int,
                       obj_filter: typing.Set[str]):
        """Apply any combination of subject, predicate and object filters.

        Only non-empty filters are applied.  For each row, `keep` records
        that at least one applied filter matched and `reject` records that at
        least one applied filter failed.  In OR mode the row is rejected when
        `keep ^ invert` is false; otherwise it is rejected when
        `reject ^ invert` is true.
        """
        if verbose:
            print("Applying a general filter", file=error_file, flush=True)

        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0
        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # `^` binds tighter than `not`; the parentheses only make the
            # existing evaluation order explicit.
            if (not (keep ^ invert)) if or_pattern else (reject ^ invert):
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

    try:

        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])

        if verbose and len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 0:
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and len(subj_filter) > 0:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and len(pred_filter) > 0:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and len(obj_filter) > 0:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        # Use a specialised loop when exactly one predicate value or exactly
        # one object value is requested; otherwise use the general filter.
        if len(subj_filter) == 0 and len(pred_filter) == 1 and len(
                obj_filter) == 0:
            if invert:
                single_predicate_filter_inverted(kr, kw, rw, pred_idx,
                                                 pred_filter)
            else:
                single_predicate_filter(kr, kw, rw, pred_idx, pred_filter)
        elif len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 1:
            if invert:
                single_object_filter_inverted(kr, kw, rw, obj_idx, obj_filter)
            else:
                single_object_filter(kr, kw, rw, obj_idx, obj_filter)
        else:
            general_filter(kr, kw, rw, subj_idx, subj_filter, pred_idx,
                           pred_filter, obj_idx, obj_filter)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
예제 #15
0
파일: clean_data.py 프로젝트: yyht/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file through KgtkReader, writing every row it yields.

    The reader is opened with the configured reader/value options and is
    handed the optional reject file; every row the reader yields is written
    unchanged to the output file.

    Args:
        input_file: the KGTK input file specification.
        output_file: the KGTK output file specification.
        reject_file: the optional reject file specification; "-" selects
            stdout, matching the verbose message printed for that value.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: print the effective options.
        verbose: print progress messages.
        very_verbose: passed through to the reader and writer.
        **kwargs: options for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any exception raised while reading or writing;
            the original exception is attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file_path: Path = KGTKArgumentParser.get_output_file(
        output_file)
    reject_kgtk_file_path: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(reject_file,
                                                            who="Reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file_path), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file_path), file=error_file)
        if reject_kgtk_file_path is not None:
            print("--reject-file=%s" % str(reject_kgtk_file_path),
                  file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        if str(input_kgtk_file_path) == "-":
            print("Cleaning data from stdin", file=error_file, flush=True)
        else:
            print("Cleaning data from '%s'" % str(input_kgtk_file_path),
                  file=error_file,
                  flush=True)
        if str(output_kgtk_file_path) == "-":
            print("Writing data to stdout", file=error_file, flush=True)
        else:
            print("Writing data to '%s'" % str(output_kgtk_file_path),
                  file=error_file,
                  flush=True)
        # Only mention the reject file when one was actually requested;
        # otherwise the message would read "Writing reject data to 'None'".
        if reject_kgtk_file_path is not None:
            if str(reject_kgtk_file_path) == "-":
                print("Writing reject data to stdout",
                      file=error_file,
                      flush=True)
            else:
                print("Writing reject data to '%s'" %
                      str(reject_kgtk_file_path),
                      file=error_file,
                      flush=True)

    reject_kgtk_file: typing.Optional[typing.TextIO] = None
    # True only when this function opened the reject file and must close it.
    owns_reject_file: bool = False
    if reject_kgtk_file_path is not None:
        if str(reject_kgtk_file_path) == "-":
            # Honor the "-" convention announced above instead of creating a
            # file literally named "-".
            reject_kgtk_file = sys.stdout
        else:
            reject_kgtk_file = open(reject_kgtk_file_path, mode="wt")
            owns_reject_file = True

    try:
        kr: KgtkReader = KgtkReader.open(input_kgtk_file_path,
                                         error_file=error_file,
                                         reject_file=reject_kgtk_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file_path,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        line_count: int = 0
        row: typing.List[str]
        try:
            for row in kr:
                kw.write(row)
                line_count += 1
        finally:
            # Close the writer even when reading or writing fails.
            kw.close()

        if verbose:
            print("Copied %d clean data lines" % line_count,
                  file=error_file,
                  flush=True)
        return 0

    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise KGTKException(e) from e

    finally:
        # Never close stdout; only close a file handle opened above.
        if owns_reject_file and reject_kgtk_file is not None:
            reject_kgtk_file.close()
예제 #16
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_remove: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX,
        deduplicate_labels: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Remove ("lower") lifted columns from the input and emit them as label edges.

    Every input row is copied to the output file without the lowered columns.
    For each lowered column holding a non-empty value, one edge
    (base column value, `label_value`, item) is written per list item, either
    to the label file (when one is given) or to the main output file.

    The lowered columns and their base columns are chosen by one of three
    option patterns:

    1. `columns_to_remove` and `base_columns` are both non-empty: the two
       lists are paired position by position and must have the same length.
    2. Only `columns_to_remove` is non-empty: each base name is the column
       name with `lift_suffix` stripped from its end.
    3. `columns_to_remove` is empty: every non-key input column named
       `<base><lift_suffix>` is lowered, where `<base>` comes from
       `base_columns` or, if that is empty, from the key columns present in
       the input.

    Args:
        input_file: the input file argument, resolved with KGTKArgumentParser.
        output_file: the output file argument, resolved with KGTKArgumentParser.
        label_file: optional file argument that receives the label edges.
        base_columns: names of the columns supplying the node1 values.
        columns_to_remove: names of the columns to lower.
        label_value: the label written on each generated edge; also the label
            used to recognize existing label rows when deduplicating.
        lift_suffix: the suffix relating a lifted column name to its base name.
        deduplicate_labels: when true, each (node1, node2) label pair is
            emitted at most once.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; this function
            does not read it.
        show_options: print the effective options before processing.
        verbose: print progress messages.
        very_verbose: passed through to the reader and the writers.
        **kwargs: passed to KgtkReaderOptions.from_dict() and
            KgtkValueOptions.from_dict().

    Returns:
        0 on success, or 1 after an exception has been handed to
        kgtk_exception_auto_handler().
    """
    # import modules locally
    from contextlib import ExitStack

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(label_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if label_kgtk_file is not None:
            print("--label-file=%s" % str(label_kgtk_file), file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_remove is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_remove),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-suffix=%s" % lift_suffix, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_labels, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # Each writer registers its close() on this stack as soon as it is
        # opened, so writers are closed on the error paths as well as on
        # success.  The reader is not explicitly closed here.
        # NOTE(review): confirm whether KgtkReader needs an explicit close.
        with ExitStack() as cleanup:
            if verbose:
                print("Opening the input file: %s" % str(input_kgtk_file),
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                options=reader_options,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            # Map the index of a column being removed to the index of the
            # base column that supplies its node1 value.
            lower_map: typing.MutableMapping[int, int] = dict()

            # These columns will never be removed:
            key_column_idxs: typing.Set[int] = set(
                (kr.node1_column_idx, kr.label_column_idx,
                 kr.node2_column_idx, kr.id_column_idx))
            key_column_idxs.discard(-1)
            key_column_names: typing.Set[str] = set(
                (kr.column_names[idx] for idx in key_column_idxs))

            def check_removable(name: str) -> None:
                # A column can be lowered only if it exists in the input and
                # is not one of the key columns.
                if name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(name))
                if name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(name))

            # Treat None and an empty list the same way.
            remove_names: typing.List[str] = \
                columns_to_remove if columns_to_remove is not None else []
            base_names: typing.List[str] = \
                base_columns if base_columns is not None else []

            base_name: str
            column_name: str
            idx: int
            # There are three option patterns.

            if len(remove_names) > 0 and len(base_names) > 0:
                # Pattern 1: the columns to remove and the base columns are
                # paired position by position.
                if len(remove_names) != len(base_names):
                    raise KGTKException(
                        "There are %d columns to remove but %d base columns; the counts must match."
                        % (len(remove_names), len(base_names)))

                for column_name, base_name in zip(remove_names, base_names):
                    check_removable(column_name)

                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is unknown" %
                            (repr(column_name), repr(base_name)))

                    lower_map[kr.column_name_map[
                        column_name]] = kr.column_name_map[base_name]

            elif len(remove_names) > 0:
                # Pattern 2: only the columns to remove were supplied.
                # Each column name is stripped of the lift suffix to
                # determine the base name.
                if len(lift_suffix) == 0:
                    raise KGTKException("The --lift-suffix must not be empty.")

                for column_name in remove_names:
                    check_removable(column_name)

                    if not column_name.endswith(lift_suffix):
                        raise KGTKException("Unable to parse column name %s." %
                                            repr(column_name))

                    base_name = column_name[:-len(lift_suffix)]

                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                    lower_map[kr.column_name_map[
                        column_name]] = kr.column_name_map[base_name]

            else:
                # Pattern 3: no columns to remove were supplied, so look for
                # input columns named <base name><lift suffix>.
                if len(lift_suffix) == 0:
                    raise KGTKException("The --lift-suffix must not be empty.")

                if len(base_names) == 0:
                    # The base name list wasn't supplied.  Use the key
                    # columns (node1, label, node2, id) found in the input.
                    base_names = list(key_column_names)

                for idx, column_name in enumerate(kr.column_names):
                    # Skip the node1, label, node2, and id columns.
                    if idx in key_column_idxs:
                        continue

                    # Does this column match a lifting pattern?
                    for base_name in base_names:
                        if len(base_name) == 0:
                            continue
                        if column_name == base_name + lift_suffix:
                            # Report an unknown base column the same way the
                            # other patterns do, not as a raw KeyError.
                            if base_name not in kr.column_names:
                                raise KGTKException(
                                    "For column name %s, base name %s is not known"
                                    % (repr(column_name), repr(base_name)))
                            lower_map[idx] = kr.column_name_map[base_name]
                            # Only one base name can produce this column name.
                            break

            if len(lower_map) == 0:
                raise KGTKException("There are no columns to lower.")

            if verbose:
                print("The following columns will be lowered",
                      file=error_file,
                      flush=True)
                for idx in sorted(lower_map):
                    column_name = kr.column_names[idx]
                    base_name = kr.column_names[lower_map[idx]]
                    print(" %s from %s" % (column_name, base_name),
                          file=error_file,
                          flush=True)

            # The output keeps every input column that is not being lowered.
            output_column_names: typing.List[str] = [
                column_name
                for idx, column_name in enumerate(kr.column_names)
                if idx not in lower_map
            ]
            if verbose:
                print("The output columns are: %s" %
                      " ".join(output_column_names),
                      file=error_file,
                      flush=True)

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file,
                      flush=True)
            kw: KgtkWriter = KgtkWriter.open(
                output_column_names,
                output_kgtk_file,
                mode=KgtkWriter.Mode.EDGE,
                require_all_columns=False,  # Simplifies writing the labels
                verbose=verbose,
                very_verbose=very_verbose)
            cleanup.callback(kw.close)
            shuffle_list: typing.List[int] = kw.build_shuffle_list(
                kr.column_names)

            lkw: typing.Optional[KgtkWriter] = None
            if label_kgtk_file is not None:
                if verbose:
                    print("Opening the label output file: %s" %
                          str(label_kgtk_file),
                          file=error_file,
                          flush=True)

                label_column_names = [
                    KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
                ]
                lkw = KgtkWriter.open(label_column_names,
                                      label_kgtk_file,
                                      mode=KgtkWriter.Mode.EDGE,
                                      verbose=verbose,
                                      very_verbose=very_verbose)
                cleanup.callback(lkw.close)

            # Optionally deduplicate the labels
            #  set(node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value)
            label_set: typing.Set[str] = set()
            label_key: str

            # If labels will be written to the output file and deduplication
            # is enabled, label rows already present in the input take part
            # in the deduplication too.
            check_existing_labels: bool = \
                deduplicate_labels and \
                lkw is None and \
                kr.node1_column_idx >= 0 and \
                kr.label_column_idx >= 0 and \
                kr.node2_column_idx >= 0

            input_line_count: int = 0
            output_line_count: int = 0
            label_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                if check_existing_labels and row[
                        kr.label_column_idx] == label_value:
                    label_key = row[
                        kr.node1_column_idx] + KgtkFormat.COLUMN_SEPARATOR + row[
                            kr.node2_column_idx]
                    if label_key in label_set:
                        # Drop an input label row that was already seen.
                        continue
                    label_set.add(label_key)

                kw.write(row, shuffle_list=shuffle_list)
                output_line_count += 1

                column_idx: int
                base_idx: int
                for column_idx, base_idx in lower_map.items():
                    node1_value: str = row[base_idx]
                    if len(node1_value) == 0:
                        continue  # TODO: raise an exception

                    item: str = row[column_idx]
                    if len(item) == 0:
                        continue  # Ignore empty node2 values.

                    # This item might be a KGTK list.  Let's split it, because
                    # lists aren't allowed in the node2 values we'll generate.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(item):
                        if len(node2_value) == 0:
                            continue  # Ignore empty node2 values.

                        if deduplicate_labels:
                            label_key = node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value
                            if label_key in label_set:
                                continue
                            label_set.add(label_key)

                        output_map: typing.Mapping[str, str] = {
                            KgtkFormat.NODE1: node1_value,
                            KgtkFormat.LABEL: label_value,
                            KgtkFormat.NODE2: node2_value,
                        }
                        label_line_count += 1
                        if lkw is None:
                            # Labels share the main output file, so they
                            # count as output rows as well.
                            kw.writemap(output_map)
                            output_line_count += 1
                        else:
                            lkw.writemap(output_map)

            if verbose:
                print("Read %d rows, wrote %d rows with %d labels." %
                      (input_line_count, output_line_count, label_line_count),
                      file=error_file,
                      flush=True)

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1