示例#1
0
文件: ifempty.py 项目: usbader/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        filter_column_names: typing.List[str],
        all_are: bool = False,
        only_count: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkIfEmpty pass (with notempty=False) from parsed arguments.

    The input and output paths are resolved through KGTKArgumentParser, the
    reader and value option structures are built from ``kwargs``, the
    effective options are optionally echoed to the error stream, and finally
    a KgtkIfEmpty object is constructed and its ``process()`` method called.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` decides where diagnostics are written.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkIfEmpty.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    sink_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stdout only on request; stderr otherwise.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Option structures derived from the remaining keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        def emit(text: str, flush: bool = False) -> None:
            print(text, file=error_file, flush=flush)

        emit("--input-file=%s" % str(source_path))
        emit("--output-file=%s" % str(sink_path))
        emit("--columns=%s" % " ".join(filter_column_names))
        emit("--count=%s" % str(only_count))
        emit("--all=%s" % str(all_are))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        KgtkIfEmpty(
            input_file_path=source_path,
            filter_column_names=filter_column_names,
            output_file_path=sink_path,
            all_are=all_are,
            notempty=False,
            only_count=only_count,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
示例#2
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkExpand pass from parsed command-line arguments.

    Resolves the input and output paths through KGTKArgumentParser, builds
    the reader and value option structures from ``kwargs``, optionally echoes
    the effective options to the error stream, then constructs a KgtkExpand
    object and calls its ``process()`` method.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` decides where diagnostics are written.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkExpand.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkexpand import KgtkExpand
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    sink_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stdout only on request; stderr otherwise.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Option structures derived from the remaining keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        def emit(text: str, flush: bool = False) -> None:
            print(text, file=error_file, flush=flush)

        emit("--input-file=%s" % str(source_path))
        emit("--output-file=%s" % str(sink_path))
        emit("--columns=%s" % " ".join(key_column_names))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        KgtkExpand(
            input_file_path=source_path,
            key_column_names=key_column_names,
            output_file_path=sink_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
示例#3
0
文件: md.py 项目: usbader/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: str,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkCat over a single input file with the requested output format.

    Resolves the input and output paths through KGTKArgumentParser, builds
    the reader and value option structures from ``kwargs``, optionally echoes
    the effective options to the error stream, then constructs a KgtkCat
    object (given a one-element list of input paths) and calls its
    ``process()`` method.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` decides where diagnostics are written.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkCat.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    sink_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stdout only on request; stderr otherwise.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # TODO: check that at most one input file is stdin?

    # Option structures derived from the remaining keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings; every line here is flushed at once.
        def emit(text: str) -> None:
            print(text, file=error_file, flush=True)

        emit("--input-file=%s" % str(source_path))
        emit("--output-file=%s" % str(sink_path))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======")

    try:
        KgtkCat(
            input_file_paths=[source_path],
            output_path=sink_path,
            output_format=output_format,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
示例#4
0
def run(
        input_kgtk_file: typing.Optional[Path],
        output_kgtk_file: typing.Optional[Path],
        key_column_names: typing.List[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkExpand pass given already-resolved (optional) file paths.

    Builds the reader and value option structures from ``kwargs``, optionally
    echoes the effective options to the error stream (a path of ``None`` is
    shown as "-"), then constructs a KgtkExpand object and calls its
    ``process()`` method.  The paths are handed to KgtkExpand unchanged.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` decides where diagnostics are written.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkExpand.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    # Diagnostics go to stdout only on request; stderr otherwise.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Option structures derived from the remaining keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        def emit(text: str, flush: bool = False) -> None:
            print(text, file=error_file, flush=flush)

        input_label = "-" if input_kgtk_file is None else str(input_kgtk_file)
        emit("input: %s" % input_label)
        emit("--columns=%s" % " ".join(key_column_names))
        output_label = "-" if output_kgtk_file is None else str(output_kgtk_file)
        emit("--output-file=%s" % output_label)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        KgtkExpand(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            output_file_path=output_kgtk_file,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
示例#5
0
def run(
        left_file: KGTKFiles,
        right_file: KGTKFiles,
        output_file: KGTKFiles,
        join_on_id: bool = False,
        join_on_label: bool = False,
        join_on_node2: bool = False,
        left_prefix: typing.Optional[str] = None,
        left_join_columns: typing.Optional[typing.List[str]] = None,
        left_join: bool = False,
        right_prefix: typing.Optional[str] = None,
        right_join_columns: typing.Optional[typing.List[str]] = None,
        right_join: bool = False,
        field_separator: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Join two KGTK files with KgtkJoiner, driven by parsed arguments.

    Resolves the left, right and output paths through KGTKArgumentParser,
    rejects unsupported uses of stdin ("-"), builds per-side reader options
    and the value options from ``kwargs``, optionally echoes the effective
    options to the error stream, then constructs a KgtkJoiner and calls its
    ``process()`` method.

    ``field_separator`` falls back to ``KgtkJoiner.FIELD_SEPARATOR_DEFAULT``
    when it is None.  ``errors_to_stderr`` is accepted but never consulted in
    this function; only ``errors_to_stdout`` selects the error stream.

    Returns:
        0 when the join completes; 1 (after printing a message to the error
        stream) when stdin is requested for the left file without
        ``right_join``, for the right file without ``left_join``, or for
        both files.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkJoiner.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.join.kgtkjoiner import KgtkJoiner
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    left_file_path: Path = KGTKArgumentParser.get_input_file(
        left_file, who="KGTK left file")
    right_file_path: Path = KGTKArgumentParser.get_input_file(
        right_file, who="KGTK right file")
    output_file_path: Path = KGTKArgumentParser.get_output_file(output_file)

    if field_separator is None:
        field_separator = KgtkJoiner.FIELD_SEPARATOR_DEFAULT

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    left_is_stdin: bool = str(left_file_path) == "-"
    right_is_stdin: bool = str(right_file_path) == "-"

    if not right_join and left_is_stdin:
        print(
            "The left file may not be stdin when an inner join or left join is requested.",
            file=error_file,
            flush=True)
        return 1

    if not left_join and right_is_stdin:
        print(
            "The right file may not be stdin when an inner join or right join is requested.",
            file=error_file,
            flush=True)
        return 1

    # Only reachable when both left_join and right_join are set.
    if left_is_stdin and right_is_stdin:
        print("The left and right files may not both be stdin.",
              file=error_file,
              flush=True)
        return 1

    # Build the option structures.
    left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="left", fallback=True)
    right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="right", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--left-file=%s" % str(left_file_path), file=error_file)
        print("--right-file=%s" % str(right_file_path), file=error_file)
        print("--output-file=%s" % str(output_file_path), file=error_file)

        print("--left-join=%s" % str(left_join), file=error_file)
        print("--right-join=%s" % str(right_join), file=error_file)
        print("--join-on-id=%s" % str(join_on_id), file=error_file)
        print("--join-on-label=%s" % str(join_on_label), file=error_file)
        print("--join-on-node2=%s" % str(join_on_node2), file=error_file)
        if left_join_columns is not None:
            print("--left-join-columns=%s" % " ".join(left_join_columns),
                  file=error_file)
        if right_join_columns is not None:
            print("--right-join-columns=%s" % " ".join(right_join_columns),
                  file=error_file)
        if left_prefix is not None:
            print("--left-prefix=%s" % str(left_prefix), file=error_file)
        if right_prefix is not None:
            print("--right-prefix=%s" % str(right_prefix), file=error_file)
        print("--field-separator=%s" % repr(field_separator), file=error_file)

        left_reader_options.show(out=error_file, who="left")
        right_reader_options.show(out=error_file, who="right")
        value_options.show(out=error_file)
        # Terminate the option dump with the same flushed separator line the
        # other commands in this file emit.
        print("=======", file=error_file, flush=True)

    try:
        kr: KgtkJoiner = KgtkJoiner(
            left_file_path=left_file_path,
            right_file_path=right_file_path,
            output_path=output_file_path,
            left_join=left_join,
            right_join=right_join,
            join_on_id=join_on_id,
            join_on_label=join_on_label,
            join_on_node2=join_on_node2,
            left_join_columns=left_join_columns,
            right_join_columns=right_join_columns,
            left_prefix=left_prefix,
            right_prefix=right_prefix,
            field_separator=field_separator,
            left_reader_options=left_reader_options,
            right_reader_options=right_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kr.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
示例#6
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reified_file: KGTKFiles,
        unreified_file: KGTKFiles,
        uninvolved_file: KGTKFiles,
        trigger_label_value: str,
        trigger_node2_value: str,
        value_label_value: str,
        old_label_value: str,
        new_label_value: typing.Optional[str],
        allow_multiple_values: bool,
        allow_extra_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkUnreifyValues over one input file, driven by parsed arguments.

    Resolves the required input/output paths and the three optional output
    paths (reified, unreified, uninvolved) through KGTKArgumentParser, builds
    the reader and value option structures from ``kwargs``, optionally echoes
    the effective options to the error stream, then constructs a
    KgtkUnreifyValues object and calls its ``process()`` method.  The label,
    node2 and ``allow_*`` arguments are passed through unchanged.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` selects the error stream.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised while constructing or
            running KgtkUnreifyValues.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.unreify.kgtkunreifyvalues import KgtkUnreifyValues
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reified_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reified_file, who="KGTK reified file")
    unreified_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            unreified_file, who="KGTK unreified file")
    uninvolved_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            uninvolved_file, who="KGTK uninvolved file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str) -> None:
            # Every option line is flushed immediately.
            print(text, file=error_file, flush=True)

        # A single input file is taken, so echo it in the same
        # "--name=value" form as the other options.
        show("--input-file=%s" % str(input_kgtk_file))
        show("--output-file=%s" % str(output_kgtk_file))
        if reified_kgtk_file is not None:
            show("--reified-file=%s" % str(reified_kgtk_file))
        if unreified_kgtk_file is not None:
            show("--unreified-file=%s" % str(unreified_kgtk_file))
        if uninvolved_kgtk_file is not None:
            show("--uninvolved-file=%s" % str(uninvolved_kgtk_file))

        show("--trigger-label=%s" % trigger_label_value)
        show("--trigger-node2=%s" % trigger_node2_value)
        show("--value-label=%s" % value_label_value)
        show("--old-label=%s" % old_label_value)
        if new_label_value is not None:
            show("--new-label=%s" % new_label_value)

        show("--allow-multiple-values=%s" % str(allow_multiple_values))
        show("--allow-extra-columns=%s" % str(allow_extra_columns))

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        show("=======")

    try:
        kuv: KgtkUnreifyValues = KgtkUnreifyValues(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reified_file_path=reified_kgtk_file,
            unreified_file_path=unreified_kgtk_file,
            uninvolved_file_path=uninvolved_kgtk_file,
            trigger_label_value=trigger_label_value,
            trigger_node2_value=trigger_node2_value,
            value_label_value=value_label_value,
            old_label_value=old_label_value,
            new_label_value=new_label_value,
            allow_multiple_values=allow_multiple_values,
            allow_extra_columns=allow_extra_columns,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kuv.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
示例#7
0
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find paths between (source, target) node pairs in a KGTK graph.

    Steps, in order:

    1. Read every row of the path file and collect (source, target) pairs
       from the columns located via ``source_column_name`` and
       ``target_column_name``.
    2. Open the input file, load it into a graph with
       ``load_graph_from_kgtk`` (directed unless ``undirected``), and open
       an edge-mode writer with columns node1, label, node2, id.
    3. Unless ``statistics_only``, write one row per graph edge.
    4. For every pair whose source and target each match exactly one vertex,
       enumerate paths (``all_shortest_paths`` when ``shortest_path``,
       otherwise ``all_paths`` with ``cutoff=max_hops``) and write one row
       per path edge: ``p<path number>``, the edge's position in the path,
       the edge id, and a generated row id.

    ``errors_to_stderr`` and ``show_options`` are accepted but never
    consulted in this function.

    The readers and the writer are closed on every exit path, including
    when an error is raised part-way through.

    Returns:
        None.

    Raises:
        KGTKException: for missing columns, a path-file row of the wrong
            length, or any other ``Exception``; the message is always
            prefixed with "Error: ".
    """
    # import modules locally
    from contextlib import closing
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # Vertex property used to look nodes up by name.
        # NOTE(review): presumably the property load_graph_from_kgtk stores
        # node names under -- confirm against the loader.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs = []
        paths_read: int = 0
        # closing() guarantees the path reader is released even when a
        # malformed row aborts the loop.
        with closing(
                KgtkReader.open(
                    path_kgtk_file,
                    error_file=error_file,
                    options=path_reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as pkr:
            path_source_idx: int = pkr.get_node1_column_index(
                source_column_name)
            if path_source_idx < 0:
                print("Missing node1 (source) column name in the path file.",
                      file=error_file,
                      flush=True)

            path_target_idx: int = pkr.get_node2_column_index(
                target_column_name)
            if path_target_idx < 0:
                print("Missing node2 (target) column name in the path file.",
                      file=error_file,
                      flush=True)
            if path_source_idx < 0 or path_target_idx < 0:
                raise KGTKException("Exiting due to missing columns.")

            path_row: typing.List[str]
            for path_row in pkr:
                paths_read += 1
                if len(path_row) != pkr.column_count:
                    raise KGTKException(
                        "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                        % (paths_read, str(path_kgtk_file), pkr.column_count,
                           len(path_row)))
                pairs.append(
                    (path_row[path_source_idx], path_row[path_target_idx]))

        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if not pairs:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        with closing(
                KgtkReader.open(
                    input_kgtk_file,
                    error_file=error_file,
                    options=input_reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as kr:

            # Report every missing column before giving up.
            sub_index: int = kr.get_node1_column_index()
            if sub_index < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred_index: int = kr.get_label_column_index()
            if pred_index < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj_index: int = kr.get_node2_column_index()
            if obj_index < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            id_index: int = kr.get_id_column_index()
            if id_index < 0:
                print("Missing id column", file=error_file, flush=True)
            if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred_index]

            G = load_graph_from_kgtk(kr,
                                     directed=not undirected,
                                     ecols=(sub_index, obj_index),
                                     verbose=verbose,
                                     out=error_file)

            output_columns: typing.List[str] = [
                'node1', 'label', 'node2', 'id'
            ]
            # The writer is the inner context, so it is closed before the
            # input reader, matching the original close order.
            with closing(
                    KgtkWriter.open(output_columns,
                                    output_kgtk_file,
                                    mode=KgtkWriter.Mode.EDGE,
                                    require_all_columns=True,
                                    prohibit_extra_columns=True,
                                    fill_missing_columns=False,
                                    verbose=verbose,
                                    very_verbose=very_verbose)) as kw:

                id_count = 0
                if not statistics_only:
                    for e in G.edges():
                        sid, oid = e
                        lbl = G.ep[predicate][e]
                        kw.write([
                            G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                            '{}-{}-{}'.format(G.vp[id_col][sid], lbl,
                                              id_count)
                        ])
                        id_count += 1
                    if verbose:
                        print("%d edges found." % id_count,
                              file=error_file,
                              flush=True)

                id_count = 0
                path_id = 0
                for source_node, target_node in pairs:
                    source_ids = find_vertex(G,
                                             prop=G.properties[('v', id_col)],
                                             match=source_node)
                    target_ids = find_vertex(G,
                                             prop=G.properties[('v', id_col)],
                                             match=target_node)
                    # Pairs whose endpoints are missing or ambiguous are
                    # skipped silently.
                    if len(source_ids) != 1 or len(target_ids) != 1:
                        continue

                    source_id = source_ids[0]
                    target_id = target_ids[0]
                    if shortest_path:
                        _all_paths = all_shortest_paths(G,
                                                        source_id,
                                                        target_id,
                                                        edges=True)
                    else:
                        _all_paths = all_paths(G,
                                               source_id,
                                               target_id,
                                               cutoff=max_hops,
                                               edges=True)

                    for path in _all_paths:
                        for edge_num, an_edge in enumerate(path):
                            # NOTE(review): the edge id is looked up under
                            # the literal property name 'id', while the label
                            # property uses the reader's actual column name;
                            # confirm this holds when the id column is
                            # spelled differently in the input file.
                            edge_id = G.properties[('e', 'id')][an_edge]
                            node1: str = 'p%d' % path_id
                            kw.write([
                                node1,
                                str(edge_num), edge_id,
                                '{}-{}-{}'.format(node1, edge_num, id_count)
                            ])
                            id_count += 1
                        path_id += 1

                if verbose:
                    print("%d paths containing %d edges found." %
                          (path_id, id_count),
                          file=error_file,
                          flush=True)

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
示例#8
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output while building IDs with KgtkIdBuilder.

    Resolves the input and output paths through KGTKArgumentParser, builds
    the ID-builder, reader and value option structures from ``kwargs``,
    optionally echoes the effective options to the error stream, then opens
    a KgtkReader, creates a KgtkIdBuilder from it, opens a KgtkWriter using
    the builder's column names and the reader's mode, and lets the builder
    process the stream.

    The reader and writer are closed on every exit path, including when the
    builder or the writer fails part-way through.

    ``errors_to_stderr`` is accepted but never consulted in this function;
    only ``errors_to_stdout`` selects the error stream.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wraps any ``Exception`` (using its string form) or
            ``SystemExit`` ("Exit requested") raised during processing.
    """
    # import modules locally
    from contextlib import closing
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file; closing() releases it on every exit path.
        with closing(
                KgtkReader.open(
                    input_kgtk_file,
                    error_file=error_file,
                    options=reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as kr:

            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.  As the inner context, the writer is
            # closed before the reader, matching the original close order.
            with closing(
                    KgtkWriter.open(idb.column_names,
                                    output_kgtk_file,
                                    mode=KgtkWriter.Mode[kr.mode.name],
                                    require_all_columns=True,
                                    prohibit_extra_columns=True,
                                    fill_missing_columns=False,
                                    gzip_in_parallel=False,
                                    verbose=verbose,
                                    very_verbose=very_verbose)) as ew:

                # Process the input file, building IDs.
                idb.process(kr, ew)

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
示例#9
0
文件: implode.py 项目: yyht/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Command entry point that drives a KgtkImplode over one input file.

    The file arguments are resolved through KGTKArgumentParser, the ID
    builder / reader / value option objects are built from ``kwargs``, and a
    KgtkImplode configured with the remaining arguments is constructed and
    run.

    When ``show_options`` is true the effective settings are first echoed to
    the diagnostic stream.  When ``show_data_types`` is true the choices of
    ``KgtkFormat.DataType`` are listed on that stream and the function
    returns without processing any data.

    ``errors_to_stdout`` selects stdout as the diagnostic stream; otherwise
    stderr is used.  ``errors_to_stderr`` is accepted but not read here.

    Returns:
        0 on success (including the ``show_data_types`` early exit).

    Raises:
        KGTKException: when constructing or running the KgtkImplode raises
            SystemExit ("Exit requested") or any other Exception (carrying
            that exception's message).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    rejects_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    diag: typing.TextIO
    if errors_to_stdout:
        diag = sys.stdout
    else:
        diag = sys.stderr

    # Option structures, each extracted from the same kwargs dictionary.
    id_opts: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        def emit(text: str) -> None:
            # One flushed line on the diagnostic stream.
            print(text, file=diag, flush=True)

        # The first line is the only one written without an explicit flush.
        print("--input-file=%s" % str(source_path), file=diag)
        emit("--output-file=%s" % str(target_path))
        if rejects_path is not None:
            emit("--reject-file=%s" % str(rejects_path))

        emit("--column %s" % column_name)
        emit("--prefix %s" % prefix)
        emit("--overwrite %s" % str(overwrite_column))
        emit("--validate %s" % str(validate))
        emit("--escape-pipes %s" % str(escape_pipes))
        emit("--quantities-include-numbers %s" % str(quantities_include_numbers))
        emit("--general-strings %s" % str(general_strings))
        emit("--remove-prefixed-columns %s" % str(remove_prefixed_columns))
        emit("--ignore-unselected-types %s" % str(ignore_unselected_types))
        emit("--retain-unselected-types %s" % str(retain_unselected_types))
        if type_names is not None:
            emit("--types %s" % " ".join(type_names))
        if without_fields is not None:
            emit("--without %s" % " ".join(without_fields))
        emit("--show-data-types %s" % str(show_data_types))
        emit("--quiet %s" % str(quiet))
        emit("--build-id=%s" % str(build_id))
        id_opts.show(out=diag)
        rd_opts.show(out=diag)
        val_opts.show(out=diag)
        emit("=======")

    if show_data_types:
        # Listing mode: print the known data type names and stop.
        for type_label in KgtkFormat.DataType.choices():
            print("%s" % type_label, file=diag, flush=True)
        return 0

    # KgtkImplode is always handed a list, never None.
    excluded_fields: typing.List[str]
    if without_fields is None:
        excluded_fields = list()
    else:
        excluded_fields = without_fields

    try:
        imploder: KgtkImplode = KgtkImplode(
            input_file_path=source_path,
            output_file_path=target_path,
            reject_file_path=rejects_path,
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=excluded_fields,
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            idbuilder_options=id_opts,
            reader_options=rd_opts,
            value_options=val_opts,
            error_file=diag,
            verbose=verbose,
            very_verbose=very_verbose)
        imploder.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
示例#10
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        list_output_file: KGTKFiles,
        key_column_names: typing.List[str],
        keep_first_names: typing.List[str],
        compact_id: bool,
        deduplicate: bool,
        sorted_input: bool,
        verify_sort: bool,
        lists_in_input: bool,
        report_lists: bool,
        exclude_lists: bool,
        output_only_lists: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Command entry point that drives a KgtkCompact over one input file.

    The file arguments are resolved through KGTKArgumentParser, the ID
    builder / reader / value option objects are built from ``kwargs``, and a
    KgtkCompact configured with the remaining arguments is constructed and
    run.  When ``show_options`` is true the effective settings are echoed to
    the diagnostic stream first (stdout if ``errors_to_stdout``, else
    stderr).

    Returns:
        0 on success.

    Raises:
        KGTKException: when ``exclude_lists`` and ``output_only_lists`` are
            both set, or when constructing or running the KgtkCompact raises
            SystemExit ("Exit requested") or any other Exception (carrying
            that exception's message).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    lists_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        list_output_file, who="KGTK list output file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    diag: typing.TextIO
    if errors_to_stdout:
        diag = sys.stdout
    else:
        diag = sys.stderr

    # Option structures, each extracted from the same kwargs dictionary.
    id_opts: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        def emit(text: str, flush: bool = True) -> None:
            # One line on the diagnostic stream; some lines are not flushed.
            print(text, file=diag, flush=flush)

        emit("--input-file=%s" % str(source_path), flush=False)
        emit("--output-file=%s" % str(target_path), flush=False)
        if lists_path is not None:
            emit("--list-output-file=%s" % str(lists_path))
        emit("--columns=%s" % " ".join(key_column_names), flush=False)
        emit("--keep-first=%s" % " ".join(keep_first_names), flush=False)
        emit("--compact-id=%s" % str(compact_id))
        emit("--deduplicate=%s" % str(deduplicate))
        emit("--presorted=%s" % str(sorted_input))
        emit("--verify-sort=%s" % str(verify_sort))
        emit("--lists-in-input=%s" % str(lists_in_input))
        emit("--report-lists=%s" % str(report_lists))
        emit("--exclude-lists=%s" % str(exclude_lists))
        emit("--output-only-lists=%s" % str(output_only_lists))
        emit("--build-id=%s" % str(build_id))
        id_opts.show(out=diag)
        rd_opts.show(out=diag)
        val_opts.show(out=diag)
        KgtkReader.show_debug_arguments(
            errors_to_stdout=errors_to_stdout,
            errors_to_stderr=errors_to_stderr,
            show_options=show_options,
            verbose=verbose,
            very_verbose=very_verbose,
            out=diag,
        )
        emit("=======")

    # The two list-handling modes contradict each other.
    if output_only_lists and exclude_lists:
        raise KGTKException(
            "--exclude-lists and --output-only-lists may not be used together."
        )

    try:
        compactor: KgtkCompact = KgtkCompact(
            input_file_path=source_path,
            output_file_path=target_path,
            list_output_file_path=lists_path,
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            idbuilder_options=id_opts,
            reader_options=rd_opts,
            value_options=val_opts,
            error_file=diag,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        compactor.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
示例#11
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file while dropping (or keeping only) selected columns.

    ``columns`` is expanded into a list of column names: optionally re-split
    on whitespace (``split_on_spaces``), optionally split on commas
    (``split_on_commas``), optionally stripped (``strip_spaces``); empty
    names are discarded.  With ``all_except`` the named columns are the ones
    retained, in input-file order; otherwise they are the ones removed.
    Names that do not exist in the input file are an error unless
    ``ignore_missing_columns`` is set.

    The reader and the writer are closed on both the success and the error
    path.  Failures while closing are routed through the same handler as any
    other error.

    ``errors_to_stdout`` selects stdout as the diagnostic stream; otherwise
    stderr is used.  ``errors_to_stderr`` is accepted but not read here.

    Returns:
        0 on success; 1 if ``kgtk_exception_auto_handler`` returns after
        handling an exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        print("--split-on-commas=%s" % str(split_on_commas), file=error_file)
        print("--split-on-spaces=%s" % str(split_on_spaces), file=error_file)
        print("--strip-spaces=%s" % str(strip_spaces), file=error_file)
        print("--all-except=%s" % str(all_except), file=error_file)
        print("--ignore-missing-columns=%s" % str(ignore_missing_columns),
              file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Tracked outside the try so that the cleanup code can tell which of the
    # two files were actually opened.
    kr: typing.Optional[KgtkReader] = None
    kw: typing.Optional[KgtkWriter] = None

    try:
        try:
            # Treating "no columns given" as an empty list simplifies matters.
            requested: typing.List[str] = [] if columns is None else columns

            if split_on_spaces:
                # We will be very lenient, and allow space-separated arguments
                # *inside* shell quoting, e.g.
                #
                # kgtk remove_columns -c 'name name2 name3'
                #
                # Do not enable this option if spaces are legal inside your
                # column names.
                requested = " ".join(requested).split()

            remove_columns: typing.List[str] = []
            arg: str
            column_name: str
            for arg in requested:
                if split_on_commas:
                    for column_name in arg.split(","):
                        if strip_spaces:
                            column_name = column_name.strip()
                        if len(column_name) > 0:
                            remove_columns.append(column_name)
                else:
                    if strip_spaces:
                        arg = arg.strip()
                    if len(arg) > 0:
                        remove_columns.append(arg)

            if verbose:
                if all_except:
                    print("Removing all columns except %d columns: %s" %
                          (len(remove_columns), " ".join(remove_columns)),
                          file=error_file,
                          flush=True)
                else:
                    print("Removing %d columns: %s" %
                          (len(remove_columns), " ".join(remove_columns)),
                          file=error_file,
                          flush=True)
            if len(remove_columns) == 0:
                raise KGTKException("No columns to remove")

            if verbose:
                print("Opening the input file: %s" % str(input_kgtk_file),
                      file=error_file,
                      flush=True)
            kr = KgtkReader.open(
                input_kgtk_file,
                options=reader_options,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            output_column_names: typing.List[str]

            # Names the user asked for that the input file does not have.
            trouble_column_names: typing.List[str] = []
            if all_except:
                if not ignore_missing_columns:
                    for column_name in remove_columns:
                        if column_name not in kr.column_names:
                            print("Error: cannot retain unknown column '%s'." %
                                  column_name,
                                  file=error_file,
                                  flush=True)
                            trouble_column_names.append(column_name)

                # Keep the selected columns, preserving input-file order.
                retained: typing.Set[str] = set(remove_columns)
                output_column_names = [
                    column_name for column_name in kr.column_names
                    if column_name in retained
                ]

            else:
                output_column_names = kr.column_names.copy()
                for column_name in remove_columns:
                    if column_name in output_column_names:
                        output_column_names.remove(column_name)

                    elif not ignore_missing_columns:
                        print("Error: cannot remove unknown column '%s'." %
                              column_name,
                              file=error_file,
                              flush=True)
                        trouble_column_names.append(column_name)

            if len(trouble_column_names) > 0:
                raise KGTKException("Unknown columns %s" %
                                    " ".join(trouble_column_names))

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file,
                      flush=True)
            kw = KgtkWriter.open(output_column_names,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            # Maps the input column order onto the (reduced) output order.
            shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

            input_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1
                kw.write(row, shuffle_list=shuffle_list)

            if verbose:
                print("Processed %d rows." % (input_line_count),
                      file=error_file,
                      flush=True)

            return 0

        finally:
            # Release both files whether or not processing succeeded.  This
            # sits inside the outer try so that an error raised while closing
            # still reaches kgtk_exception_auto_handler; the nested finally
            # makes sure a writer failure cannot leave the reader open.
            try:
                if kw is not None:
                    kw.close()
            finally:
                if kr is not None:
                    kr.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
示例#12
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs):
    """Train graph embeddings for a KGTK edge file and write them out.

    **kwargs stores all parameters provided by the user.  The keys read
    directly by this function are:

    * ``log_file_path`` -- when not None, logging is configured to write to
      that file and only short notices are printed to stderr.
    * ``temporary_directory`` -- working folder; it is created when missing
      and holds the temporary TSV produced from the input file.
    * ``errors_to_stdout`` (optional) -- send reader diagnostics to stdout
      instead of stderr.
    * ``output_format`` -- ``'glove'`` copies the entity TSV as-is,
      ``'w2v'`` calls ``generate_w2v_output``, anything else calls
      ``generate_kgtk_output``.
    * ``output_no_header`` (optional) -- forwarded to
      ``generate_kgtk_output``.
    * ``retain_temporary_data`` -- when equal to False the temporary
      directory is removed at the end.

    The whole dictionary is also forwarded to ``get_config`` and to the
    reader / value option builders.

    Raises:
        KGTKException: wrapping the message of any Exception raised while
            preparing, training or exporting.
    """
    # print(kwargs)

    # import modules locally
    import sys
    import typing
    import os
    import logging
    from pathlib import Path
    import json
    import h5py
    import gzip
    import torch
    import shutil
    from torchbiggraph.config import parse_config
    from kgtk.exceptions import KGTKException
    # copy  missing file under kgtk/graph_embeddings
    from kgtk.templates.kgtkcopytemplate import KgtkCopyTemplate
    from kgtk.graph_embeddings.importers import TSVEdgelistReader, convert_input_data
    from torchbiggraph.train import train
    from torchbiggraph.util import SubprocessInitializer, setup_logging
    from kgtk.graph_embeddings.export_to_tsv import make_tsv
    # from torchbiggraph.converters.export_to_tsv import make_tsv

    try:
        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # store the data into log file, then the console will not output anything
        if kwargs['log_file_path'] is not None:
            log_file_path = kwargs['log_file_path']
            logging.basicConfig(
                format='%(asctime)s - %(filename)s[line:%(lineno)d] \
            - %(levelname)s: %(message)s',
                level=logging.DEBUG,
                filename=str(log_file_path),
                filemode='w')
            print(
                f'In Processing, Please go to {kwargs["log_file_path"]} to check details',
                file=sys.stderr,
                flush=True)

        tmp_folder = kwargs['temporary_directory']
        tmp_tsv_path: Path = tmp_folder / f'tmp_{input_kgtk_file.name}'
        # tmp_tsv_path:Path = input_kgtk_file.parent/f'tmp_{input_kgtk_file.name}'

        # Make sure the tmp folder exists; creating it in one call avoids the
        # window between a separate existence check and the creation.
        os.makedirs(tmp_folder, exist_ok=True)

        # If a previous output file exists, delete it.  Only filesystem
        # errors (e.g. the file not being there) are ignored, so that an
        # interrupt or exit request is no longer swallowed here.
        try:
            output_kgtk_file.unlink()
        except OSError:
            pass  # didn't find, then let it go

        # *********************************************
        # 0. PREPARE PBG TSV FILE
        # *********************************************
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)
        error_file: typing.TextIO = sys.stdout if kwargs.get(
            "errors_to_stdout") else sys.stderr
        kct: KgtkCopyTemplate = KgtkCreateTmpTsv(
            input_file_path=input_kgtk_file,
            output_file_path=tmp_tsv_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        # prepare the graph file
        # create a tmp tsv file for PBG embedding

        logging.info('Generate the valid tsv format for embedding ...')
        kct.process()
        logging.info('Embedding file is ready...')

        # *********************************************
        # 1. DEFINE CONFIG
        # *********************************************
        raw_config = get_config(**kwargs)

        # set the corresponding learning rate and loss function for each algorithm
        processed_config = config_preprocess(raw_config)

        # temporary output folder
        tmp_output_folder = Path(processed_config['entity_path'])

        # If the temporary output folder already exists (e.g. left over from
        # an earlier run), remove it.  As above, only filesystem errors are
        # ignored.
        try:
            shutil.rmtree(tmp_output_folder)
        except OSError:
            pass  # didn't find, then let it go

        # **************************************************
        # 2. TRANSFORM GRAPH TO A BIGGRAPH-FRIENDLY FORMAT
        # **************************************************
        setup_logging()
        config = parse_config(processed_config)
        subprocess_init = SubprocessInitializer()
        input_edge_paths = [tmp_tsv_path]

        convert_input_data(
            config.entities,
            config.relations,
            config.entity_path,
            config.edge_paths,
            input_edge_paths,
            TSVEdgelistReader(lhs_col=0, rel_col=1, rhs_col=2),
            dynamic_relations=config.dynamic_relations,
        )

        # ************************************************
        # 3. TRAIN THE EMBEDDINGS
        # ************************************************
        train(config, subprocess_init=subprocess_init)

        # ************************************************
        # 4. GENERATE THE OUTPUT
        # ************************************************
        # entities_output = output_kgtk_file
        entities_output = tmp_output_folder / 'entities_output.tsv'
        relation_types_output = tmp_output_folder / 'relation_types_tf.tsv'

        # "x" mode: fail rather than overwrite if either file already exists.
        with open(entities_output,
                  "xt") as entities_tf, open(relation_types_output,
                                             "xt") as relation_types_tf:
            make_tsv(config, entities_tf, relation_types_tf)

        # output the correct format for the embeddings
        if kwargs['output_format'] == 'glove':  # glove format output
            shutil.copyfile(entities_output, output_kgtk_file)
        elif kwargs['output_format'] == 'w2v':  # w2v format output
            generate_w2v_output(entities_output, output_kgtk_file, kwargs)

        else:  # write to the kgtk output format tsv
            generate_kgtk_output(entities_output, output_kgtk_file,
                                 kwargs.get('output_no_header', False),
                                 verbose, very_verbose)

        logging.info(f'Embeddings has been generated in {output_kgtk_file}.')

        # ************************************************
        # 5. Garbage collection
        # ************************************************
        # The explicit comparison is kept on purpose: a missing/None setting
        # does not compare equal to False and therefore keeps the data.
        if kwargs['retain_temporary_data'] == False:
            shutil.rmtree(kwargs['temporary_directory'])
            # tmp_tsv_path.unlink() # delete temporary tsv file
            # shutil.rmtree(tmp_output_folder) # delete temporary output folder

        if kwargs["log_file_path"] is not None:
            print('Processed Finished.', file=sys.stderr, flush=True)
            logging.info(
                f"Process Finished.\nOutput has been saved in {repr(str(output_kgtk_file))}"
            )
        else:
            print(
                f"Process Finished.\nOutput has been saved in {repr(str(output_kgtk_file))}",
                file=sys.stderr,
                flush=True)

    except Exception as e:
        raise KGTKException(str(e))
示例#13
0
def run(
        input_file: KGTKFiles,
        entity_label_files: KGTKFiles,
        output_file: KGTKFiles,
        label_properties: typing.Optional[typing.List[str]],
        description_properties: typing.Optional[typing.List[str]],
        isa_properties: typing.Optional[typing.List[str]],
        has_properties: typing.Optional[typing.List[str]],
        property_values: typing.Optional[typing.List[str]],
        sentence_label: str,
        explain: bool,
        presorted: bool,
        add_entity_labels_from_input: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Command entry point that runs a Lexicalize pass over a KGTK file.

    A Lexicalize object is built from the property lists (each falling back
    to its module-level DEFAULT_* value when None) and ``sentence_label``.
    Entity label files, when any are given, are loaded into it first.  The
    input file is then opened, checked for node1 / label / node2 columns,
    and handed together with a freshly opened writer to either
    ``process_presorted_input`` or ``process_unsorted_input`` depending on
    ``presorted``.  With ``explain`` an extra output column is added.

    ``errors_to_stdout`` selects stdout as the diagnostic stream; otherwise
    stderr is used.  ``errors_to_stderr`` is accepted but not read here.

    Returns:
        0 on success.

    Raises:
        KGTKException: for a missing required column, or wrapping the
            message of any Exception raised while opening or processing the
            files.  The writer and reader are closed in all cases.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException

    from kgtk.gt.lexicalize_utils import Lexicalize

    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    label_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(
        entity_label_files,
        who="The entity label file(s)",
        default_stdin=False)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stderr unless stdout was explicitly requested.
    diag: typing.TextIO
    if errors_to_stdout:
        diag = sys.stdout
    else:
        diag = sys.stderr

    # Option structures, each extracted from the same kwargs dictionary.
    rd_opts: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_opts: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Substitute the module-level defaults for any property list left unset.
    label_properties = DEFAULT_LABEL_PROPERTIES if label_properties is None else label_properties
    description_properties = (DEFAULT_DESCRIPTION_PROPERTIES
                              if description_properties is None else description_properties)
    isa_properties = DEFAULT_ISA_PROPERTIES if isa_properties is None else isa_properties
    has_properties = DEFAULT_HAS_PROPERTIES if has_properties is None else has_properties
    property_values = DEFAULT_PROPERTY_VALUES if property_values is None else property_values

    if show_options:
        def emit(text: str) -> None:
            # One flushed line on the diagnostic stream.
            print(text, file=diag, flush=True)

        emit("--input-file=%s" % str(source_path))
        if len(label_paths) > 0:
            emit("--entity-label-files %s" % " ".join(str(path) for path in label_paths))
        emit("--output-file=%s" % str(target_path))

        # List-valued options are only reported when they are non-empty.
        template: str
        values: typing.List[str]
        for template, values in (
                ("--label-properties %s", label_properties),
                ("--description-properties %s", description_properties),
                ("--isa-properties %s", isa_properties),
                ("--has-properties %s", has_properties),
                ("--property-values %s", property_values),
        ):
            if len(values) > 0:
                emit(template % " ".join(values))

        emit("--sentence-label=%s" % str(sentence_label))
        emit("--explain=%s" % str(explain))
        emit("--presorted=%s" % str(presorted))

        rd_opts.show(out=diag)
        val_opts.show(out=diag)
        emit("=======")

    lexicalizer: Lexicalize = Lexicalize(
        label_properties,
        description_properties,
        isa_properties,
        has_properties,
        property_values,
        sentence_label,
        explain=explain,
        error_file=diag,
        verbose=verbose,
        very_verbose=very_verbose,
    )
    if len(label_paths) > 0:
        lexicalizer.load_entity_label_files(
            label_paths,
            diag,
            rd_opts,
            val_opts,
            label_properties=label_properties,
            verbose=verbose,
        )

    # Kept outside the try so the finally clause can see what was opened.
    reader: typing.Optional[KgtkReader] = None
    writer: typing.Optional[KgtkWriter] = None

    try:
        if verbose:
            print("Opening the input file %s" % str(source_path), file=diag, flush=True)
        reader = KgtkReader.open(
            source_path,
            options=rd_opts,
            value_options=val_opts,
            error_file=diag,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # All three edge columns must be present in the input file.
        if reader.node1_column_idx < 0:
            raise KGTKException("Missing column: node1 or alias")
        if reader.label_column_idx < 0:
            raise KGTKException("Missing column: label or alias")
        if reader.node2_column_idx < 0:
            raise KGTKException("Missing column: node2 or alias")

        if verbose:
            print("node1 column index = {}".format(reader.node1_column_idx), file=diag, flush=True)
            print("label column index = {}".format(reader.label_column_idx), file=diag, flush=True)
            print("node2 column index = {}".format(reader.node2_column_idx), file=diag, flush=True)

        # Work on a copy so the module-level column list is never modified.
        columns_out: typing.List[str] = OUTPUT_COLUMNS.copy()
        if explain:
            columns_out.append("explaination")
            if verbose:
                print("Including an explaination column in the output.", file=diag, flush=True)

        if verbose:
            print("Opening the output file %s" % str(target_path), file=diag, flush=True)
        writer = KgtkWriter.open(
            columns_out,
            target_path,
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        if not presorted:
            lexicalizer.process_unsorted_input(reader, writer,
                                               add_entity_labels=add_entity_labels_from_input)
        else:
            lexicalizer.process_presorted_input(reader, writer)

        return 0

    except Exception as err:
        raise KGTKException(str(err))

    finally:
        # Writer first, then reader; each only if it was actually opened.
        if writer is not None:
            writer.close()

        if reader is not None:
            reader.close()
示例#14
0
文件: filter.py 项目: usbader/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Split the rows of a KGTK file according to a subject;predicate;object pattern.

    The pattern must contain exactly three sections separated by semicolons.
    Each section is a comma-separated list of accepted values for the subject,
    predicate and object column respectively; an empty section applies no
    filter to that column.  Rows that pass are written to the output file and
    the remaining rows are written to the reject file when one was supplied.

    Args:
        input_file: Input file argument, resolved through KGTKArgumentParser.
        output_file: Output file argument; receives the rows that pass.
        reject_file: Optional output file argument; receives the other rows.
        pattern: The "subjects;predicates;objects" filter pattern.
        subj_col: Optional name of the subject column (passed to the reader's
            node1 column lookup).
        pred_col: Optional name of the predicate column (label column lookup).
        obj_col: Optional name of the object column (node2 column lookup).
        or_pattern: When true, a row passes if any applied filter matches;
            otherwise a row passes only if no applied filter fails.
        invert: Swap the pass/reject decision for every row.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not read by this function.
        show_options: Print the effective options before processing.
        verbose: Print progress and summary messages.
        very_verbose: Forwarded to the reader and the writers.
        **kwargs: Used to build KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.  On any exception the exception is handed to
        kgtk_exception_auto_handler and 1 is returned.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject path must come from reject_file; resolving it from
    # output_file would ignore the caller's reject file argument entirely.
    reject_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(reject_file,
                                                    who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one comma-separated pattern section into its non-empty values."""
        return {
            target
            for target in (piece.strip() for piece in pattern.split(","))
            if len(target) > 0
        }

    try:
        kr: typing.Optional[KgtkReader] = None
        kw: typing.Optional[KgtkWriter] = None
        rw: typing.Optional[KgtkWriter] = None
        try:
            patterns: typing.List[str] = pattern.split(";")
            if len(patterns) != 3:
                print(
                    "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                    file=error_file,
                    flush=True)
                raise KGTKException("Bad pattern")

            subj_filter: typing.Set[str] = prepare_filter(patterns[0])
            pred_filter: typing.Set[str] = prepare_filter(patterns[1])
            obj_filter: typing.Set[str] = prepare_filter(patterns[2])
            apply_subj_filter: bool = len(subj_filter) > 0
            apply_pred_filter: bool = len(pred_filter) > 0
            apply_obj_filter: bool = len(obj_filter) > 0

            if verbose and not (apply_subj_filter or apply_pred_filter
                                or apply_obj_filter):
                print("Warning: the filter is empty.",
                      file=error_file,
                      flush=True)

            if verbose:
                print("Opening the input file: %s" % str(input_kgtk_file),
                      file=error_file,
                      flush=True)
            kr = KgtkReader.open(
                input_kgtk_file,
                options=reader_options,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            subj_idx: int = kr.get_node1_column_index(subj_col)
            pred_idx: int = kr.get_label_column_index(pred_col)
            obj_idx: int = kr.get_node2_column_index(obj_col)

            # Complain about a missing column only when it is needed by the pattern.
            trouble: bool = False
            if subj_idx < 0 and apply_subj_filter:
                trouble = True
                print("Error: Cannot find the subject column '%s'." %
                      kr.get_node1_canonical_name(subj_col),
                      file=error_file,
                      flush=True)
            if pred_idx < 0 and apply_pred_filter:
                trouble = True
                print("Error: Cannot find the predicate column '%s'." %
                      kr.get_label_canonical_name(pred_col),
                      file=error_file,
                      flush=True)
            if obj_idx < 0 and apply_obj_filter:
                trouble = True
                print("Error: Cannot find the object column '%s'." %
                      kr.get_node2_canonical_name(obj_col),
                      file=error_file,
                      flush=True)
            if trouble:
                raise KGTKException("Missing columns.")

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file,
                      flush=True)
            kw = KgtkWriter.open(kr.column_names,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            if reject_kgtk_file is not None:
                if verbose:
                    print("Opening the reject file: %s" %
                          str(reject_kgtk_file),
                          file=error_file,
                          flush=True)
                rw = KgtkWriter.open(kr.column_names,
                                     reject_kgtk_file,
                                     mode=KgtkWriter.Mode[kr.mode.name],
                                     verbose=verbose,
                                     very_verbose=very_verbose)

            input_line_count: int = 0
            reject_line_count: int = 0
            output_line_count: int = 0
            subj_filter_keep_count: int = 0
            pred_filter_keep_count: int = 0
            obj_filter_keep_count: int = 0
            subj_filter_reject_count: int = 0
            pred_filter_reject_count: int = 0
            obj_filter_reject_count: int = 0

            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                # keep: at least one applied filter matched this row.
                # reject: at least one applied filter failed on this row.
                keep: bool = False
                reject: bool = False
                if apply_subj_filter:
                    if row[subj_idx] in subj_filter:
                        keep = True
                        subj_filter_keep_count += 1
                    else:
                        reject = True
                        subj_filter_reject_count += 1

                if apply_pred_filter:
                    if row[pred_idx] in pred_filter:
                        keep = True
                        pred_filter_keep_count += 1
                    else:
                        reject = True
                        pred_filter_reject_count += 1

                if apply_obj_filter:
                    if row[obj_idx] in obj_filter:
                        keep = True
                        obj_filter_keep_count += 1
                    else:
                        reject = True
                        obj_filter_reject_count += 1

                # OR mode passes a row when any filter matched; AND mode passes
                # it when no filter failed.  `invert` flips either decision.
                rejected: bool
                if or_pattern:
                    rejected = not (keep ^ invert)
                else:
                    rejected = reject ^ invert

                if rejected:
                    if rw is not None:
                        rw.write(row)
                    reject_line_count += 1
                else:
                    kw.write(row)
                    output_line_count += 1

            if verbose:
                # Send the summary to error_file so that it cannot be mixed
                # into data written to standard output.
                print("Read %d rows, rejected %d rows, wrote %d rows." %
                      (input_line_count, reject_line_count, output_line_count),
                      file=error_file,
                      flush=True)
                print("Keep counts: subject=%d, predicate=%d, object=%d." %
                      (subj_filter_keep_count, pred_filter_keep_count,
                       obj_filter_keep_count),
                      file=error_file,
                      flush=True)
                print("Reject counts: subject=%d, predicate=%d, object=%d." %
                      (subj_filter_reject_count, pred_filter_reject_count,
                       obj_filter_reject_count),
                      file=error_file,
                      flush=True)

            return 0

        finally:
            # Release every file that was opened, on success and on error.
            # A failure while closing is still reported by the handler below.
            if kw is not None:
                kw.close()
            if rw is not None:
                rw.close()
            if kr is not None:
                kr.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
示例#15
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],
        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,
        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],
        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkLift from the command-line arguments and run it.

    The input, output and optional label file arguments are resolved through
    KGTKArgumentParser; reader and value options are built from ``kwargs``.
    Every other lift-related argument is forwarded unchanged to the KgtkLift
    constructor under the same keyword name, and ``process()`` is then called.

    Args:
        input_file: Input file argument.
        output_file: Output file argument.
        label_file: Optional label file argument.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not read by this function.
        show_options: Print the effective options to the error stream first.
        verbose: Forwarded to KgtkLift.
        very_verbose: Forwarded to KgtkLift.
        **kwargs: Used to build KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: Wraps any Exception raised while lifting, or reports
            "Exit requested" when SystemExit is raised.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_input_file(
            label_file, who="KGTK label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str) -> None:
            # Every option line goes to the selected error stream, so that the
            # report is never split between stdout and stderr.
            print(text, file=error_file, flush=True)

        show("--input-file=%s" % str(input_kgtk_file))
        show("--output-file=%s" % str(output_kgtk_file))
        if label_kgtk_file is not None:
            show("--label-file=%s" % label_kgtk_file)

        if input_select_column_name is not None:
            show("--input-select-column=%s" % input_select_column_name)
        if input_select_column_value is not None:
            show("--input-select-value=%s" % input_select_column_value)
        if input_lifting_column_names:
            show("--columns-to-lift %s" % " ".join(input_lifting_column_names))
        if output_lifted_column_names:
            show("--columns-to-write %s" %
                 " ".join(output_lifted_column_names))

        show("--lift-suffix=%s" % output_lifted_column_suffix)
        if output_select_column_value is not None:
            show("--update-select-value=%s" % output_select_column_value)

        if label_select_column_name is not None:
            show("--label-select-column=%s" % label_select_column_name)
        show("--label-select-value=%s" % label_select_column_value)
        if label_match_column_name is not None:
            show("--label-match-column=%s" % label_match_column_name)
        if label_value_column_name is not None:
            show("--label-value-column=%s" % label_value_column_name)

        show("--remove-label-records=%s" % str(remove_label_records))
        show("--sort-lifted-labels=%s" % str(sort_lifted_labels))
        show("--suppress-duplicate-labels=%s" % str(suppress_duplicate_labels))
        show("--suppress-empty-columns=%s" % str(suppress_empty_columns))
        show("--ok-if-no-labels=%s" % str(ok_if_no_labels))
        show("--prefilter-labels=%s" % str(prefilter_labels))
        show("--input-file-is-presorted=%s" % str(input_is_presorted))
        show("--label-file-is-presorted=%s" % str(labels_are_presorted))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        show("=======")

    try:
        kl: KgtkLift = KgtkLift(
            input_file_path=input_kgtk_file,
            label_file_path=label_kgtk_file,
            output_file_path=output_kgtk_file,
            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,
            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,
            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,
            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kl.process()

        return 0

    except SystemExit:
        # SystemExit is not an Exception subclass, so it needs its own clause
        # to be reported as a KGTKException.
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
示例#16
0
def run(input_files: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        header_only: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Open each input file with KgtkReader and read it through.

    Each file is opened with the reader and value options built from
    ``kwargs``.  With ``header_only`` the reader is closed right after it has
    been opened; otherwise every data row is read and counted.  Any checking
    of the file contents is presumably done by KgtkReader itself while
    opening and iterating; this function only drives the reader.

    Args:
        input_files: Input file list argument, resolved through
            KGTKArgumentParser.
        errors_to_stdout: Not read by this function.
        errors_to_stderr: Send messages to stderr instead of stdout.
        header_only: Stop after opening each file instead of reading its rows.
        show_options: Print the effective options before processing.
        verbose: Print per-file progress messages.
        very_verbose: Forwarded to KgtkReader.
        **kwargs: Used to build KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 when every file has been processed.

    Raises:
        KGTKException: Wraps any Exception raised while reading, or reports
            "Exit requested" when SystemExit is raised.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    kgtk_files: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_files)

    # Select where to send error messages, defaulting to stdout: stderr is
    # used only when errors_to_stderr is set.
    error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files: %s" % " ".join((str(kgtk_file) for kgtk_file in kgtk_files)), file=error_file)
        print("--header-only=%s" % str(header_only), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kgtk_file: Path
        for kgtk_file in kgtk_files:
            if verbose:
                # Keep the separator on the same stream as the messages it separates.
                print("\n====================================================", file=error_file, flush=True)
                if str(kgtk_file) != "-":
                    print("Validating '%s'" % str(kgtk_file), file=error_file, flush=True)
                else:
                    print ("Validating from stdin", file=error_file, flush=True)

            kr: KgtkReader = KgtkReader.open(kgtk_file,
                                             error_file=error_file,
                                             options=reader_options,
                                             value_options=value_options,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                if header_only:
                    if verbose:
                        print("Validated the header only.", file=error_file, flush=True)
                else:
                    line_count: int = 0
                    row: typing.List[str]
                    for row in kr:
                        line_count += 1
                    if verbose:
                        print("Validated %d data lines" % line_count, file=error_file, flush=True)
            finally:
                # Close the reader in every case, so that validating many
                # files does not accumulate open readers.
                kr.close()
        return 0

    except SystemExit:
        # SystemExit is not an Exception subclass, so it needs its own clause
        # to be reported as a KGTKException.
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
示例#17
0
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    columns: typing.Optional[typing.List[str]] = None,
    locale: str = "C",
    reverse_sort: bool = False,
    reverse_columns: typing.Optional[typing.List[str]] = None,
    numeric_sort: bool = False,
    numeric_columns: typing.Optional[typing.List[str]] = None,
    pure_python: bool = False,
    extra: typing.Optional[str] = None,
    bash_command: str = "bash",
    bzip2_command: str = "bzip2",
    gzip_command: str = "gzip",
    pgrep_command: str = "pgrep",
    sort_command: str = "sort",
    xz_command: str = "xz",
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.       
) -> int:
    from io import StringIO
    import os
    from pathlib import Path
    import sh  # type: ignore
    import sys
    import typing

    from kgtk.cli_entry import progress_startup
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def python_sort() -> int:
        """Sort the input file in memory and write it to the output file.

        The sort key of a row is the values of the key columns joined with
        KgtkFormat.KEY_FIELD_SEPARATOR (converted to float when numeric_sort
        is set).  Rows sharing a key are written in their input order.  Key
        columns come from ``columns`` (names or 1-based column numbers,
        optionally comma-separated); when none are given they default to the
        id column for node files and to id (if present), node1, label and
        node2 for edge files.

        Returns:
            0, so that the enclosing run() returns an int on this path.

        Raises:
            KGTKException: For per-column numeric or reverse sorts, an invalid
                or unknown key column, an unknown file mode without explicit
                columns, or a numeric sort on more than one column.
        """
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric column sorts.'
            )

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support reverse column sorts.'
            )

        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        # Column numbers are 1-based.  Zero must be rejected
                        # too: it would become index -1 and silently sort on
                        # the last column.
                        sort_idx = int(column_name_2)
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                # Close the reader here as well, like the other error paths.
                kr.close()
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        if numeric_sort and len(key_idxs) > 1:
            # Do not leave the input reader open when refusing the request.
            kr.close()
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.'
            )

        # Rows grouped by sort key; each group keeps its rows in input order.
        lines: typing.MutableMapping[typing.Union[str, float],
                                     typing.List[typing.List[str]]] = dict()

        progress_startup()
        row_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            row_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                      for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            # There may be multiple rows with the same key.  Appending to the
            # per-key list makes this a stable sort.
            lines.setdefault(key, []).append(row)
        if verbose:
            # Report the rows read, not the number of distinct keys.
            print("\nRead %d data lines." % row_count,
                  file=error_file,
                  flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines.keys(), reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()
        return 0

    if pure_python:
        return python_sort()

    try:
        global header_read_fd
        global header_write_fd
        header_read_fd, header_write_fd = os.pipe()
        os.set_inheritable(header_write_fd, True)
        if verbose:
            print("header pipe: read_fd=%d write_fd=%d" %
                  (header_read_fd, header_write_fd),
                  file=error_file,
                  flush=True)

        global sortopt_read_fd
        global sortopt_write_fd
        sortopt_read_fd, sortopt_write_fd = os.pipe()
        os.set_inheritable(sortopt_read_fd, True)
        if verbose:
            print("sort options pipe: read_fd=%d write_fd=%d" %
                  (sortopt_read_fd, sortopt_write_fd),
                  file=error_file,
                  flush=True)

        locale_envar: str = "LC_ALL=%s" % locale if len(locale) > 0 else ""

        # Note: "read -u n", used below, is not supported by some shells.
        # bash and zsh support it.
        # ash, csh, dash, and tcsh do not.
        # The original standard Bourne shell, sh, does not.
        # ksh might do it, if the FD number is a single digit.
        cmd: str = "".join((
            "{ IFS= read -r header ; ",  # Read the header line
            " { printf \"%s\\n\" \"$header\" >&" + str(header_write_fd) +
            " ; } ; ",  # Send the header to Python
            " printf \"%s\\n\" \"$header\" ; ",  # Send the header to standard output (which may be redirected to a file, below).
            " IFS= read -u " + str(sortopt_read_fd) +
            " -r options ; ",  # Read the sort command options from Python.
            " %s %s -t '\t' $options ; } " % (
                locale_envar, sort_command
            ),  # Sort the remaining input lines using the options read from Python.
        ))
        if str(output_path) != "-":
            # Do we want to compress the output?
            output_suffix: str = output_path.suffix.lower()
            if output_suffix in [".gz", ".z"]:
                if verbose:
                    print("gzip output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + gzip_command + " -"

            elif output_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bzip2 output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + bzip2_command + " -z"

            elif output_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("xz output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + xz_command + " -z -"

            # Feed the sorted output to the named file.  Otherwise, the sorted
            # output goes to standard output without passing through Python.
            cmd += " > " + repr(str(output_path))

        if verbose:
            print("sort command: %s" % cmd, file=error_file, flush=True)

        global cat_proc
        cat_proc = None
        global cmd_proc
        cmd_proc = None

        def cat_done(cmd, success, exit_code):
            """Completion callback for the command that reads the input file.

            It is passed as ``_done=`` to the cat/gzip/bzip2/xz commands
            started by run().  The three arguments are supplied by the sh
            library and are not used here.  Once the input has been read, the
            callback tries to find the pid of the sort command with pgrep and
            hands that pid to progress_startup().  Every failure along the
            way is reported only when verbose is set, and the callback then
            returns without starting any monitoring.
            """
            # When the cat command finishes, monitor the progress of the sort command.
            if verbose:
                print("\nDone reading the input file",
                      file=error_file,
                      flush=True)
            # cmd_proc is a module-level global that run() resets to None
            # before starting any command; without it there is no process
            # group to search.
            if cmd_proc is None:
                return

            # Locate the sort command using pgrep
            # (-g limits the search to cmd_proc's process group and --newest
            # selects the most recently started match, per pgrep's options).
            buf = StringIO()
            try:
                sh_pgrep = sh.Command(pgrep_command)
                sh_pgrep("-g",
                         cmd_proc.pgid,
                         "--newest",
                         sort_command,
                         _out=buf)
                pgrep_output = buf.getvalue()
                if len(pgrep_output) == 0:
                    if verbose:
                        print("Unable to locate the sort command.",
                              file=error_file,
                              flush=True)
                    return
                # int() raises ValueError when the output is not a single
                # integer; that is handled by the except clause below.
                sort_pid = int(pgrep_output)
            except Exception as e:
                # Monitoring is optional: report the problem and carry on.
                if verbose:
                    print("Exception looking for sort command: %s" % str(e),
                          file=error_file,
                          flush=True)
                return
            finally:
                # Runs on every path out of the try block, including the
                # early returns above.
                buf.close()

            if verbose:
                print("Monitoring the sort command (pid=%d)" % sort_pid,
                      file=error_file,
                      flush=True)
            # NOTE(review): presumably attaches the progress monitor to the
            # sort process - confirm against kgtk.cli_entry.progress_startup.
            progress_startup(pid=sort_pid)

        if str(input_path) == "-":
            # Read from standard input.
            #
            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash("-c",
                               cmd,
                               _in=sys.stdin,
                               _out=sys.stdout,
                               _err=sys.stderr,
                               _bg=True,
                               _bg_exc=False,
                               _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})

            # It would be nice to monitor the sort command here.  Unfortunately, there
            # is a race condition that makes this difficult.  We could loop until the
            # sort command is created, then monitor it.

        else:
            # Feed the named file into the data processing pipeline,
            input_suffix: str = input_path.suffix.lower()
            if input_suffix in [".gz", ".z"]:
                if verbose:
                    print("gunzip input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_gzip = sh.Command(gzip_command)
                cat_proc = sh_gzip(input_path,
                                   "-dc",
                                   _in=sys.stdin,
                                   _piped=True,
                                   _err=sys.stderr,
                                   _bg=True,
                                   _bg_exc=False,
                                   _internal_bufsize=1,
                                   _done=cat_done)

                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (gzip_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            elif input_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bunzip2 input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_bzip2 = sh.Command(bzip2_command)
                cat_proc = sh_bzip2(input_path,
                                    "-dc",
                                    _in=sys.stdin,
                                    _piped=True,
                                    _err=sys.stderr,
                                    _bg=True,
                                    _bg_exc=False,
                                    _internal_bufsize=1,
                                    _done=cat_done)

                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (bzip2_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            elif input_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("unxz input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_xz = sh.Command(xz_command)
                cat_proc = sh_xz(input_path,
                                 "-dc",
                                 _in=sys.stdin,
                                 _piped=True,
                                 _err=sys.stderr,
                                 _bg=True,
                                 _bg_exc=False,
                                 _internal_bufsize=1,
                                 _done=cat_done)
                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (xz_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            else:
                if verbose:
                    print("input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                cat_proc = sh.cat(input_path,
                                  _in=sys.stdin,
                                  _piped=True,
                                  _err=sys.stderr,
                                  _bg=True,
                                  _bg_exc=False,
                                  _internal_bufsize=1,
                                  _done=cat_done)
                if verbose:
                    print("full command: cat %s | %s" %
                          (repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            # If enabled, monitor the progress of reading the input file.
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Monitoring the cat command (pid=%d)." % cat_proc.pid,
                      file=error_file,
                      flush=True)
            progress_startup(pid=cat_proc.pid)

            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash(cat_proc,
                               "-c",
                               cmd,
                               _out=sys.stdout,
                               _err=sys.stderr,
                               _bg=True,
                               _bg_exc=False,
                               _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.

        if verbose:
            print("Running the sort script (pid=%d)." % cmd_proc.pid,
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the KGTK input file header line with KgtkReader",
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            Path("<%d" % header_read_fd),
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        if verbose:
            print("KGTK header: %s" % " ".join(kr.column_names),
                  file=error_file,
                  flush=True)

        sort_options: str = ""
        if reverse_sort:
            sort_options += " --reverse"
        if numeric_sort:
            sort_options += " --numeric"

        if extra is not None and len(extra) > 0:
            sort_options += " " + extra

        # We will consume entries in reverse_columns and numeric_columns,
        # then complain if any are left over.
        if reverse_columns is not None:
            reverse_columns = reverse_columns[:]  # Protect against modifying a shared list.
        if numeric_columns is not None:
            numeric_columns = numeric_columns[:]  # Protect against modifying a shared list.

        column_name: str
        sort_idx: int
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        if sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                repr(column_name_2))
                        sort_idx = kr.column_name_map[column_name_2] + 1
                    sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                    if reverse_columns is not None and column_name_2 in reverse_columns:
                        sort_options += "r"
                        reverse_columns.remove(column_name_2)
                    if numeric_columns is not None and column_name_2 in numeric_columns:
                        sort_options += "n"
                        numeric_columns.remove(column_name_2)
        else:
            # TODO: support the case where the column name in reverse_columns
            # or numeric_columns is an alias of the name used in the file header.
            if kr.is_node_file:
                sort_idx = kr.id_column_idx + 1
                sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                column_name = kr.column_names[kr.id_column_idx]
                if reverse_columns is not None and column_name in reverse_columns:
                    sort_options += "r"
                    reverse_columns.remove(column_name)
                if numeric_columns is not None and column_name in numeric_columns:
                    sort_options += "n"
                    numeric_columns.remove(column_name)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    sort_idx = kr.id_column_idx + 1
                    sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                    column_name = kr.column_names[kr.id_column_idx]
                    if reverse_columns is not None and column_name in reverse_columns:
                        sort_options += "r"
                        reverse_columns.remove(column_name)
                    if numeric_columns is not None and column_name in numeric_columns:
                        sort_options += "n"
                        numeric_columns.remove(column_name)

                sort_idx = kr.node1_column_idx + 1
                sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                column_name = kr.column_names[kr.node1_column_idx]
                if reverse_columns is not None and column_name in reverse_columns:
                    sort_options += "r"
                    reverse_columns.remove(column_name)
                if numeric_columns is not None and column_name in numeric_columns:
                    sort_options += "n"
                    numeric_columns.remove(column_name)

                sort_idx = kr.label_column_idx + 1
                sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                column_name = kr.column_names[kr.label_column_idx]
                if reverse_columns is not None and column_name in reverse_columns:
                    sort_options += "r"
                    reverse_columns.remove(column_name)
                if numeric_columns is not None and column_name in numeric_columns:
                    sort_options += "n"
                    numeric_columns.remove(column_name)

                sort_idx = kr.node2_column_idx + 1
                sort_options += " -k %d,%d" % (sort_idx, sort_idx)
                column_name = kr.column_names[kr.node2_column_idx]
                if reverse_columns is not None and column_name in reverse_columns:
                    sort_options += "r"
                    reverse_columns.remove(column_name)
                if numeric_columns is not None and column_name in numeric_columns:
                    numeric_columns.remove(column_name)
                    sort_options += "n"

            else:
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        # Check for unconsumed entries in reverse_columns and numeric_columns:
        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException("Unknown reverse column(s) %s" % " ".join(
                [repr(column_name) for column_name in reverse_columns]))
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException("Unknown numeric column(s) %s" % " ".join(
                [repr(column_name) for column_name in numeric_columns]))

        if verbose:
            print("sort options: %s" % sort_options,
                  file=error_file,
                  flush=True)

        kr.close()  # We are done with the KgtkReader now.

        # Send the sort options back to the data processing pipeline.
        with open(sortopt_write_fd, "w") as options_file:
            options_file.write(sort_options + "\n")

        if verbose:
            print("\nWaiting for the sort command to complete.\n",
                  file=error_file,
                  flush=True)
        cmd_proc.wait()

        if verbose:
            print("Cleanup.", file=error_file, flush=True)
        cleanup()

        return 0

    except Exception as e:
        # import traceback
        # traceback.print_tb(sys.exc_info()[2], 10)
        raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' +
                            str(e) + '\n')
示例#18
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Rewrite the rows of a KGTK input file as (node1, label, node2) edges.

    For every input row, the value of the ID column becomes node1.  Each
    other column (restricted to `columns` when that is supplied) yields one
    edge per non-empty item of its value: the label is the column name, or
    the matching entry of `labels`, and node2 is the item.  Column values
    are split with KgtkValue.split_list, so a list value yields several
    edges.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        columns: Names of the columns to convert; None converts every
            column except the ID column.
        labels: Edge labels, positionally paired with `columns`.
        id_column_name: Name passed to KgtkReader.get_id_column_index().
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and the writer.
        **kwargs: Source for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success; 1 after an exception raised during processing has been
        handed to kgtk_exception_auto_handler (assuming the handler returns).

    Raises:
        KGTKException: If `labels` is given without `columns`, or if the two
            lists differ in length.  These checks run before the handled
            section, so they propagate to the caller.
    """
    # import modules locally
    import contextlib
    import os

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)

        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file,
              flush=True)

    # Map each selected column name to the label that replaces it.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." %
                                (len(columns), len(labels)))
        label_map = dict(zip(columns, labels))

    try:
        # Everything registered on the stack is closed when the block is
        # left, on success and on error alike, most recently opened first.
        # A failure while closing is still seen by the handler below.
        with contextlib.ExitStack() as cleanup:
            if verbose:
                print("Opening the input file: %s" % str(input_kgtk_file),
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                options=reader_options,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            id_column_idx: int = kr.get_id_column_index(id_column_name)
            if id_column_idx < 0:
                raise KGTKException("Unknown ID column %s" %
                                    repr(id_column_name))

            output_column_names: typing.List[str] = [
                KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
            ]

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file,
                      flush=True)
            kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            # The set of columns to convert and their labels depend only on
            # the header, so work them out once instead of once per row.
            wanted: typing.Optional[typing.Set[str]] = (
                None if columns is None else set(columns))
            edge_columns: typing.List[typing.Tuple[int, str]] = [
                (column_idx, label_map.get(column_name, column_name))
                for column_idx, column_name in enumerate(kr.column_names)
                if column_idx != id_column_idx and (
                    wanted is None or column_name in wanted)
            ]

            input_line_count: int = 0
            output_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                node1_value: str = row[id_column_idx]

                column_idx: int
                label_value: str
                for column_idx, label_value in edge_columns:
                    new_value: str = row[column_idx]
                    if len(new_value) == 0:
                        continue  # ignore empty values.

                    # The column value might contain a KGTK list.  Since
                    # node2 isn't supposed to contain lists, we'll split it.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(new_value):
                        if len(node2_value) == 0:
                            continue  # node2 shouldn't contain empty values

                        kw.write([node1_value, label_value, node2_value])
                        output_line_count += 1

            if verbose:
                print("Read %d node rows, wrote %d edge rows." %
                      (input_line_count, output_line_count),
                      file=error_file,
                      flush=True)

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
示例#19
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file as a graph, log statistics, and write KGTK edges.

    The input is loaded with load_graph_from_kgtk using the node1 and node2
    columns as edge endpoints.  A summary (graph size, top relations and,
    when requested, degree statistics, PageRank and HITS) is written to
    `log_file`.  The output file receives the input edges (unless
    `statistics_only` is set) followed by one edge per vertex for the in
    degree, the out degree and every other vertex property of the graph.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        undirected: Load the graph as undirected when True.
        compute_degrees: Log degree statistics for in, out and total degree.
        compute_pagerank: Compute PageRank and log the `top_n` vertices.
        compute_hits: Compute HITS and log the `top_n` hubs and authorities.
        log_file: Path of the statistics log; it is opened for writing.
        statistics_only: Skip copying the input edges to the output.
        vertex_in_degree: Label of the emitted in-degree edges.
        vertex_out_degree: Label of the emitted out-degree edges.
        vertex_pagerank: Label of the emitted PageRank edges.
        vertex_auth: Label of the emitted HITS authority edges.
        vertex_hubs: Label of the emitted HITS hub edges.
        top_n: Number of top vertices requested from get_topn_indices.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and the writer.
        **kwargs: Source for KgtkReaderOptions and KgtkValueOptions.

    Raises:
        KGTKException: Wraps any exception raised during processing, with
            the message prefixed by 'Error: '.
    """
    # import modules locally
    import contextlib
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the graph's vertex property names to the caller's output labels.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # The reader and the writer are registered on the stack as soon as
        # they are opened, so both are closed (writer first) whether the
        # block finishes normally or an exception escapes it.
        with contextlib.ExitStack() as cleanup:

            # Select where to send error messages, defaulting to stderr.
            error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

            # Build the option structures.
            reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
                kwargs)
            value_options: KgtkValueOptions = KgtkValueOptions.from_dict(
                kwargs)

            input_kgtk_file: Path = KGTKArgumentParser.get_input_file(
                input_file)
            output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                output_file)

            # hardcoded values useful for the script. Perhaps some of them
            # should be exposed as arguments later
            directions = ['in', 'out', 'total']
            id_col = 'name'
            output_columns = ["node1", "label", "node2", "id"]

            if verbose:
                print('loading the KGTK input file...\n',
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            # Report every missing required column before giving up.
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)

            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' %
                             (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(
                            G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                            G2, direction)
                        writer.write(
                            '%s degree stats: mean=%f, std=%f, max=%d\n' %
                            (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    # Stored on the graph so it is emitted with the other
                    # vertex properties below.
                    G2.properties[('v', 'vertex_pagerank')] = v_pr
                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(
                        G2, 'vertex_pagerank', top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    # The first element of the result is not used here.
                    _, G2.vp['vertex_hubs'], G2.vp[
                        'vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(
                        G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(
                        G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, authority))

            # Copy the input edges, giving each one a generated id.
            id_count = 0
            if not statistics_only:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    node1 = G2.vp[id_col][sid]
                    kw.write([
                        node1, lbl, G2.vp[id_col][oid],
                        '{}-{}-{}'.format(node1, lbl, id_count)
                    ])
                    id_count += 1

            # Emit the per-vertex edges; the id counter restarts at zero.
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]
                kw.write([
                    v_id, vertex_in_degree,
                    str(v.in_degree()),
                    '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
                ])
                id_count += 1
                kw.write([
                    v_id, vertex_out_degree,
                    str(v.out_degree()),
                    '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
                ])
                id_count += 1

                # NOTE(review): a vertex property other than id_col that is
                # missing from v_prop_dict raises KeyError here; presumably
                # the loader only creates id_col -- confirm.
                for vprop in G2.vertex_properties.keys():
                    if vprop == id_col:
                        continue
                    kw.write([
                        v_id, v_prop_dict[vprop],
                        str(G2.vp[vprop][v]),
                        '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                    ])
                    id_count += 1

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
示例#20
0
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate a KGTK file against property patterns read from a pattern file.

    The patterns are loaded with PropertyPatterns.load, a
    PropertyPatternValidator is built over the input reader, and
    ppv.process() is run with the optional output and reject writers.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        pattern_file: Pattern file argument; it is read in edge mode and
            does not default to stdin.
        output_file: Optional output file argument.
        reject_file: Optional reject file argument.
        grouped_input: Passed to the validator; shown as --presorted.
        reject_node1_groups: Passed to the validator.
        no_complaints: Passed to the validator.
        complain_immediately: Passed to the validator.
        add_isa_column: When an output file is given, make sure the output
            header contains `isa_column_name`, appending it if needed.
        isa_column_name: Name of the ISA column.
        autovalidate: Passed to the validator.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Print the effective options before processing.
        verbose: Print progress messages and row counts.
        very_verbose: Passed through to the readers, writers and validator.
        **kwargs: Source for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: Wraps any exception raised during processing.
    """
    # import modules locally
    import contextlib
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so the dump never mixes with data that
    # may be written to stdout.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups),
              file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately),
              file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file),
              file=error_file,
              flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file),
                  file=error_file,
                  flush=True)

    try:
        # Writers registered on the stack are closed when the block is left,
        # so a failure during validation no longer leaves them open.
        with contextlib.ExitStack() as cleanup:
            pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                              error_file=error_file,
                                              mode=KgtkReaderMode.EDGE,
                                              options=reader_options,
                                              value_options=value_options,
                                              verbose=verbose,
                                              very_verbose=very_verbose)

            pps: PropertyPatterns = PropertyPatterns.load(
                pkr,
                value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose)

            kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                             error_file=error_file,
                                             options=reader_options,
                                             value_options=value_options,
                                             verbose=verbose,
                                             very_verbose=very_verbose)

            # NOTE(review): the header is only built when an output file is
            # given, so a reject file on its own is opened with an empty
            # column list -- confirm that this is intended.
            output_column_names: typing.List[str] = []
            isa_column_idx: int = -1
            if output_kgtk_file is not None:
                output_column_names = kr.column_names.copy()
                if add_isa_column:
                    if isa_column_name in output_column_names:
                        isa_column_idx = output_column_names.index(
                            isa_column_name)
                    else:
                        isa_column_idx = len(output_column_names)
                        output_column_names.append(isa_column_name)

            ppv: PropertyPatternValidator = PropertyPatternValidator.new(
                pps,
                kr,
                grouped_input=grouped_input,
                reject_node1_groups=reject_node1_groups,
                no_complaints=no_complaints,
                complain_immediately=complain_immediately,
                isa_column_idx=isa_column_idx,
                autovalidate=autovalidate,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose)

            kw: typing.Optional[KgtkWriter] = None
            if output_kgtk_file is not None:
                kw = KgtkWriter.open(output_column_names,
                                     output_kgtk_file,
                                     verbose=verbose,
                                     very_verbose=very_verbose)
                cleanup.callback(kw.close)

            rkw: typing.Optional[KgtkWriter] = None
            if reject_kgtk_file is not None:
                rkw = KgtkWriter.open(output_column_names,
                                      reject_kgtk_file,
                                      verbose=verbose,
                                      very_verbose=very_verbose)
                cleanup.callback(rkw.close)

            ppv.process(kr, kw, rkw)

            if verbose:
                print("Read %d rows, %d valid" %
                      (ppv.input_row_count, ppv.valid_row_count),
                      file=error_file,
                      flush=True)
                if kw is not None:
                    print("Wrote %d good rows" % ppv.output_row_count,
                          file=error_file,
                          flush=True)
                if rkw is not None:
                    print("Wrote %d rejected rows" % ppv.reject_row_count,
                          file=error_file,
                          flush=True)

        return 0

    except Exception as e:
        raise KGTKException(e) from e
示例#21
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        namespace_file: KGTKFiles,
        updated_namespace_file: KGTKFiles,
        namespace_id_prefix: str,
        namespace_id_use_uuid: bool,
        namespace_id_counter: int,
        namespace_id_zfill: int,
        output_only_used_namespaces: bool,
        allow_lax_uri: bool,
        local_namespace_prefix: str,
        local_namespace_use_uuid: bool,
        prefix_expansion_label: str,
        structured_value_label: str,
        structured_uri_label: str,
        newnode_prefix: str,
        newnode_use_uuid: bool,
        newnode_counter: int,
        newnode_zfill: int,
        build_id: bool,
        escape_pipes: bool,
        validate: bool,
        override_uuid: typing.Optional[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkNtriples instance from the command arguments and run it.

    The file arguments are resolved through KGTKArgumentParser, the
    id-builder, reader and value option structures are built from
    ``kwargs``, and every other parameter is forwarded unchanged to
    ``KgtkNtriples``.  When ``show_options`` is true the effective options
    are written to the error stream first.

    ``errors_to_stderr`` is accepted but never read here; the error stream
    is stdout when ``errors_to_stdout`` is true and stderr otherwise.

    Returns:
        0 once ``KgtkNtriples.process()`` has completed.

    Raises:
        KGTKException: for a SystemExit ("Exit requested") or any other
            Exception raised while constructing or running the processor.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the required and optional file arguments.
    input_file_paths: typing.List[Path] = \
        KGTKArgumentParser.get_input_file_list(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_file_path: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")
    namespace_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_input_file(
            namespace_file, who="KGTK namespace file")
    updated_namespace_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            updated_namespace_file, who="KGTK updated namespace file")

    # Error messages go to stderr unless stdout was requested.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = \
        KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def report(template, argument):
        # One option line, flushed immediately, on the error stream.
        print(template % argument, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if show_options:
        report("--input-files %s",
               " ".join([str(path) for path in input_file_paths]))
        report("--output-file=%s", str(output_kgtk_file))
        if reject_file_path is not None:
            report("--reject-file=%s", str(reject_file_path))
        if namespace_kgtk_file is not None:
            report("--namespace-file=%s", str(namespace_kgtk_file))
        if updated_namespace_kgtk_file is not None:
            report("--updated-namespace-file=%s",
                   str(updated_namespace_kgtk_file))

        report("--namespace-id-prefix %s", namespace_id_prefix)
        report("--namespace-id-use-uuid %s", str(namespace_id_use_uuid))
        report("--namespace-id-counter %s", str(namespace_id_counter))
        report("--namespace-id-zfill %s", str(namespace_id_zfill))
        report("--output-only-used-namespaces %s",
               str(output_only_used_namespaces))

        report("--allow-lax-uri %s", str(allow_lax_uri))

        report("--local-namespace-prefix %s", local_namespace_prefix)
        report("--local-namespace-use-uuid %s", str(local_namespace_use_uuid))

        report("--prefix-expansion-label %s", prefix_expansion_label)
        report("--structured-value-label %s", structured_value_label)
        report("--structured-uri-label %s", structured_uri_label)

        report("--newnode-prefix %s", newnode_prefix)
        report("--newnode-use-uuid %s", str(newnode_use_uuid))
        report("--newnode-counter %s", str(newnode_counter))
        report("--newnode-zfill %s", str(newnode_zfill))

        report("--build-id=%s", str(build_id))
        report("--escape-pipes=%s", str(escape_pipes))
        report("--validate=%s", str(validate))
        report("--override-uuid=%s", str(override_uuid))

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        importer: KgtkNtriples = KgtkNtriples(
            input_file_paths=input_file_paths,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_file_path,
            updated_namespace_file_path=updated_namespace_kgtk_file,
            namespace_file_path=namespace_kgtk_file,
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)
        importer.process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
示例#22
0
def run(input_files: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],

        output_column_names: typing.Optional[typing.List[str]],
        old_column_names: typing.Optional[typing.List[str]],
        new_column_names: typing.Optional[typing.List[str]],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Validate the column options, then build a KgtkCat and run it.

    The input and output paths are resolved through KGTKArgumentParser and
    the reader/value option structures are built from ``kwargs``.  When
    ``show_options`` is true the effective options are written to the error
    stream.  ``errors_to_stderr`` is accepted but never read here.

    Option consistency rules enforced before processing:
      * ``output_column_names`` excludes ``old_column_names`` and
        ``new_column_names``;
      * ``old_column_names`` and ``new_column_names`` must be given together
        and have the same length.

    Returns:
        0 once ``KgtkCat.process()`` has completed.

    Raises:
        KGTKException: for inconsistent column options, for a SystemExit
            ("Exit requested"), or for any other Exception raised while
            constructing or running KgtkCat.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    from kgtk.join.kgtkcat import KgtkCat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_file_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_files)
    output_file_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # TODO: check that at most one input file is stdin?

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files %s" % " ".join((str(input_file_path) for input_file_path in input_file_paths)), file=error_file, flush=True)
        print("--output-file=%s" % str(output_file_path), file=error_file, flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format, file=error_file, flush=True)
        if output_column_names is not None:
            # The option is spelled --output-columns (see the error message below).
            print("--output-columns %s" % " ".join(output_column_names), file=error_file, flush=True)
        if old_column_names is not None:
            print("--old-columns %s" % " ".join(old_column_names), file=error_file, flush=True)
        if new_column_names is not None:
            print("--new-columns %s" % " ".join(new_column_names), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Check for consistent options.  argparse doesn't support this yet.
    # None and an empty list both count as "option not supplied".
    have_output_columns: bool = bool(output_column_names)
    have_old_columns: bool = bool(old_column_names)
    have_new_columns: bool = bool(new_column_names)

    if have_output_columns:
        if have_old_columns or have_new_columns:
            raise KGTKException("When --output-columns is used, --old-columns and --new-columns may not be used.")
    elif have_old_columns != have_new_columns:
        raise KGTKException("Both --old-columns and --new-columns must be used when either is used.")
    elif have_old_columns and len(old_column_names) != len(new_column_names):
        raise KGTKException("Both --old-columns and --new-columns must have the same number of columns.")

    try:
        kc: KgtkCat = KgtkCat(input_file_paths=input_file_paths,
                              output_path=output_file_path,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              reader_options=reader_options,
                              value_options=value_options,
                              error_file=error_file,
                              verbose=verbose,
                              very_verbose=very_verbose
        )

        kc.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original traceback attached to the wrapper exception.
        raise KGTKException(str(e)) from e
示例#23
0
def run(input_file: KGTKFiles,
        output_path: str,
        file_prefix: str,
        split_by_qnode: bool,
        lines: int,
        gzipped_output: bool,
        errors_to_stdout: bool = False,
        **kwargs) -> int:
    """Split an edge file into several files, keeping each node1 group together.

    Only rows whose node1 value starts with 'Q' or 'P' are kept.  Rows are
    buffered and flushed to a new file whenever node1 changes (as decided by
    ``are_nodes_equal``) and either ``split_by_qnode`` is true or at least
    ``lines`` rows are buffered.  Output files are written under
    ``output_path`` and named after the node (``split_by_qnode``) or
    ``file_prefix`` plus a running file number, with suffix ``.tsv.gz`` when
    ``gzipped_output`` is true and ``.tsv`` otherwise.

    Returns:
        0 on success, including when no row qualified for output.

    Raises:
        KGTKException: when the input lacks a node1, label or node2 column.
    """
    import sys
    from pathlib import Path
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    def write_files(error_file, file_number, file_prefix, kr, lines_to_write,
                    output_path, Qnode, reader_options, split_by_qnode,
                    suffix):
        # Write one batch of buffered rows to its own output file.
        if split_by_qnode:
            output_kgtk_file = Path(f'{output_path}/{Qnode}{suffix}')
        else:
            output_kgtk_file = Path(
                f'{output_path}/{file_prefix}{file_number}{suffix}')
        kw = KgtkWriter.open(
            kr.column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode[kr.mode.name],
            use_mgzip=reader_options.use_mgzip,  # Hack!
            mgzip_threads=reader_options.mgzip_threads,  # Hack!
            error_file=error_file,
            verbose=False,
            very_verbose=False)
        try:
            for r in lines_to_write:
                kw.write(r)
        finally:
            # Release the writer even when a write fails part-way.
            kw.close()

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)

    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr
    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    suffix = ".tsv.gz" if gzipped_output else ".tsv"

    kr: KgtkReader = KgtkReader.open(
        input_kgtk_file,
        options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=False,
        very_verbose=False,
    )

    node1_idx: int = kr.get_node1_column_index()
    label_idx: int = kr.get_label_column_index()
    node2_idx: int = kr.get_node2_column_index()

    if node1_idx < 0 or label_idx < 0 or node2_idx < 0:
        # Report the resolved path rather than the raw KGTKFiles argument.
        print(
            "Error: Not a valid file: {}. A valid edge file should have these columns: node1, label and node2"
            .format(str(input_kgtk_file)),
            file=error_file,
            flush=True)
        kr.close()
        raise KGTKException("Missing columns.")

    prev = None
    lines_to_write = []
    file_number = 0

    for row in kr:
        node = row[node1_idx]
        if not node.startswith(('Q', 'P')):
            continue

        if prev is None:
            prev = node

        if not are_nodes_equal(prev, node):
            # Only split on a node boundary so a node's rows stay together.
            if split_by_qnode or len(lines_to_write) >= lines:
                write_files(error_file, file_number, file_prefix, kr,
                            lines_to_write, output_path, prev,
                            reader_options, split_by_qnode, suffix)

                lines_to_write = []
                file_number += 1

            prev = node

        lines_to_write.append(row)

    # Flush whatever is still buffered after the last node boundary.
    if lines_to_write:
        write_files(error_file, file_number, file_prefix, kr, lines_to_write,
                    output_path, prev, reader_options, split_by_qnode, suffix)

    # Always report success; previously an empty result fell through and
    # returned None despite the declared int return type.
    return 0
示例#24
0
def main(**kwargs):
    """Run one EmbeddingVector pass per requested model over the input file.

    All settings are read from ``kwargs``.  For every name in
    ``all_models_names`` an ``EmbeddingVector`` is created, the input file is
    read with the configured sentence properties, vectors are computed with
    ``get_vectors()`` and results are emitted through ``plot_result()``.

    When ``_debug`` is true, logging is configured at DEBUG level into a
    time-stamped ``kgtk_text_embedding_log_*.log`` file in the user's home
    directory.  The home directory is also the default ``output_uri``.

    Raises:
        KGTKException: when no model name is given, or wrapping any other
            Exception raised while loading inputs or running a model.
    """
    from kgtk.exceptions import KGTKException
    import logging
    import os
    from time import strftime

    # $HOME may be unset (e.g. on Windows); fall back to expanduser so that
    # neither the log path nor the default output location ends up as None.
    home_dir = os.environ.get("HOME") or os.path.expanduser("~")

    do_logging = kwargs.get("_debug", False)
    if do_logging:
        logging_level_class = logging.DEBUG
        logger_path = os.path.join(
            home_dir, "kgtk_text_embedding_log_{}.log".format(
                strftime("%Y-%m-%d-%H-%M")))
        logging.basicConfig(
            level=logging_level_class,
            format=
            "%(asctime)s [%(levelname)s] %(name)s %(lineno)d -- %(message)s",
            datefmt='%m-%d %H:%M:%S',
            filename=logger_path,
            filemode='w')

    _logger = logging.getLogger(__name__)
    _logger.warning("Running with logging level {}".format(
        _logger.getEffectiveLevel()))

    try:
        import pandas as pd
        from pathlib import Path
        from kgtk.gt.embedding_utils import EmbeddingVector

        # get input parameters from kwargs
        output_uri = kwargs.get("output_uri", "")
        parallel_count = kwargs.get("parallel_count", "1")
        black_list_files = kwargs.get("black_list_files", [])
        all_models_names = kwargs.get(
            "all_models_names", ['bert-base-wikipedia-sections-mean-tokens'])
        data_format = kwargs.get("data_format", "kgtk_format")
        output_format = kwargs.get("output_data_format", "kgtk_format")
        property_labels_files = kwargs.get("property_labels_file_uri", [])
        property_labels_filter = kwargs.get("property_labels_filter", [])
        query_server = kwargs.get("query_server")
        save_embedding_sentence = kwargs.get("save_embedding_sentence", False)

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if kwargs.get(
            "errors_to_stdout") else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        verbose: bool = kwargs.get("verbose")

        input_file_path: Path = KGTKArgumentParser.get_input_file(
            kwargs.get("input_file"))

        cache_config = {
            "use_cache": kwargs.get("use_cache", False),
            "host": kwargs.get("cache_host", "dsbox01.isi.edu"),
            "port": kwargs.get("cache_port", 6379)
        }
        property_values = kwargs.get("property_values", [])
        if kwargs.get("property_values_file") is not None:
            # TODO: Use KgtkReader to read this file.
            property_values_df = pd.read_csv(
                kwargs.get("property_values_file"), sep='\t')
            # The distinct node1 values of the file replace property_values.
            property_values = list(property_values_df['node1'].unique())
        sentence_properties = {
            "label_properties":
            kwargs.get("label_properties", ["label"]),
            "description_properties":
            kwargs.get("description_properties", ["description"]),
            "isa_properties":
            kwargs.get("isa_properties", ["P31"]),
            "has_properties":
            kwargs.get("has_properties", ["all"]),
            "property_values":
            property_values
        }

        output_properties = {
            "metadata_properties": kwargs.get("metadata_properties", []),
            "output_properties": kwargs.get("output_properties",
                                            "text_embedding")
        }
        # A single model name may be passed as a bare string.
        if isinstance(all_models_names, str):
            all_models_names = [all_models_names]
        if len(all_models_names) == 0:
            raise KGTKException("No embedding vector model name given!")

        if output_uri == "":
            output_uri = home_dir
        if black_list_files:
            black_list_set = load_black_list_files(black_list_files)
        else:
            black_list_set = set()
        if property_labels_files:
            property_labels_dict = load_property_labels_file(
                property_labels_files,
                error_file,
                reader_options,
                value_options,
                label_filter=property_labels_filter,
                verbose=verbose)
            _logger.info("Totally {} property labels loaded.".format(
                len(property_labels_dict)))
        else:
            property_labels_dict = {}

        dimensional_reduction = kwargs.get("dimensional_reduction", "none")
        dimension_val = kwargs.get("dimension_val", 2)

        for each_model_name in all_models_names:
            _logger.info("Running {} model on {}".format(
                each_model_name, str(input_file_path)))
            process = EmbeddingVector(each_model_name,
                                      query_server=query_server,
                                      cache_config=cache_config,
                                      parallel_count=parallel_count)
            process.read_input(input_file_path=input_file_path,
                               skip_nodes_set=black_list_set,
                               input_format=data_format,
                               target_properties=sentence_properties,
                               property_labels_dict=property_labels_dict,
                               error_file=error_file,
                               reader_options=reader_options,
                               value_options=value_options,
                               verbose=verbose)
            process.get_vectors()

            process.plot_result(
                output_properties=output_properties,
                input_format=data_format,
                output_uri=output_uri,
                dimensional_reduction=dimensional_reduction,
                dimension_val=dimension_val,
                output_format=output_format,
                save_embedding_sentence=save_embedding_sentence)
            _logger.info("*" * 20 + "finished" + "*" * 20)
    except Exception as e:
        # Record the full traceback in the debug log before wrapping.
        _logger.debug(e, exc_info=True)
        raise KGTKException(str(e)) from e
示例#25
0
def run(
        input_file: KGTKFiles,
        filter_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        matched_filter_file: KGTKFiles,
        unmatched_filter_file: KGTKFiles,
        join_file: KGTKFiles,
        input_keys: typing.Optional[typing.List[str]],
        filter_keys: typing.Optional[typing.List[str]],
        cache_input: bool = False,
        preserve_order: bool = False,
        presorted: bool = False,
        field_separator: typing.Optional[str] = None,
        left_join: bool = False,
        right_join: bool = False,
        input_prefix: typing.Optional[str] = None,
        filter_prefix: typing.Optional[str] = None,
        join_output: bool = False,
        right_first: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkIfExists instance (with ``invert=False``) and run it.

    File arguments are resolved through KGTKArgumentParser; the input and
    filter reader options are built separately from ``kwargs`` (with
    fallback), and ``field_separator`` defaults to
    ``KgtkIfExists.FIELD_SEPARATOR_DEFAULT`` when None.  When
    ``show_options`` is true the effective options are written to the error
    stream.  ``errors_to_stderr`` is accepted but never read here.

    Returns:
        0 once ``KgtkIfExists.process()`` has completed.

    Raises:
        KGTKException: when both the input and the filter file are stdin
            ("-"), for a SystemExit ("Exit requested"), or for any other
            Exception raised while constructing or running KgtkIfExists.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifexists import KgtkIfExists
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    filter_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        filter_file, who="KGTK filter file")
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")
    matched_filter_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            matched_filter_file, who="KGTK matched filter file")
    unmatched_filter_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            unmatched_filter_file, who="KGTK unmatched filter file")
    join_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            join_file, who="KGTK join file")

    # stdin can feed only one of the two inputs.
    if (str(input_kgtk_file) == "-" and str(filter_kgtk_file) == "-"):
        raise KGTKException(
            "May not use stdin for both --input-file and --filter-on files.")

    field_separator = KgtkIfExists.FIELD_SEPARATOR_DEFAULT if field_separator is None else field_separator

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="filter", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--filter-file=%s" % str(filter_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        if matched_filter_kgtk_file is not None:
            print("--matched-filter-file=%s" % str(matched_filter_kgtk_file),
                  file=error_file)
        if unmatched_filter_kgtk_file is not None:
            print("--unmatched-filter-file=%s" %
                  str(unmatched_filter_kgtk_file),
                  file=error_file)
        if join_kgtk_file is not None:
            print("--join-file=%s" % str(join_kgtk_file), file=error_file)
        if input_keys is not None:
            print("--input-keys=%s" % " ".join(input_keys), file=error_file)
        if filter_keys is not None:
            print("--filter-keys=%s" % " ".join(filter_keys), file=error_file)
        print("--cache-input=%s" % str(cache_input), file=error_file)
        print("--preserve-order=%s" % str(preserve_order), file=error_file)
        print("--presorted=%s" % str(presorted), file=error_file)
        print("--field-separator=%s" % repr(field_separator), file=error_file)
        print("--left-join=%s" % str(left_join), file=error_file)
        print("--right-join=%s" % str(right_join), file=error_file)
        if input_prefix is not None:
            print("--input-prefix=%s" % repr(input_prefix), file=error_file)
        if filter_prefix is not None:
            print("--filter-prefix=%s" % repr(filter_prefix), file=error_file)
        print("--join-output=%s" % str(join_output), file=error_file)
        print("--right-join-first=%s" % str(right_first), file=error_file)
        input_reader_options.show(out=error_file, who="input")
        filter_reader_options.show(out=error_file, who="filter")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ie: KgtkIfExists = KgtkIfExists(
            input_file_path=input_kgtk_file,
            input_keys=input_keys,
            filter_file_path=filter_kgtk_file,
            filter_keys=filter_keys,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            matched_filter_file_path=matched_filter_kgtk_file,
            unmatched_filter_file_path=unmatched_filter_kgtk_file,
            join_file_path=join_kgtk_file,
            left_join=left_join,
            right_join=right_join,
            input_prefix=input_prefix,
            filter_prefix=filter_prefix,
            join_output=join_output,
            right_first=right_first,
            invert=False,
            cache_input=cache_input,
            preserve_order=preserve_order,
            presorted=presorted,
            field_separator=field_separator,
            input_reader_options=input_reader_options,
            filter_reader_options=filter_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ie.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original traceback attached to the wrapper exception.
        raise KGTKException(str(e)) from e
示例#26
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        compact_id: bool,
        sorted_input: bool,
        verify_sort: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkCompact instance from the command arguments and run it.

    The input and output paths are resolved through KGTKArgumentParser and
    the id-builder, reader and value option structures are built from
    ``kwargs``.  When ``show_options`` is true the effective options are
    written to the error stream.  ``errors_to_stderr`` is accepted but never
    read here.

    Returns:
        0 once ``KgtkCompact.process()`` has completed.

    Raises:
        KGTKException: for a SystemExit ("Exit requested") or any other
            Exception raised while constructing or running KgtkCompact.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--compact-id=%s" % str(compact_id), file=error_file, flush=True)
        # Must go to error_file like every other option line: a bare print()
        # writes to stdout, which may be carrying the output data.
        print("--presorted=%s" % str(sorted_input),
              file=error_file,
              flush=True)
        print("--verify-sort=%s" % str(verify_sort),
              file=error_file,
              flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ex: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            compact_id=compact_id,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            output_file_path=output_kgtk_file,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ex.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original traceback attached to the wrapper exception.
        raise KGTKException(str(e)) from e
示例#27
0
文件: lift.py 项目: nicklein/kgtk
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        unmodified_row_file: KGTKFiles,
        matched_label_file: KGTKFiles,
        unmatched_label_file: KGTKFiles,

        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],

        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,

        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],

        default_value: str,

        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,

        clear_before_lift: bool = False,
        overwrite: bool = False,

        output_only_modified_rows: bool = False,

        languages: typing.Optional[typing.List[str]] = None,
        prioritize: bool = False,

        use_label_envar: bool = False,
        lift_all_columns: bool = False,
        require_label_file: bool = False,
        force_input_mode_none: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Command entry point: configure a KgtkLift and run it.

    The file arguments are resolved through KGTKArgumentParser, the reader
    and value option structures are built from ``kwargs``, and every other
    argument is forwarded unchanged to the KgtkLift constructor before
    ``process()`` is called.

    When no label file is given and ``use_label_envar`` is true, the label
    file path is taken from the KGTK_LABEL_FILE environment variable (if it
    is set).

    Diagnostics go to stdout when ``errors_to_stdout`` is true and to stderr
    otherwise; ``errors_to_stderr`` is accepted but not consulted here.
    When ``show_options`` is true the effective options are echoed to that
    stream before processing starts.

    Returns:
        0 on success.

    Raises:
        KGTKException: if ``require_label_file`` is true and no label file
            could be determined, or wrapping any exception (or SystemExit)
            raised while constructing or running KgtkLift.  The original
            exception is attached as ``__cause__``.
    """
    # import modules locally
    import os
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(label_file, who="KGTK label file")
    unmodified_row_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_row_file, who="KGTK unmodified row output file")
    matched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(matched_label_file, who="KGTK matched label output file")
    unmatched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmatched_label_file, who="KGTK unmatched label output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    label_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="label", fallback=True)
    # NOTE(review): reader_options is never used below; it is retained only in
    # case from_dict() validates kwargs -- confirm and remove if it does not.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if label_kgtk_file is not None:
            print("--label-file=%s" % label_kgtk_file, file=error_file, flush=True)
        if unmodified_row_kgtk_file is not None:
            print("--unmodified-row-output-file=%s" % unmodified_row_kgtk_file, file=error_file, flush=True)
        if matched_label_kgtk_file is not None:
            print("--matched-label-output-file=%s" % matched_label_kgtk_file, file=error_file, flush=True)
        if unmatched_label_kgtk_file is not None:
            print("--unmatched-label-output-file=%s" % unmatched_label_kgtk_file, file=error_file, flush=True)

        if input_select_column_name is not None:
            print("--input-select-column=%s" % input_select_column_name, file=error_file, flush=True)
        if input_select_column_value is not None:
            print("--input-select-value=%s" % input_select_column_value, file=error_file, flush=True)
        if input_lifting_column_names is not None and len(input_lifting_column_names) > 0:
            print("--columns-to-lift %s" % " ".join(input_lifting_column_names), file=error_file, flush=True)
        if output_lifted_column_names is not None and len(output_lifted_column_names) > 0:
            print("--columns-to-write %s" % " ".join(output_lifted_column_names), file=error_file, flush=True)

        print("--lift-suffix=%s" % output_lifted_column_suffix, file=error_file, flush=True)
        if output_select_column_value is not None:
            print("--update-select-value=%s" % output_select_column_value, file=error_file, flush=True)


        if label_select_column_name is not None:
            print("--label-select-column=%s" % label_select_column_name, file=error_file, flush=True)
        print("--label-select-value=%s" % label_select_column_value, file=error_file, flush=True)
        if label_match_column_name is not None:
            print("--label-match-column=%s" % label_match_column_name, file=error_file, flush=True)
        if label_value_column_name is not None:
            print("--label-value-column=%s" % label_value_column_name, file=error_file, flush=True)

        print("--default-value=%s" % repr(default_value), file=error_file, flush=True)
        print("--remove-label-records=%s" % repr(remove_label_records), file=error_file, flush=True)
        print("--sort-lifted-labels=%s" % repr(sort_lifted_labels), file=error_file, flush=True)
        print("--suppress-duplicate-labels=%s" % repr(suppress_duplicate_labels), file=error_file, flush=True)
        print("--suppress-empty-columns=%s" % repr(suppress_empty_columns), file=error_file, flush=True)
        print("--ok-if-no-labels=%s" % repr(ok_if_no_labels), file=error_file, flush=True)
        print("--prefilter-labels=%s" % repr(prefilter_labels), file=error_file, flush=True)
        print("--input-file-is-presorted=%s" % repr(input_is_presorted), file=error_file, flush=True)
        print("--label-file-is-presorted=%s" % repr(labels_are_presorted), file=error_file, flush=True)
        print("--clear-before-lift=%s" % repr(clear_before_lift), file=error_file, flush=True)
        print("--overwrite=%s" % repr(overwrite), file=error_file, flush=True)
        print("--output-only-modified-rows=%s" % repr(output_only_modified_rows), file=error_file, flush=True)

        if languages is not None:
            print("--languages %s" % " ".join(repr(l) for l in languages), file=error_file, flush=True)
        print("--prioritize=%s" % repr(prioritize), file=error_file, flush=True)

        print("--use-label-envar=%s" % repr(use_label_envar), file=error_file, flush=True)
        print("--lift-all-columns=%s" % repr(lift_all_columns), file=error_file, flush=True)
        print("--require-label-files=%s" % repr(require_label_file), file=error_file, flush=True)
        print("--force-input-mode-none=%s" % repr(force_input_mode_none), file=error_file, flush=True)
        input_reader_options.show(out=error_file, who="input")
        label_reader_options.show(out=error_file, who="label")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Should the following functionality be moved to KgtkLift?
    if label_kgtk_file is None and use_label_envar:
        label_file_envar: str = 'KGTK_LABEL_FILE' # TODO: Move this to a common file.
        label_file_envar_value: typing.Optional[str] = os.getenv(label_file_envar)
        if label_file_envar_value is not None:
            label_kgtk_file = Path(label_file_envar_value)
            if verbose:
                print("Using label file %s from envar %s" % (repr(label_file_envar_value), repr(label_file_envar)), file=error_file, flush=True)

    # This check is deliberately outside the try block below, so the message
    # reaches the caller without being re-wrapped.
    if require_label_file and label_kgtk_file is None:
        raise KGTKException("A label file must be specified using --label-file or KGTK_LABEL_FILE")

    try:
        kl: KgtkLift = KgtkLift(
            input_file_path=input_kgtk_file,
            label_file_path=label_kgtk_file,
            output_file_path=output_kgtk_file,
            unmodified_row_file_path=unmodified_row_kgtk_file,
            matched_label_file_path=matched_label_kgtk_file,
            unmatched_label_file_path=unmatched_label_kgtk_file,

            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,

            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,

            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,

            default_value=default_value,

            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,

            clear_before_lift=clear_before_lift,
            overwrite=overwrite,

            output_only_modified_rows=output_only_modified_rows,

            languages=languages,
            prioritize=prioritize,

            lift_all_columns=lift_all_columns,
            force_input_mode_none=force_input_mode_none,

            input_reader_options=input_reader_options,
            label_reader_options=label_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kl.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the command framework's exception,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Preserve the underlying traceback for debugging via __cause__.
        raise KGTKException(str(e)) from e
示例#28
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],
        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Command entry point: rewrite node1/label/node2 values using a mapping file.

    Phase 1 reads the mapping file and builds two rule tables:

    * item rules (``mapping_rule_mode == "same-as-item"`` or the mapping
      label equals ``same_as_item_label``), applied to the input node1 and
      node2 columns;
    * property rules (``mapping_rule_mode == "same-as-property"`` or the
      mapping label equals ``same_as_property_label``), applied to the
      input label column.

    A mapping row whose confidence is below ``confidence_threshold`` is
    excluded (and copied to the rejected mapping file when one was given).
    A row mapping a value to itself is excluded unless
    ``allow_idempotent_mapping`` is true.  A second rule for the same node1
    is an error unless it has the same target and ``allow_exact_duplicates``
    is true, in which case it is skipped.

    Phase 2 reads the input file and applies the rules.  ``modified_pattern``
    selects which combination of changed columns makes a row count as
    modified.  Modified rows are written to the output in rewritten form.
    Unmodified rows are written in their original form to the unmodified
    edges file (when given) and, unless ``split_output_mode`` is true, to the
    output file.  Mapping rows that were actually used are written to the
    activated mapping file (when given) in mapping-file line order.

    Diagnostics go to stdout when ``errors_to_stdout`` is true and to stderr
    otherwise; ``errors_to_stderr`` is accepted but not consulted here.

    Returns:
        0 on success.

    Raises:
        KGTKException: if ``default_confidence_str`` is not a valid float,
            if required columns are missing, if the mapping file contains
            errors or yields no rules, if ``modified_pattern`` is not
            recognized, or wrapping any other exception (or SystemExit)
            raised during processing.  The original exception is attached
            as ``__cause__``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)
        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError) as e:
            # Only conversion failures are reported as an invalid option;
            # KeyboardInterrupt and friends are no longer swallowed here.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str)) from e

    # Readers/writers that have been opened but not yet closed.  Anything
    # still listed here when the try block exits (i.e. an error interrupted
    # processing) is closed in the `finally` clause below.
    open_handles: typing.List[typing.Any] = []

    def close_handle(handle: typing.Any) -> None:
        """Close a reader/writer and stop tracking it."""
        # Identity comparison: do not rely on the handle's own __eq__.
        open_handles[:] = [h for h in open_handles if h is not handle]
        handle.close()

    try:

        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr:  KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                           options=mapping_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        open_handles.append(mkr)
        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The mapping label column is only consulted in "normal" mode.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            close_handle(mkr)
            raise KGTKException("Missing columns in the mapping file.")
        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            close_handle(mkr)
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(rmkw)

        # Mapping structures:
        #   *_map:      source value -> replacement value
        #   *_line_map: source value -> mapping file line that defined the rule
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        # Accepted mapping rows by line number, and the subset actually used.
        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]
            mapping_confidence: typing.Optional[float] = default_confidence_value
            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    # An empty confidence falls back to the default unless
                    # a confidence value is mandatory.
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number, repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # An exact duplicate is silently skipped when allowed;
                    # any other duplicate is an error.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue
                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue


        # Close the mapping file.
        close_handle(mkr)
        if rmkw is not None:
            close_handle(rmkw)

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)

        if verbose:
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr:  KgtkReader = KgtkReader.open(input_kgtk_file,
                                           options=input_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        open_handles.append(ikr)
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        # Only the columns that the selected rule mode rewrites are required.
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            close_handle(ikr)
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip, # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)
        open_handles.append(okw)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(uekw)

        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            # The mapping reader is already closed; only its column names
            # and mode are read here.
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(amkw)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0
        row: typing.List[str]
        for row in ikr:
            input_count +=1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    # Activated rules are only tracked when they will be written.
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # Decide whether this row counts as modified: "|" patterns need
            # any of the named columns changed, "&" patterns need all of them.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                # Rows that do not satisfy the pattern are emitted in their
                # original form, even if some column had a matching rule.
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                if not split_output_mode:
                    okw.write(row)

        # Done!
        close_handle(ikr)
        close_handle(okw)

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            close_handle(uekw)

        if amkw is not None:
            activated_count: int = 0
            # Emit the activated rules in mapping-file line order.
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            close_handle(amkw)

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit as e:
        # Convert an exit request into the command framework's exception,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Preserve the underlying traceback for debugging via __cause__.
        raise KGTKException(str(e)) from e
    finally:
        # On the success path every handle was already closed explicitly, so
        # this loop only runs after a failure.  Cleanup is best-effort so a
        # secondary close() error cannot mask the exception being reported.
        for leftover in list(open_handles):
            try:
                leftover.close()
            except Exception:
                pass
示例#29
0
def run(
        node_file: KGTKFiles,
        edge_file: KGTKFiles,
        qualifier_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run an ExportWikidata job over a node, an edge, and a qualifier file.

    The three input files and the output file are resolved through
    KGTKArgumentParser (none of the inputs may default to stdin), the
    reader and value option structures are built from ``kwargs``, and the
    work itself is delegated to ``ExportWikidata.process()``.

    Diagnostics go to stdout when ``errors_to_stdout`` is true and to
    stderr otherwise; ``errors_to_stderr`` is accepted but not consulted
    in this function.  When ``show_options`` is true the resolved file
    paths and option structures are echoed to the diagnostic stream first.

    Returns 0 on success.  A SystemExit or any other Exception raised
    while exporting is converted into a KGTKException.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.exports.exportwikidata import ExportWikidata
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the files in the same order as the command line arguments.
    nodes_path: Path = KGTKArgumentParser.get_input_file(
        node_file, who="KGTK node file", default_stdin=False)
    edges_path: Path = KGTKArgumentParser.get_input_file(
        edge_file, who="KGTK edge file", default_stdin=False)
    qualifiers_path: Path = KGTKArgumentParser.get_input_file(
        qualifier_file, who="KGTK qualifier file", default_stdin=False)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Error messages go to stderr unless stdout was explicitly requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        option_lines = [
            "--node-file=%s" % str(nodes_path),
            "--edge-file=%s" % str(edges_path),
            "--qualifier-file=%s" % str(qualifiers_path),
            "--output-file=%s" % str(output_path),
        ]
        for option_line in option_lines:
            print(option_line, file=error_file, flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        exporter: ExportWikidata = ExportWikidata(
            node_file_path=nodes_path,
            edge_file_path=edges_path,
            qualifier_file_path=qualifiers_path,
            output_file_path=output_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        exporter.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
示例#30
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],
        column_names: typing.List[str],
        omit_remaining_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output with its columns in a new order.

    ``column_names`` lists the desired column order.  Two special tokens
    are recognized in that list:

    * ``...`` (ellipses) stands for every column not mentioned elsewhere
      in the list; it may appear only once and not directly after ``..``.
    * ``..`` (range operator) between two column names stands for every
      input column located between them, in input order when the second
      name follows the first in the input file, and in reverse input
      order when it precedes it.

    When columns are left unmentioned and no ellipses was given, the
    function raises unless ``omit_remaining_columns`` is true, in which
    case those columns are dropped from the output.

    Diagnostics go to stdout when ``errors_to_stdout`` is true and to
    stderr otherwise; ``errors_to_stderr`` is accepted but not consulted
    in this function.  ``output_format`` is passed through to
    ``KgtkWriter.open``.

    Returns 0 on success.

    Raises KGTKException for an unknown or duplicated column name, for a
    misplaced ``..`` or ``...`` token, for unaccounted-for columns, and
    for any other Exception or SystemExit raised while processing (the
    original exception is chained as the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format,
                  file=error_file,
                  flush=True)
        print("--columns %s" % " ".join(column_names),
              file=error_file,
              flush=True)
        print("--trim=%s" % str(omit_remaining_columns),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Input columns that have not been placed in the output order yet.
        remaining_names: typing.List[str] = kr.column_names.copy()
        # Columns placed so far (after the ellipses, once one was seen).
        reordered_names: typing.List[str] = []
        # Columns placed before the ellipses; None until one is seen.
        save_reordered_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..."  # All unmentioned columns
        ranger: str = ".."  # All columns between two columns.

        saw_ranger: bool = False
        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_reordered_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException(
                        "ELipses may not appear directly after a range operator ('..')."
                    )

                # Start a fresh list for the names that follow the ellipses.
                save_reordered_names = reordered_names
                reordered_names = []
                continue

            if column_name == ranger:
                if len(reordered_names) == 0:
                    raise KGTKException(
                        "The column range operator ('..') may not appear without a preceeding column name."
                    )
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException(
                    "Column name '%s' was duplicated in the list." %
                    column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = reordered_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]

                # Walk the columns strictly between the two end points,
                # forwards or backwards.  range() takes the direction from
                # the sign of the step, so a descending range really does
                # visit prior-1 down to target+1 (a `while idx <= end_idx`
                # test only works for the ascending direction).
                idx_inc: int = 1 if column_name_idx > prior_column_idx else -1
                idx: int
                for idx in range(prior_column_idx + idx_inc,
                                 column_name_idx,
                                 idx_inc):
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        # Report the in-range column that was already used,
                        # not the range's end point.
                        raise KGTKException(
                            "Column name '%s' (%s .. %s) was duplicated in the list."
                            % (idx_column_name, prior_column_name, column_name))

                    reordered_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)

            reordered_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException(
                "The column ranger operator ('..') may not end the list of column names."
            )

        if len(remaining_names) > 0 and save_reordered_names is None:
            # There are remaining column names and the ellipses was not seen.
            if not omit_remaining_columns:
                raise KGTKException(
                    "No ellipses, and the following columns not accounted for: %s"
                    % " ".join(remaining_names))
            else:
                if verbose:
                    print("Omitting the following columns: %s" %
                          " ".join(remaining_names),
                          file=error_file,
                          flush=True)
        if save_reordered_names is not None:
            # Final order: names before the ellipses, then the unmentioned
            # columns, then the names that followed the ellipses.
            if len(remaining_names) > 0:
                save_reordered_names.extend(remaining_names)
            if len(reordered_names) > 0:
                save_reordered_names.extend(reordered_names)
            reordered_names = save_reordered_names

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            reordered_names,
            output_kgtk_file,
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            mode=KgtkWriter.Mode[kr.mode.name],
            output_format=output_format,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Built once from the input column order and reused for every row.
        shuffle_list: typing.List = kw.build_shuffle_list(kr.column_names)

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            kw.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" %
                  (input_data_lines, input_kgtk_file),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e