Пример #1
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Read a KGTK file, run it through a KgtkIdBuilder, and write the result.

    Args:
        input_file: Input file specification, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: Output file specification, resolved with
            KGTKArgumentParser.get_output_file().
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not read it (stderr is used whenever errors_to_stdout is
            false).
        show_options: Print the resolved file names and the option structures
            to the error stream before processing.
        verbose: Passed through to the reader and the writer.
        very_verbose: Passed through to the reader and the writer.
        **kwargs: Source dictionary for KgtkIdBuilderOptions,
            KgtkReaderOptions, and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Track the open files so they can be released even when a later step
    # fails part-way through.
    kr: typing.Optional[KgtkReader] = None
    ew: typing.Optional[KgtkWriter] = None

    try:
        try:
            # First create the KgtkReader.  It provides parameters used by
            # the ID column builder. Next, create the ID column builder,
            # which provides a possibly revised list of column names for the
            # KgtkWriter.  Create the KgtkWriter.  Last, process the data
            # stream.

            # Open the input file.
            kr = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew = KgtkWriter.open(idb.column_names,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 gzip_in_parallel=False,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            # Process the input file, building IDs.
            idb.process(kr, ew)
        finally:
            # Clean up on both the success and the failure path.  The cleanup
            # stays inside the outer try so that a failing close() is still
            # reported as a KGTKException.
            try:
                if ew is not None:
                    ew.close()
            finally:
                if kr is not None:
                    kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Пример #2
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        namespace_file: KGTKFiles,
        updated_namespace_file: KGTKFiles,
        namespace_id_prefix: str,
        namespace_id_use_uuid: bool,
        namespace_id_counter: int,
        namespace_id_zfill: int,
        output_only_used_namespaces: bool,
        allow_lax_uri: bool,
        local_namespace_prefix: str,
        local_namespace_use_uuid: bool,
        prefix_expansion_label: str,
        structured_value_label: str,
        structured_uri_label: str,
        newnode_prefix: str,
        newnode_use_uuid: bool,
        newnode_counter: int,
        newnode_zfill: int,
        build_id: bool,
        escape_pipes: bool,
        validate: bool,
        override_uuid: typing.Optional[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkNtriples processor from the arguments and run it.

    The file arguments are resolved through KGTKArgumentParser; the reject,
    namespace, and updated-namespace files are optional.  All remaining named
    arguments are handed to KgtkNtriples unchanged, and **kwargs is the
    source for the ID builder, reader, and value option structures.  When
    show_options is true, every setting is echoed to the error stream first.

    Returns 0 on success.  Any Exception or SystemExit raised while building
    or running the processor is re-raised as a KGTKException.
    errors_to_stderr is accepted but not read by this function.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the required input/output files first, then the optional ones.
    input_file_paths: typing.List[Path] = \
        KGTKArgumentParser.get_input_file_list(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_file_path: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")
    namespace_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_input_file(
            namespace_file, who="KGTK namespace file")
    updated_namespace_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            updated_namespace_file, who="KGTK updated namespace file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    def _emit(text: str) -> None:
        # One option line, flushed immediately, on the diagnostics stream.
        print(text, file=error_file, flush=True)

    # Derive the three option structures from the leftover keyword arguments.
    idbuilder_options: KgtkIdBuilderOptions = \
        KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective settings when asked to.
    if show_options:
        _emit("--input-files %s" % " ".join(map(str, input_file_paths)))
        _emit("--output-file=%s" % str(output_kgtk_file))
        if reject_file_path is not None:
            _emit("--reject-file=%s" % str(reject_file_path))
        if namespace_kgtk_file is not None:
            _emit("--namespace-file=%s" % str(namespace_kgtk_file))
        if updated_namespace_kgtk_file is not None:
            _emit("--updated-namespace-file=%s" %
                  str(updated_namespace_kgtk_file))

        _emit("--namespace-id-prefix %s" % namespace_id_prefix)
        _emit("--namespace-id-use-uuid %s" % str(namespace_id_use_uuid))
        _emit("--namespace-id-counter %s" % str(namespace_id_counter))
        _emit("--namespace-id-zfill %s" % str(namespace_id_zfill))
        _emit("--output-only-used-namespaces %s" %
              str(output_only_used_namespaces))

        _emit("--allow-lax-uri %s" % str(allow_lax_uri))

        _emit("--local-namespace-prefix %s" % local_namespace_prefix)
        _emit("--local-namespace-use-uuid %s" % str(local_namespace_use_uuid))

        _emit("--prefix-expansion-label %s" % prefix_expansion_label)
        _emit("--structured-value-label %s" % structured_value_label)
        _emit("--structured-uri-label %s" % structured_uri_label)

        _emit("--newnode-prefix %s" % newnode_prefix)
        _emit("--newnode-use-uuid %s" % str(newnode_use_uuid))
        _emit("--newnode-counter %s" % str(newnode_counter))
        _emit("--newnode-zfill %s" % str(newnode_zfill))

        _emit("--build-id=%s" % str(build_id))
        _emit("--escape-pipes=%s" % str(escape_pipes))
        _emit("--validate=%s" % str(validate))
        _emit("--override-uuid=%s" % str(override_uuid))

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        _emit("=======")

    try:
        converter: KgtkNtriples = KgtkNtriples(
            # Files.
            input_file_paths=input_file_paths,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_file_path,
            updated_namespace_file_path=updated_namespace_kgtk_file,
            namespace_file_path=namespace_kgtk_file,
            # Namespace ID settings.
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            # New node settings.
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            # URI, local namespace, and label settings.
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            # Miscellaneous switches.
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            # Option structures and feedback.
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)
        converter.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
Пример #3
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,

        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Configure a KgtkImplode processor from the arguments and run it.

    When show_options is true the effective settings are echoed to the error
    stream first.  When show_data_types is true the function only lists
    KgtkFormat.DataType.choices() on the error stream and returns 0 without
    processing any file.  Otherwise the arguments are handed to KgtkImplode
    (a missing without_fields becomes an empty list) and process() is called.

    Returns 0 on success.  Any Exception or SystemExit raised while building
    or running the processor is re-raised as a KGTKException.
    errors_to_stderr is accepted but not read by this function.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments; the reject file is optional.
    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(reject_file,
                                                    who="KGTK reject file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    def _emit(text: str) -> None:
        # One line, flushed immediately, on the diagnostics stream.
        print(text, file=error_file, flush=True)

    # Derive the three option structures from the leftover keyword arguments.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective settings when asked to.
    if show_options:
        # The first line is deliberately written without an explicit flush.
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        _emit("--output-file=%s" % str(output_kgtk_file))
        if reject_kgtk_file is not None:
            _emit("--reject-file=%s" % str(reject_kgtk_file))

        _emit("--column %s" % column_name)
        _emit("--prefix %s" % prefix)
        _emit("--overwrite %s" % str(overwrite_column))
        _emit("--validate %s" % str(validate))
        _emit("--escape-pipes %s" % str(escape_pipes))
        _emit("--quantities-include-numbers %s" % str(quantities_include_numbers))
        _emit("--general-strings %s" % str(general_strings))
        _emit("--remove-prefixed-columns %s" % str(remove_prefixed_columns))
        _emit("--ignore-unselected-types %s" % str(ignore_unselected_types))
        _emit("--retain-unselected-types %s" % str(retain_unselected_types))
        if type_names is not None:
            _emit("--types %s" % " ".join(type_names))
        if without_fields is not None:
            _emit("--without %s" % " ".join(without_fields))
        _emit("--show-data-types %s" % str(show_data_types))
        _emit("--quiet %s" % str(quiet))
        _emit("--build-id=%s" % str(build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        _emit("=======")

    # Listing the data types is a standalone action: report and stop.
    if show_data_types:
        for type_choice in KgtkFormat.DataType.choices():
            _emit("%s" % type_choice)
        return 0

    # KgtkImplode is always given a list, never None.
    wf: typing.List[str] = [] if without_fields is None else without_fields

    try:
        imploder: KgtkImplode = KgtkImplode(
            # Files.
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            # Column selection.
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=wf,
            # Behaviour switches.
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            # Option structures and feedback.
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)
        imploder.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
Пример #4
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        compact_id: bool,
        sorted_input: bool,
        verify_sort: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkCompact processor from the arguments and run it.

    Args:
        input_file: Input file specification, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: Output file specification, resolved with
            KGTKArgumentParser.get_output_file().
        key_column_names: Passed to KgtkCompact as key_column_names.
        compact_id: Passed to KgtkCompact as compact_id.
        sorted_input: Passed to KgtkCompact as sorted_input (shown as
            --presorted in the option dump).
        verify_sort: Passed to KgtkCompact as verify_sort.
        build_id: Passed to KgtkCompact as build_id.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not read it.
        show_options: Print the effective settings to the error stream
            before processing.
        verbose: Passed through to KgtkCompact.
        very_verbose: Passed through to KgtkCompact.
        **kwargs: Source dictionary for KgtkIdBuilderOptions,
            KgtkReaderOptions, and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    # NOTE(review): Path, sys, KgtkCompact and the option classes are not
    # imported here; presumably they are imported at module level -- confirm.
    from kgtk.exceptions import KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--compact-id=%s" % str(compact_id), file=error_file, flush=True)
        # This line used to go to stdout regardless of the selected error
        # stream; send it to error_file like every other option line.
        print("--presorted=%s" % str(sorted_input),
              file=error_file,
              flush=True)
        print("--verify-sort=%s" % str(verify_sort),
              file=error_file,
              flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ex: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            compact_id=compact_id,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            output_file_path=output_kgtk_file,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ex.process()

        return 0

    except SystemExit as e:
        # Keep the original exception attached as the cause for debugging.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Пример #5
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        list_output_file: KGTKFiles,
        key_column_names: typing.List[str],
        keep_first_names: typing.List[str],
        compact_id: bool,
        deduplicate: bool,
        sorted_input: bool,
        verify_sort: bool,
        lists_in_input: bool,
        report_lists: bool,
        exclude_lists: bool,
        output_only_lists: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkCompact processor from the arguments and run it.

    The input and output files are required; the list output file is
    optional.  When show_options is true the effective settings, the option
    structures, and the debug arguments are echoed to the error stream.
    exclude_lists and output_only_lists are mutually exclusive: requesting
    both raises a KGTKException before any processing starts.

    Returns 0 on success.  Any Exception or SystemExit raised while building
    or running the processor is re-raised as a KGTKException.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments; the list output file is optional.
    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    list_output_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(
            list_output_file, who="KGTK list output file")

    # Diagnostics go to stderr unless stdout was explicitly requested.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    def _emit(text: str) -> None:
        # One option line, flushed immediately, on the diagnostics stream.
        print(text, file=error_file, flush=True)

    # Derive the three option structures from the leftover keyword arguments.
    idbuilder_options: KgtkIdBuilderOptions = \
        KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective settings when asked to.  Lines written with a plain
    # print() are the ones that are not explicitly flushed.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if list_output_kgtk_file is not None:
            _emit("--list-output-file=%s" % str(list_output_kgtk_file))
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--keep-first=%s" % " ".join(keep_first_names), file=error_file)
        _emit("--compact-id=%s" % str(compact_id))
        _emit("--deduplicate=%s" % str(deduplicate))
        _emit("--presorted=%s" % str(sorted_input))
        _emit("--verify-sort=%s" % str(verify_sort))
        _emit("--lists-in-input=%s" % str(lists_in_input))
        _emit("--report-lists=%s" % str(report_lists))
        _emit("--exclude-lists=%s" % str(exclude_lists))
        _emit("--output-only-lists=%s" % str(output_only_lists))
        _emit("--build-id=%s" % str(build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=error_file)
        _emit("=======")

    # The two list-handling modes contradict each other.
    conflicting_list_modes: bool = exclude_lists and output_only_lists
    if conflicting_list_modes:
        raise KGTKException(
            "--exclude-lists and --output-only-lists may not be used together."
        )

    try:
        compactor: KgtkCompact = KgtkCompact(
            # Files.
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            list_output_file_path=list_output_kgtk_file,
            # Column selection.
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            # Behaviour switches.
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            # Option structures and feedback.
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        compactor.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
Пример #6
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        method: str = "blockmodel",
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Cluster the nodes of a KGTK edge file using graph_tool block models.

    The node1/node2 values of every input row become the vertices and edges
    of an undirected graph_tool Graph.  One output row
    ``[node, 'in', 'cluster_...']`` is written per distinct node; the
    ``mcmc`` method adds a ``node2;prob`` column holding the selected
    cluster's marginal divided by the sum of the node's marginals.

    Args:
        input_file: Input file specification, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: Output file specification, resolved with
            KGTKArgumentParser.get_output_file().
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not read it.
        show_options: Print the effective settings to the error stream
            before processing.
        verbose: Passed through to the reader and the writer.
        very_verbose: Passed through to the reader and the writer.
        method: One of "blockmodel", "nested", or "mcmc".
        **kwargs: Source dictionary for KgtkIdBuilderOptions,
            KgtkReaderOptions, and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: If method is not supported, or if processing raises
            any Exception or SystemExit.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # NOTE(review): Graph and graph_tool are not imported in this function;
    # presumably they are imported at module level -- confirm.

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions =\
        KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--method=%s" % str(method), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Reject an unknown method up front.  Previously the whole input was read
    # and the failure only surfaced as an unbound-variable error at the end.
    supported_methods: typing.Tuple[str, ...] = ("blockmodel", "nested",
                                                 "mcmc")
    if method not in supported_methods:
        raise KGTKException("Unsupported method '%s'; expected one of: %s" %
                            (method, ", ".join(supported_methods)))

    # Track the open files so they are released on every exit path.
    kr: typing.Optional[KgtkReader] = None
    kw: typing.Optional[KgtkWriter] = None

    try:
        try:
            # Open the input file.
            kr = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            g = Graph(directed=False)

            # Give every distinct node a dense vertex index in order of
            # first appearance, and remember the edges by node name.
            d: typing.Dict[str, int] = {}
            nodes: typing.List[str] = []
            edges: typing.List[typing.Tuple[str, str]] = []
            for row in kr:
                node1 = row[kr.node1_column_idx]
                node2 = row[kr.node2_column_idx]
                for node in (node1, node2):
                    if node not in d:
                        d[node] = len(nodes)
                        nodes.append(node)
                edges.append((node1, node2))

            g.add_vertex(len(d))

            for source, target in edges:
                g.add_edge(g.vertex(d[source]), g.vertex(d[target]))

            column_names: typing.List[str] = ["node1", "label", "node2"]
            out_rows: typing.List[typing.List[str]]

            if method == 'blockmodel':
                state = graph_tool.inference.minimize.minimize_blockmodel_dl(g)
                blocks = state.get_blocks()
                out_rows = [[node, 'in', 'cluster_' + str(blocks[i])]
                            for i, node in enumerate(nodes)]

            elif method == 'nested':
                state = graph_tool.inference.minimize.\
                    minimize_nested_blockmodel_dl(g)

                # Each path starts as [vertex index]; one block id per level
                # is then prepended, stopping at the first level that has a
                # single block.
                arr = [[str(i)] for i in range(len(nodes))]

                for i in range(0, len(state.levels)):
                    if state.levels[i].get_B() == 1:
                        break
                    level_blocks = state.levels[i].get_blocks()
                    for path in arr:
                        # NOTE(review): the lookup key is always the last
                        # element, i.e. the original vertex index as a
                        # string, at every level -- confirm this is the
                        # intended index into the higher levels.
                        path.insert(0, str(level_blocks[path[-1]]))

                # Drop the trailing vertex index and join the block ids.
                out_rows = [[node, 'in', 'cluster_' + '_'.join(path[:-1])]
                            for node, path in zip(nodes, arr)]

            else:  # method == 'mcmc'
                state = graph_tool.inference.minimize.minimize_blockmodel_dl(g)
                graph_tool.inference.mcmc.\
                    mcmc_equilibrate(state, wait=1000,
                                     mcmc_args=dict(niter=10))

                state.multiflip_mcmc_sweep(niter=1000)
                graph_tool.inference.mcmc.\
                    mcmc_equilibrate(state, wait=10,
                                     nbreaks=2, mcmc_args=dict(niter=10))

                bs = []  # collect some partitions

                def collect_partitions(s):
                    bs.append(s.b.a.copy())

                # Collect partitions through the callback while running
                # mcmc_equilibrate with force_niter=10000 and niter=10.
                graph_tool.inference.mcmc.mcmc_equilibrate(
                    state,
                    force_niter=10000,
                    mcmc_args=dict(niter=10),
                    callback=collect_partitions)

                # Disambiguate partitions and obtain marginals
                pmode = graph_tool.inference.partition_modes.\
                    PartitionModeState(bs, converge=True)
                pv = list(pmode.get_marginal(g))
                m = list(pmode.get_max(g))

                column_names = ["node1", "label", "node2", 'node2;prob']
                out_rows = [[
                    node, 'in', 'cluster_' + str(m[i]),
                    str(pv[i][m[i]] / sum(pv[i]))
                ] for i, node in enumerate(nodes)]

            # Open the output file only once the rows are ready.
            kw = KgtkWriter.open(
                column_names,
                output_kgtk_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            for out_row in out_rows:
                kw.write(out_row)
        finally:
            # Clean up on both the success and the failure path.  The cleanup
            # stays inside the outer try so that a failing close() is still
            # reported as a KGTKException.
            try:
                if kw is not None:
                    kw.close()
            finally:
                if kr is not None:
                    kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Пример #7
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_values: typing.Optional[typing.List[str]] = None,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        ignore_empty_node1: bool = False,
        ignore_empty_node2: bool = False,
        add_id: bool = False,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Lower and/or normalize extra columns of a KGTK edge file into new edges.

    Every input row is copied to the output with the selected columns
    removed.  For each selected column holding a value, a new edge
    (base column value, label, column value) is written either to the
    separate new-edges file (when one is given) or to the main output file.

    The columns to process are selected by one of three patterns:

    1. ``columns_to_lower`` and ``base_columns`` are both given: they are
       paired by position, and ``label_values`` (if given) supplies the
       label for each pair.
    2. Only ``columns_to_lower`` is given: each name is split at
       ``lift_separator`` into base name and label (``lower``), or is
       attached to the ID column (``normalize``).
    3. ``columns_to_lower`` is absent: every non-key column is examined
       using the same split/attach rules; ``base_columns`` defaults to the
       key columns found in the input file.

    Args:
        input_file: The KGTK input file specification.
        output_file: The KGTK output file specification.
        new_edges_file: Optional output file for the newly generated edges.
        base_columns: Names of the columns supplying node1 values.
        columns_to_lower: Names of the columns to remove and turn into edges.
        label_values: Label values for the new edges (pattern 1 only).
        lift_separator: Separator between base name and label in column names.
        ignore_empty_node1: Skip, rather than fail on, empty base values.
        ignore_empty_node2: Skip, rather than fail on, empty column values.
        add_id: Build ID values for the new edges using KgtkIdBuilder.
        lower: Enable lowering of columns named ``base<sep>label``.
        normalize: Enable normalizing columns against the ID column.
        deduplicate_new_edges: Emit each (node1, label, node2) triple once.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not consulted here; stderr is the default target.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writers.
        **kwargs: Options consumed by the ID builder, reader and value
            option structures.

    Returns:
        0 on success; 1 if an exception was passed to
        ``kgtk_exception_auto_handler`` and that handler returned.

    Raises:
        KGTKException: If neither ``lower`` nor ``normalize`` is requested
            (raised before processing starts).
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalue import KgtkValue
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns %s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower %s" % " ".join(columns_to_lower),
                  file=error_file)
        if label_values is not None:
            print("--label-values %s" % " ".join(label_values),
                  file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--add-id=%s" % add_id, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--ignore-empty-node1=%s" % ignore_empty_node1, file=error_file)
        print("--ignore-empty-node2=%s" % ignore_empty_node2, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to a pair:
        # (index of the base column that supplies its node1 value,
        #  label value to use for the new edge).
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        # The key columns are never lowered; remember their names and indexes.
        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            # Normalized edges hang off the ID column, so it must exist.
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # column_names and base_columns are paired. New records use label_values if specified.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to lower but only %d base columns." %
                    (len(columns_to_lower), len(base_columns)))

            if label_values is not None and len(label_values) > 0 and len(
                    label_values) != len(columns_to_lower):
                raise KGTKException(
                    "There are %d columns to lower but only %d label values." %
                    (len(columns_to_lower), len(label_values)))

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    # Normalizing: the column name itself is the label.
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    if label_values is not None and len(
                            label_values) > 0 and len(label_values[idx]) > 0:
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], label_values[idx])
                    else:
                        # No explicit label: fall back to the column name.
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], column_name)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being lowered.
        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)

        # Create the ID builder.
        idb: typing.Optional[KgtkIdBuilder] = None
        if add_id:
            idb = KgtkIdBuilder.from_column_names(output_column_names,
                                                  idbuilder_options)
            output_column_names = idb.column_names.copy()

        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(new_edges_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the new edges.  The set holds keys of the form
        #  node1_value + KEY_FIELD_SEPARATOR + label + KEY_FIELD_SEPARATOR + node2_value
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # Copy the input row to the output, minus the lowered columns.
            output_row: typing.List[str] = kw.shuffle(
                row, shuffle_list=shuffle_list)
            kw.write(output_row)
            output_line_count += 1

            # Sequence number of the new edges generated from this input row;
            # it restarts for each input row.
            id_seq_num: int = 0
            column_idx: int
            for column_idx in lower_map.keys():
                node1_idx: int
                node1_idx, new_label_value = lower_map[column_idx]
                node1_value: str
                node1_value = row[node1_idx]
                if len(node1_value) == 0:
                    if ignore_empty_node1:
                        continue  # Ignore empty node1 values.
                    else:
                        raise KGTKException(
                            "Empty node1 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                item: str = row[column_idx]
                if len(item) == 0:
                    if ignore_empty_node2:
                        continue  # Ignore empty node2 values.
                    else:
                        raise KGTKException(
                            "Empty node2 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                # This item might be a KGTK list.  Let's split it, because
                # lists aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        if ignore_empty_node2:
                            continue  # Ignore empty node2 values in a list.
                        else:
                            raise KGTKException(
                                "Empty node2 value in a list when lowering %d to %d: %s in input line %d"
                                % (column_idx, node1_idx, new_label_value,
                                   input_line_count))

                    if deduplicate_new_edges:
                        label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    # Build the new edge in input-column layout, then map it
                    # to the output layout with the same shuffle list.
                    lowered_input_row: typing.List[str] = [
                        "" for idx in range(kr.column_count)
                    ]
                    lowered_input_row[kr.node1_column_idx] = node1_value
                    lowered_input_row[kr.label_column_idx] = new_label_value
                    lowered_input_row[kr.node2_column_idx] = node2_value

                    lowered_output_row: typing.List[str] = kw.shuffle(
                        lowered_input_row, shuffle_list=shuffle_list)
                    if idb is not None:
                        # Advance the sequence number so that each new edge
                        # from this row is built with a distinct value.
                        id_seq_num += 1
                        lowered_output_row = idb.build(lowered_output_row,
                                                       id_seq_num,
                                                       already_added=True)
                    if lkw is not None:
                        lkw.write(lowered_output_row)
                        label_line_count += 1
                    else:
                        kw.write(lowered_output_row)
                        label_line_count += 1
                        output_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        # Release the input and output files.
        kr.close()
        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1