示例#1
0
def main():
    """
    Test the KGTK compact processor from the command line.

    Parses the command-line arguments, optionally echoes the effective
    option values to the error stream, then builds a KgtkCompact from
    those values and runs it.

    TODO: Support the list output file.
    """
    parser: ArgumentParser = ArgumentParser()

    def add_bool_option(flag, dest, help_text, default, **extra):
        # All boolean switches share one shape: the value is parsed by
        # optional_bool, and a bare flag (no value) yields const=True.
        parser.add_argument(flag,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default,
                            **extra)

    # Optional positional input file.
    parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        nargs='+',
        default=[],
        help=("The key columns to identify records for compaction. "
              "(default=id for node files, (node1, label, node2, id) for edge files)."))

    parser.add_argument(
        "--keep-first",
        dest="keep_first_names",
        nargs='+',
        default=[],
        help=("If compaction results in a list of values for any column on this list, keep only the first value after sorting. "
              "(default=none)."))

    add_bool_option(
        "--compact-id", "compact_id",
        "Indicate that the ID column in KGTK edge files should be compacted. "
        "Normally, if the ID column exists, it is not compacted, "
        "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s).",
        False)

    add_bool_option(
        "--deduplicate", "deduplicate",
        "Treat all columns as key columns, overriding --columns and --compact-id. "
        "This will remove completely duplicate records without compacting any new lists. "
        "(default=%(default)s).",
        False,
        metavar="True|False")

    add_bool_option(
        "--presorted", "sorted_input",
        "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        False)

    add_bool_option(
        "--verify-sort", "verify_sort",
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        True)

    add_bool_option(
        "--lists-in-input", "lists_in_input",
        "Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s).",
        True)

    add_bool_option(
        "--report-lists", "report_lists",
        "When True, report records with lists to the error output. (default=%(default)s).",
        False)

    add_bool_option(
        "--exclude-lists", "exclude_lists",
        "When True, exclude records with lists from the output. (default=%(default)s).",
        False)

    add_bool_option(
        "--output-only-lists", "output_only_lists",
        "When True, output only records containing lists. (default=%(default)s).",
        False)

    parser.add_argument(
        "-o", "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    add_bool_option(
        "--build-id", "build_id",
        "Build id values in an id column. (default=%(default)s).",
        False)

    # Shared option groups.  The attributes read below that are not declared
    # in this function (errors_to_stdout, show_options, verbose,
    # very_verbose) have to be registered by these calls.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stderr unless the user redirected them to stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Turn the parsed namespace into the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def report(line):
        # Emit one option line on the error stream, flushed immediately.
        print(line, file=error_file, flush=True)

    # Echo the effective options for debugging and documentation.
    if args.show_options:
        report(f"input: {args.input_file_path!s}")
        report(f"--output-file={args.output_file_path!s}")
        report("--columns " + " ".join(args.key_column_names))
        report("--keep-first " + " ".join(args.keep_first_names))
        report(f"--compact-id={args.compact_id!s}")
        report(f"--deduplicate={args.deduplicate!s}")
        report(f"--presorted={args.sorted_input!s}")
        report(f"--verify-sort={args.verify_sort!s}")
        report(f"--lists-in-input={args.lists_in_input!s}")
        report(f"--report-lists={args.report_lists!s}")
        report(f"--exclude-lists={args.exclude_lists!s}")
        report(f"--output-only-lists={args.output_only_lists!s}")
        report(f"--build-id={args.build_id!s}")
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        keep_first_names=args.keep_first_names,
        compact_id=args.compact_id,
        deduplicate=args.deduplicate,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        lists_in_input=args.lists_in_input,
        report_lists=args.report_lists,
        exclude_lists=args.exclude_lists,
        output_only_lists=args.output_only_lists,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    compactor.process()
示例#2
0
def main():
    """
    Test the KGTK implode processor from the command line.

    Parses the command-line arguments, optionally echoes the effective
    option values to the error stream, then builds a KgtkImplode from
    those values and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    def add_bool_option(flag, dest, help_text, default):
        # All boolean switches share one shape: the value is parsed by
        # optional_bool, and a bare flag (no value) yields const=True.
        parser.add_argument(flag,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default)

    # Optional positional input file.
    parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data. (default=%(default)s)")

    parser.add_argument(
        "--column",
        dest="column_name",
        default="node2",
        help="The name of the column to explode. (default=%(default)s).")

    parser.add_argument(
        "--types",
        dest="type_names",
        nargs='*',
        choices=KgtkFormat.DataType.choices(),
        default=KgtkFormat.DataType.choices(),
        help="The KGTK data types for which fields should be imploded. (default=%(default)s).")

    parser.add_argument(
        "--without",
        dest="without_fields",
        nargs='*',
        choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
        default=None,
        help="The KGTK fields to do without. (default=%(default)s).")

    parser.add_argument(
        "-o", "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    parser.add_argument(
        "--prefix",
        dest="prefix",
        default="node2;kgtk:",
        help="The prefix for exploded column names. (default=%(default)s).")

    add_bool_option(
        "--overwrite", "overwrite_column",
        "Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
        True)

    add_bool_option(
        "--validate", "validate",
        "Validate imploded values. (default=%(default)s).",
        True)

    add_bool_option(
        "--escape-pipes", "escape_pipes",
        "When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
        False)

    add_bool_option(
        "--quantities-include-numbers", "quantities_include_numbers",
        "When true, numbers are acceptable quantities. (default=%(default)s).",
        True)

    add_bool_option(
        "--general-strings", "general_strings",
        "When true, strings may include language qualified strings. (default=%(default)s).",
        True)

    add_bool_option(
        "--remove-prefixed-columns", "remove_prefixed_columns",
        "When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
        False)

    add_bool_option(
        "--ignore-unselected-types", "ignore_unselected_types",
        "When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
        True)

    add_bool_option(
        "--retain-unselected-types", "retain_unselected_types",
        "When true, input records with valid but unselected data types will be retain existing data on output. (default=%(default)s).",
        True)

    add_bool_option(
        "--build-id", "build_id",
        "Build id values in an id column. (default=%(default)s).",
        False)

    parser.add_argument(
        "--reject-file",
        dest="reject_file_path",
        type=Path,
        default=None,
        help="The KGTK file into which to write rejected records (default=%(default)s).")

    # Shared option groups.  The attributes read below that are not declared
    # in this function (errors_to_stdout, show_options, verbose,
    # very_verbose) have to be registered by these calls.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stderr unless the user redirected them to stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Turn the parsed namespace into the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def report(line):
        # Emit one option line on the error stream, flushed immediately.
        print(line, file=error_file, flush=True)

    # Echo the effective options for debugging and documentation.
    if args.show_options:
        report(f"input: {args.input_file_path!s}")
        report(f"--column {args.column_name!s}")
        report(f"--prefix {args.prefix!s}")
        report(f"--overwrite {args.overwrite_column!s}")
        report(f"--validate {args.validate!s}")
        report(f"--escape-pipes {args.escape_pipes!s}")
        report(f"--quantities-include-numbers {args.quantities_include_numbers!s}")
        report(f"--general-strings {args.general_strings!s}")
        report(f"--remove-prefixed-columns {args.remove_prefixed_columns!s}")
        report(f"--ignore-unselected-types {args.ignore_unselected_types!s}")
        report(f"--retain-unselected-types {args.retain_unselected_types!s}")
        # The list-valued and optional settings are echoed only when set.
        if args.type_names is not None:
            report("--types " + " ".join(args.type_names))
        if args.without_fields is not None:
            report("--without " + " ".join(args.without_fields))
        report(f"--output-file={args.output_file_path!s}")
        if args.reject_file_path is not None:
            report(f"--reject-file={args.reject_file_path!s}")
        report(f"--build-id={args.build_id!s}")
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # --without defaults to None; the processor is handed an empty list then.
    without_fields: typing.List[str] = []
    if args.without_fields is not None:
        without_fields = args.without_fields

    imploder: KgtkImplode = KgtkImplode(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        prefix=args.prefix,
        type_names=args.type_names,
        without_fields=without_fields,
        overwrite_column=args.overwrite_column,
        validate=args.validate,
        escape_pipes=args.escape_pipes,
        quantities_include_numbers=args.quantities_include_numbers,
        general_strings=args.general_strings,
        remove_prefixed_columns=args.remove_prefixed_columns,
        ignore_unselected_types=args.ignore_unselected_types,
        retain_unselected_types=args.retain_unselected_types,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    imploder.process()
示例#3
0
def main():
    """
    Test the KGTK ntriples importer from the command line.

    Parses the command-line arguments, optionally echoes the effective
    option values to the error stream, then builds a KgtkNtriples from
    those values and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(
        "-i",
        "--input-files",
        dest="input_file_paths",
        nargs='*',
        help="The file(s) with the input ntriples data. (default=%(default)s)",
        type=Path,
        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--reject-file",
        dest="reject_file_path",
        help=
        "The KGTK file into which to write rejected records. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--namespace-file",
        dest="namespace_file_path",
        help="The KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--updated-namespace-file",
        dest="updated_namespace_file_path",
        help=
        "An updated KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    # Shared option groups.  Every attribute read from args below that is not
    # declared in this function has to be registered by one of these calls.
    KgtkNtriples.add_arguments(parser)
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # When -i/--input-files is omitted, argparse converts the scalar string
    # default "-" with type=Path and yields a single Path, not a list.
    # Normalize to a list so the value can be iterated below.
    # NOTE(review): KgtkNtriples is assumed to expect a list of paths, as the
    # plural parameter name suggests -- confirm against its constructor.
    input_file_paths = args.input_file_paths
    if isinstance(input_file_paths, (str, Path)):
        input_file_paths = [Path(input_file_paths)]

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("--input-files %s" %
              " ".join([str(path) for path in input_file_paths]),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        # Optional file paths are echoed only when they were supplied.
        if args.reject_file_path is not None:
            print("--reject-file=%s" % str(args.reject_file_path),
                  file=error_file,
                  flush=True)
        if args.namespace_file_path is not None:
            print("--namespace-file=%s" % str(args.namespace_file_path),
                  file=error_file,
                  flush=True)
        if args.updated_namespace_file_path is not None:
            print("--updated-namespace-file=%s" %
                  str(args.updated_namespace_file_path),
                  file=error_file,
                  flush=True)
        print("--namespace-id-prefix %s" % args.namespace_id_prefix,
              file=error_file,
              flush=True)
        print("--namespace-id-use-uuid %s" % str(args.namespace_id_use_uuid),
              file=error_file,
              flush=True)
        print("--namespace-id-counter %s" % str(args.namespace_id_counter),
              file=error_file,
              flush=True)
        print("--namespace-id-zfill %s" % str(args.namespace_id_zfill),
              file=error_file,
              flush=True)
        print("--output-only-used-namespaces %s" %
              str(args.output_only_used_namespaces),
              file=error_file,
              flush=True)
        print("--allow-lax-uri %s" % str(args.allow_lax_uri),
              file=error_file,
              flush=True)
        print("--local-namespace-prefix %s" % args.local_namespace_prefix,
              file=error_file,
              flush=True)
        print("--local-namespace-use-uuid %s" %
              str(args.local_namespace_use_uuid),
              file=error_file,
              flush=True)
        print("--prefix-expansion-label %s" % args.prefix_expansion_label,
              file=error_file,
              flush=True)
        print("--structured-value-label %s" % args.structured_value_label,
              file=error_file,
              flush=True)
        print("--structured-uri-label %s" % args.structured_uri_label,
              file=error_file,
              flush=True)
        print("--newnode-prefix %s" % args.newnode_prefix,
              file=error_file,
              flush=True)
        print("--newnode-use-uuid %s" % str(args.newnode_use_uuid),
              file=error_file,
              flush=True)
        print("--newnode-counter %s" % str(args.newnode_counter),
              file=error_file,
              flush=True)
        print("--newnode-zfill %s" % str(args.newnode_zfill),
              file=error_file,
              flush=True)
        print("--build-id=%s" % str(args.build_id),
              file=error_file,
              flush=True)
        print("--escape-pipes=%s" % str(args.escape_pipes),
              file=error_file,
              flush=True)
        print("--validate=%s" % str(args.validate),
              file=error_file,
              flush=True)
        if args.override_uuid is not None:
            print("--override_uuid=%s" % str(args.override_uuid),
                  file=error_file,
                  flush=True)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kn: KgtkNtriples = KgtkNtriples(
        input_file_paths=input_file_paths,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        namespace_file_path=args.namespace_file_path,
        updated_namespace_file_path=args.updated_namespace_file_path,
        namespace_id_prefix=args.namespace_id_prefix,
        namespace_id_use_uuid=args.namespace_id_use_uuid,
        namespace_id_counter=args.namespace_id_counter,
        namespace_id_zfill=args.namespace_id_zfill,
        output_only_used_namespaces=args.output_only_used_namespaces,
        newnode_prefix=args.newnode_prefix,
        newnode_use_uuid=args.newnode_use_uuid,
        newnode_counter=args.newnode_counter,
        newnode_zfill=args.newnode_zfill,
        allow_lax_uri=args.allow_lax_uri,
        local_namespace_prefix=args.local_namespace_prefix,
        local_namespace_use_uuid=args.local_namespace_use_uuid,
        prefix_expansion_label=args.prefix_expansion_label,
        structured_value_label=args.structured_value_label,
        structured_uri_label=args.structured_uri_label,
        build_id=args.build_id,
        escape_pipes=args.escape_pipes,
        idbuilder_options=idbuilder_options,
        validate=args.validate,
        override_uuid=args.override_uuid,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    kn.process()
示例#4
0
def main():
    """
    Test the KGTK compact processor from the command line.

    Parses the command-line arguments, optionally echoes the effective
    option values to the error stream, then builds a KgtkCompact from
    those values and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    def add_bool_option(flag, dest, help_text, default):
        # All boolean switches share one shape: the value is parsed by
        # optional_bool, and a bare flag (no value) yields const=True.
        parser.add_argument(flag,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default)

    # Optional positional input file.
    parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        nargs='+',
        default=[],
        help=("The key columns to identify records for compaction. "
              "(default=id for node files, (node1, label, node2, id) for edge files)."))

    add_bool_option(
        "--compact-id", "compact_id",
        "Indicate that the ID column in KGTK edge files should be compacted. "
        "Normally, if the ID column exists, it is not compacted, "
        "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s).",
        False)

    add_bool_option(
        "--presorted", "sorted_input",
        "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        False)

    add_bool_option(
        "--verify-sort", "verify_sort",
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        True)

    parser.add_argument(
        "-o", "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    add_bool_option(
        "--build-id", "build_id",
        "Build id values in an id column. (default=%(default)s).",
        False)

    # Shared option groups.  The attributes read below that are not declared
    # in this function (errors_to_stdout, show_options, verbose,
    # very_verbose) have to be registered by these calls.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stderr unless the user redirected them to stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Turn the parsed namespace into the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def report(line):
        # Emit one option line on the error stream, flushed immediately.
        print(line, file=error_file, flush=True)

    # Echo the effective options for debugging and documentation.
    if args.show_options:
        report(f"input: {args.input_file_path!s}")
        report("--columns " + " ".join(args.key_column_names))
        report(f"--compact-id={args.compact_id!s}")
        report(f"--presorted={args.sorted_input!s}")
        report(f"--verify-sort={args.verify_sort!s}")
        report(f"--output-file={args.output_file_path!s}")
        report(f"--build-id={args.build_id!s}")
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        compact_id=args.compact_id,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    compactor.process()