def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        unmodified_row_file: KGTKFiles,
        matched_label_file: KGTKFiles,
        unmatched_label_file: KGTKFiles,

        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],
        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,
        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],
        default_value: str,

        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,
        clear_before_lift: bool = False,
        overwrite: bool = False,
        output_only_modified_rows: bool = False,
        languages: typing.Optional[typing.List[str]] = None,
        prioritize: bool = False,
        use_label_envar: bool = False,
        lift_all_columns: bool = False,
        require_label_file: bool = False,
        force_input_mode_none: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments, build a KgtkLift, and run it.

    The file arguments are converted to paths through KGTKArgumentParser.
    When no label file was given and `use_label_envar` is true, the
    KGTK_LABEL_FILE environment variable is consulted for one.  All
    remaining arguments are forwarded to KgtkLift unchanged.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    before processing.  `kwargs` supplies the values from which the reader
    and value option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: if `require_label_file` is true and no label file
            could be determined, or wrapping any SystemExit/Exception
            raised while building or running the lift.
    """
    # import modules locally
    import os
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(
        label_file, who="KGTK label file")
    unmodified_row_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmodified_row_file, who="KGTK unmodified row output file")
    matched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        matched_label_file, who="KGTK matched label output file")
    unmatched_label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmatched_label_file, who="KGTK unmatched label output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.  The input and label readers each get
    # their own options, falling back to the unprefixed settings.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    label_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="label", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if label_kgtk_file is not None:
            print("--label-file=%s" % label_kgtk_file, file=error_file, flush=True)
        if unmodified_row_kgtk_file is not None:
            print("--unmodified-row-output-file=%s" % unmodified_row_kgtk_file, file=error_file, flush=True)
        if matched_label_kgtk_file is not None:
            print("--matched-label-output-file=%s" % matched_label_kgtk_file, file=error_file, flush=True)
        if unmatched_label_kgtk_file is not None:
            print("--unmatched-label-output-file=%s" % unmatched_label_kgtk_file, file=error_file, flush=True)

        if input_select_column_name is not None:
            print("--input-select-column=%s" % input_select_column_name, file=error_file, flush=True)
        if input_select_column_value is not None:
            print("--input-select-value=%s" % input_select_column_value, file=error_file, flush=True)
        if input_lifting_column_names:
            print("--columns-to-lift %s" % " ".join(input_lifting_column_names), file=error_file, flush=True)
        if output_lifted_column_names:
            print("--columns-to-write %s" % " ".join(output_lifted_column_names), file=error_file, flush=True)
        print("--lift-suffix=%s" % output_lifted_column_suffix, file=error_file, flush=True)
        if output_select_column_value is not None:
            print("--update-select-value=%s" % output_select_column_value, file=error_file, flush=True)

        if label_select_column_name is not None:
            print("--label-select-column=%s" % label_select_column_name, file=error_file, flush=True)
        print("--label-select-value=%s" % label_select_column_value, file=error_file, flush=True)
        if label_match_column_name is not None:
            print("--label-match-column=%s" % label_match_column_name, file=error_file, flush=True)
        if label_value_column_name is not None:
            print("--label-value-column=%s" % label_value_column_name, file=error_file, flush=True)

        print("--default-value=%s" % repr(default_value), file=error_file, flush=True)
        print("--remove-label-records=%s" % repr(remove_label_records), file=error_file, flush=True)
        print("--sort-lifted-labels=%s" % repr(sort_lifted_labels), file=error_file, flush=True)
        print("--suppress-duplicate-labels=%s" % repr(suppress_duplicate_labels), file=error_file, flush=True)
        print("--suppress-empty-columns=%s" % repr(suppress_empty_columns), file=error_file, flush=True)
        print("--ok-if-no-labels=%s" % repr(ok_if_no_labels), file=error_file, flush=True)
        print("--prefilter-labels=%s" % repr(prefilter_labels), file=error_file, flush=True)
        print("--input-file-is-presorted=%s" % repr(input_is_presorted), file=error_file, flush=True)
        print("--label-file-is-presorted=%s" % repr(labels_are_presorted), file=error_file, flush=True)
        print("--clear-before-lift=%s" % repr(clear_before_lift), file=error_file, flush=True)
        print("--overwrite=%s" % repr(overwrite), file=error_file, flush=True)
        print("--output-only-modified-rows=%s" % repr(output_only_modified_rows), file=error_file, flush=True)
        if languages is not None:
            print("--languages %s" % " ".join(repr(lang) for lang in languages), file=error_file, flush=True)
        print("--prioritize=%s" % repr(prioritize), file=error_file, flush=True)
        print("--use-label-envar=%s" % repr(use_label_envar), file=error_file, flush=True)
        print("--lift-all-columns=%s" % repr(lift_all_columns), file=error_file, flush=True)
        print("--require-label-files=%s" % repr(require_label_file), file=error_file, flush=True)
        print("--force-input-mode-none=%s" % repr(force_input_mode_none), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        label_reader_options.show(out=error_file, who="label")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Should the following functionality be moved to KgtkLift?
    if label_kgtk_file is None and use_label_envar:
        label_file_envar: str = 'KGTK_LABEL_FILE'  # TODO: Move this to a common file.
        label_file_envar_value: typing.Optional[str] = os.getenv(label_file_envar)
        if label_file_envar_value is not None:
            label_kgtk_file = Path(label_file_envar_value)
            if verbose:
                print("Using label file %s from envar %s" % (repr(label_file_envar_value), repr(label_file_envar)),
                      file=error_file, flush=True)

    # This check is deliberately outside the try block so its message is
    # raised as-is rather than being re-wrapped.
    if require_label_file and label_kgtk_file is None:
        raise KGTKException("A label file must be specified using --label-file or KGTK_LABEL_FILE")

    try:
        kl: KgtkLift = KgtkLift(
            input_file_path=input_kgtk_file,
            label_file_path=label_kgtk_file,
            output_file_path=output_kgtk_file,
            unmodified_row_file_path=unmodified_row_kgtk_file,
            matched_label_file_path=matched_label_kgtk_file,
            unmatched_label_file_path=unmatched_label_kgtk_file,

            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,
            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,

            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,

            default_value=default_value,

            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,

            clear_before_lift=clear_before_lift,
            overwrite=overwrite,

            output_only_modified_rows=output_only_modified_rows,

            languages=languages,
            prioritize=prioritize,
            lift_all_columns=lift_all_columns,
            force_input_mode_none=force_input_mode_none,

            input_reader_options=input_reader_options,
            label_reader_options=label_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kl.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the exception type callers handle,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
        input_file: KGTKFiles,
        filter_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        matched_filter_file: KGTKFiles,
        unmatched_filter_file: KGTKFiles,
        join_file: KGTKFiles,
        input_keys: typing.Optional[typing.List[str]],
        filter_keys: typing.Optional[typing.List[str]],
        cache_input: bool = False,
        preserve_order: bool = False,
        presorted: bool = False,
        field_separator: typing.Optional[str] = None,
        left_join: bool = False,
        right_join: bool = False,
        input_prefix: typing.Optional[str] = None,
        filter_prefix: typing.Optional[str] = None,
        join_output: bool = False,
        right_first: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments, build a KgtkIfExists, and run it.

    The file arguments are converted to paths through KGTKArgumentParser.
    The input file and the filter file may not both be "-" (stdin).  When
    `field_separator` is None, KgtkIfExists.FIELD_SEPARATOR_DEFAULT is
    used.  KgtkIfExists is always constructed with `invert=False`.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    before processing.  `kwargs` supplies the values from which the reader
    and value option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: if both the input and filter files are stdin, or
            wrapping any SystemExit/Exception raised while building or
            running the filter.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifexists import KgtkIfExists
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    filter_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        filter_file, who="KGTK filter file")
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")
    matched_filter_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        matched_filter_file, who="KGTK matched filter file")
    unmatched_filter_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmatched_filter_file, who="KGTK unmatched filter file")
    join_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        join_file, who="KGTK join file")

    # Only one of the two inputs can be read from standard input.
    if str(input_kgtk_file) == "-" and str(filter_kgtk_file) == "-":
        raise KGTKException(
            "May not use stdin for both --input-file and --filter-on files.")

    if field_separator is None:
        field_separator = KgtkIfExists.FIELD_SEPARATOR_DEFAULT

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.  The input and filter readers each get
    # their own options, falling back to the unprefixed settings.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="filter", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--filter-file=%s" % str(filter_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        if matched_filter_kgtk_file is not None:
            print("--matched-filter-file=%s" % str(matched_filter_kgtk_file), file=error_file)
        if unmatched_filter_kgtk_file is not None:
            print("--unmatched-filter-file=%s" % str(unmatched_filter_kgtk_file), file=error_file)
        if join_kgtk_file is not None:
            print("--join-file=%s" % str(join_kgtk_file), file=error_file)
        if input_keys is not None:
            print("--input-keys=%s" % " ".join(input_keys), file=error_file)
        if filter_keys is not None:
            print("--filter-keys=%s" % " ".join(filter_keys), file=error_file)
        print("--cache-input=%s" % str(cache_input), file=error_file)
        print("--preserve-order=%s" % str(preserve_order), file=error_file)
        print("--presorted=%s" % str(presorted), file=error_file)
        print("--field-separator=%s" % repr(field_separator), file=error_file)
        print("--left-join=%s" % str(left_join), file=error_file)
        print("--right-join=%s" % str(right_join), file=error_file)
        if input_prefix is not None:
            print("--input-prefix=%s" % repr(input_prefix), file=error_file)
        if filter_prefix is not None:
            print("--filter-prefix=%s" % repr(filter_prefix), file=error_file)
        print("--join-output=%s" % str(join_output), file=error_file)
        print("--right-join-first=%s" % str(right_first), file=error_file)
        input_reader_options.show(out=error_file, who="input")
        filter_reader_options.show(out=error_file, who="filter")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ie: KgtkIfExists = KgtkIfExists(
            input_file_path=input_kgtk_file,
            input_keys=input_keys,
            filter_file_path=filter_kgtk_file,
            filter_keys=filter_keys,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            matched_filter_file_path=matched_filter_kgtk_file,
            unmatched_filter_file_path=unmatched_filter_kgtk_file,
            join_file_path=join_kgtk_file,
            left_join=left_join,
            right_join=right_join,
            input_prefix=input_prefix,
            filter_prefix=filter_prefix,
            join_output=join_output,
            right_first=right_first,
            invert=False,
            cache_input=cache_input,
            preserve_order=preserve_order,
            presorted=presorted,
            field_separator=field_separator,
            input_reader_options=input_reader_options,
            filter_reader_options=filter_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ie.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the exception type callers handle,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reified_file: KGTKFiles,
        unreified_file: KGTKFiles,
        uninvolved_file: KGTKFiles,
        trigger_label_value: str,
        trigger_node2_value: str,
        value_label_value: str,
        old_label_value: str,
        new_label_value: typing.Optional[str],
        allow_multiple_values: bool,
        allow_extra_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments, build a KgtkUnreifyValues, and run it.

    The file arguments are converted to paths through KGTKArgumentParser;
    the reified, unreified and uninvolved outputs are optional.  All other
    arguments are forwarded to KgtkUnreifyValues unchanged.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    before processing.  `kwargs` supplies the values from which the reader
    and value option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wrapping any SystemExit/Exception raised while
            building or running the unreifier.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.unreify.kgtkunreifyvalues import KgtkUnreifyValues
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reified_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reified_file, who="KGTK reified file")
    unreified_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unreified_file, who="KGTK unreified file")
    uninvolved_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        uninvolved_file, who="KGTK uninvolved file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        # There is exactly one input path here, so it is reported in the
        # same "--name=value" form as the other file options.
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if reified_kgtk_file is not None:
            print("--reified-file=%s" % str(reified_kgtk_file), file=error_file, flush=True)
        if unreified_kgtk_file is not None:
            print("--unreified-file=%s" % str(unreified_kgtk_file), file=error_file, flush=True)
        if uninvolved_kgtk_file is not None:
            print("--uninvolved-file=%s" % str(uninvolved_kgtk_file), file=error_file, flush=True)

        print("--trigger-label=%s" % trigger_label_value, file=error_file, flush=True)
        print("--trigger-node2=%s" % trigger_node2_value, file=error_file, flush=True)
        print("--value-label=%s" % value_label_value, file=error_file, flush=True)
        print("--old-label=%s" % old_label_value, file=error_file, flush=True)
        if new_label_value is not None:
            print("--new-label=%s" % new_label_value, file=error_file, flush=True)

        print("--allow-multiple-values=%s" % str(allow_multiple_values), file=error_file, flush=True)
        print("--allow-extra-columns=%s" % str(allow_extra_columns), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kuv: KgtkUnreifyValues = KgtkUnreifyValues(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reified_file_path=reified_kgtk_file,
            unreified_file_path=unreified_kgtk_file,
            uninvolved_file_path=uninvolved_kgtk_file,
            trigger_label_value=trigger_label_value,
            trigger_node2_value=trigger_node2_value,
            value_label_value=value_label_value,
            old_label_value=old_label_value,
            new_label_value=new_label_value,
            allow_multiple_values=allow_multiple_values,
            allow_extra_columns=allow_extra_columns,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kuv.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the exception type callers handle,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments, build a KgtkImplode, and run it.

    The file arguments are converted to paths through KGTKArgumentParser;
    the reject output is optional.  When `show_data_types` is true the
    function only lists KgtkFormat.DataType.choices() on the error stream
    and returns without processing any file.  A `without_fields` of None
    is passed to KgtkImplode as an empty list.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    first.  `kwargs` supplies the values from which the ID builder, reader
    and value option structures are built.

    Returns:
        0 when processing (or the data type listing) completes.

    Raises:
        KGTKException: wrapping any SystemExit/Exception raised while
            building or running the implode.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file, flush=True)
        print("--column %s" % column_name, file=error_file, flush=True)
        print("--prefix %s" % prefix, file=error_file, flush=True)
        print("--overwrite %s" % str(overwrite_column), file=error_file, flush=True)
        print("--validate %s" % str(validate), file=error_file, flush=True)
        print("--escape-pipes %s" % str(escape_pipes), file=error_file, flush=True)
        print("--quantities-include-numbers %s" % str(quantities_include_numbers), file=error_file, flush=True)
        print("--general-strings %s" % str(general_strings), file=error_file, flush=True)
        print("--remove-prefixed-columns %s" % str(remove_prefixed_columns), file=error_file, flush=True)
        print("--ignore-unselected-types %s" % str(ignore_unselected_types), file=error_file, flush=True)
        print("--retain-unselected-types %s" % str(retain_unselected_types), file=error_file, flush=True)
        if type_names is not None:
            print("--types %s" % " ".join(type_names), file=error_file, flush=True)
        if without_fields is not None:
            print("--without %s" % " ".join(without_fields), file=error_file, flush=True)
        print("--show-data-types %s" % str(show_data_types), file=error_file, flush=True)
        print("--quiet %s" % str(quiet), file=error_file, flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Listing the data types is a standalone request: report and stop.
    if show_data_types:
        data_type: str
        for data_type in KgtkFormat.DataType.choices():
            print("%s" % data_type, file=error_file, flush=True)
        return 0

    # KgtkImplode is handed a list even when no --without fields were given.
    wf: typing.List[str] = without_fields if without_fields is not None else list()

    try:
        ex: KgtkImplode = KgtkImplode(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=wf,
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        ex.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the exception type callers handle,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        namespace_file: KGTKFiles,
        updated_namespace_file: KGTKFiles,
        namespace_id_prefix: str,
        namespace_id_use_uuid: bool,
        namespace_id_counter: int,
        namespace_id_zfill: int,
        output_only_used_namespaces: bool,
        allow_lax_uri: bool,
        local_namespace_prefix: str,
        local_namespace_use_uuid: bool,
        prefix_expansion_label: str,
        structured_value_label: str,
        structured_uri_label: str,
        newnode_prefix: str,
        newnode_use_uuid: bool,
        newnode_counter: int,
        newnode_zfill: int,
        build_id: bool,
        escape_pipes: bool,
        validate: bool,
        override_uuid: typing.Optional[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Resolve the command-line arguments, build a KgtkNtriples, and run it.

    `input_file` is resolved to a list of input paths; the reject,
    namespace and updated-namespace files are optional.  All other
    arguments are forwarded to KgtkNtriples unchanged.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    before processing.  `kwargs` supplies the values from which the ID
    builder, reader and value option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wrapping any SystemExit/Exception raised while
            building or running the importer.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments into paths.
    input_file_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_file_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")
    namespace_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(
        namespace_file, who="KGTK namespace file")
    updated_namespace_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        updated_namespace_file, who="KGTK updated namespace file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files %s" % " ".join([str(path) for path in input_file_paths]),
              file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_file_path is not None:
            print("--reject-file=%s" % str(reject_file_path), file=error_file, flush=True)
        if namespace_kgtk_file is not None:
            print("--namespace-file=%s" % str(namespace_kgtk_file), file=error_file, flush=True)
        if updated_namespace_kgtk_file is not None:
            print("--updated-namespace-file=%s" % str(updated_namespace_kgtk_file), file=error_file, flush=True)

        print("--namespace-id-prefix %s" % namespace_id_prefix, file=error_file, flush=True)
        print("--namespace-id-use-uuid %s" % str(namespace_id_use_uuid), file=error_file, flush=True)
        print("--namespace-id-counter %s" % str(namespace_id_counter), file=error_file, flush=True)
        print("--namespace-id-zfill %s" % str(namespace_id_zfill), file=error_file, flush=True)
        print("--output-only-used-namespaces %s" % str(output_only_used_namespaces), file=error_file, flush=True)

        print("--allow-lax-uri %s" % str(allow_lax_uri), file=error_file, flush=True)

        print("--local-namespace-prefix %s" % local_namespace_prefix, file=error_file, flush=True)
        print("--local-namespace-use-uuid %s" % str(local_namespace_use_uuid), file=error_file, flush=True)

        print("--prefix-expansion-label %s" % prefix_expansion_label, file=error_file, flush=True)
        print("--structured-value-label %s" % structured_value_label, file=error_file, flush=True)
        print("--structured-uri-label %s" % structured_uri_label, file=error_file, flush=True)

        print("--newnode-prefix %s" % newnode_prefix, file=error_file, flush=True)
        print("--newnode-use-uuid %s" % str(newnode_use_uuid), file=error_file, flush=True)
        print("--newnode-counter %s" % str(newnode_counter), file=error_file, flush=True)
        print("--newnode-zfill %s" % str(newnode_zfill), file=error_file, flush=True)

        print("--build-id=%s" % str(build_id), file=error_file, flush=True)

        print("--escape-pipes=%s" % str(escape_pipes), file=error_file, flush=True)

        print("--validate=%s" % str(validate), file=error_file, flush=True)

        print("--override-uuid=%s" % str(override_uuid), file=error_file, flush=True)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kn: KgtkNtriples = KgtkNtriples(
            input_file_paths=input_file_paths,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_file_path,
            updated_namespace_file_path=updated_namespace_kgtk_file,
            namespace_file_path=namespace_kgtk_file,
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kn.process()

        return 0

    except SystemExit as e:
        # Convert an exit request into the exception type callers handle,
        # keeping the original as the explicit cause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate the rows of an input file against a file of property patterns.

    The pattern file is opened in edge mode and loaded into a
    PropertyPatterns object; the input file is then run through a
    PropertyPatternValidator.  Rows are written to the optional output
    and reject files by `ppv.process`.  When an output file is requested
    and `add_isa_column` is true, `isa_column_name` is located in (or
    appended to) the output column names and its index is handed to the
    validator.

    Diagnostics go to stdout when `errors_to_stdout` is true, otherwise
    to stderr (`errors_to_stderr` is accepted but not read here).  When
    `show_options` is true the resolved options are dumped to that stream
    before processing.  `kwargs` supplies the values from which the reader
    and value option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: wrapping any Exception raised while opening the
            files or validating the data.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the dump never lands on
    # stdout unless the caller asked for errors there.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups), file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately), file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file), file=error_file, flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file), file=error_file, flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file), file=error_file, flush=True)

    try:
        pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                          error_file=error_file,
                                          mode=KgtkReaderMode.EDGE,
                                          options=reader_options,
                                          value_options=value_options,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        pps: PropertyPatterns = PropertyPatterns.load(
            pkr,
            value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # -1 tells the validator that no ISA column is present.
        output_column_names: typing.List[str] = []
        isa_column_idx: int = -1
        # NOTE(review): the column names are only populated when an output
        # file is requested, so a reject-only run opens its writer with an
        # empty column list -- confirm whether that is intended.
        if output_kgtk_file is not None:
            output_column_names = kr.column_names.copy()
            if add_isa_column:
                if isa_column_name in output_column_names:
                    isa_column_idx = output_column_names.index(isa_column_name)
                else:
                    isa_column_idx = len(output_column_names)
                    output_column_names.append(isa_column_name)

        ppv: PropertyPatternValidator = PropertyPatternValidator.new(
            pps,
            kr,
            grouped_input=grouped_input,
            reject_node1_groups=reject_node1_groups,
            no_complaints=no_complaints,
            complain_immediately=complain_immediately,
            isa_column_idx=isa_column_idx,
            autovalidate=autovalidate,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kw: typing.Optional[KgtkWriter] = None
        rkw: typing.Optional[KgtkWriter] = None
        try:
            if output_kgtk_file is not None:
                kw = KgtkWriter.open(output_column_names,
                                     output_kgtk_file,
                                     verbose=verbose,
                                     very_verbose=very_verbose)

            if reject_kgtk_file is not None:
                rkw = KgtkWriter.open(output_column_names,
                                      reject_kgtk_file,
                                      verbose=verbose,
                                      very_verbose=very_verbose)

            ppv.process(kr, kw, rkw)

            if verbose:
                print("Read %d rows, %d valid" % (ppv.input_row_count, ppv.valid_row_count),
                      file=error_file, flush=True)
                if kw is not None:
                    print("Wrote %d good rows" % ppv.output_row_count, file=error_file, flush=True)
                if rkw is not None:
                    print("Wrote %d rejected rows" % ppv.reject_row_count, file=error_file, flush=True)
        finally:
            # Close whichever writers were opened, even when validation
            # fails part-way through.
            if kw is not None:
                kw.close()
            if rkw is not None:
                rkw.close()

        return 0

    except Exception as e:
        raise KGTKException(str(e)) from e
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],

        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Rewrite node1/label/node2 values of an edge file using a mapping file.

    The mapping file is read completely first.  Each accepted mapping row
    contributes either an item rule (applied to the input node1 and node2
    columns) or a property rule (applied to the input label column).  The
    input file is then streamed row by row; rows judged "modified" according
    to `modified_pattern` are written, rewritten, to the output file.
    Unmodified rows go to the optional unmodified-edges file and, unless
    `split_output_mode` is set, to the output file as well.

    Mapping rows whose confidence is below `confidence_threshold` are
    excluded and, when a rejected-mapping file was requested, copied there.
    Mapping rows that actually fired are written to the optional
    activated-mapping file in mapping-file order.

    Returns:
        0 on success.

    Raises:
        KGTKException: for an unparsable `default_confidence_str`, missing
            columns, errors or no usable rules in the mapping file, an
            unrecognized `modified_pattern`, or any other exception raised
            while processing (re-raised with its message).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mapping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # NOTE(review): several option names echoed below end in "-=" (for
    # example "--node1-column-="); that looks like a typo, but the real
    # command line option names are not visible here -- confirm before
    # changing the strings.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)

        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError):
            # Only conversion failures are reported as a bad option value;
            # KeyboardInterrupt and SystemExit are no longer swallowed here.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    try:
        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr: KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                          options=mapping_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The label column is consulted only in "normal" rule mode.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            mkr.close()
            raise KGTKException("Missing columns in the mapping file.")

        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            mkr.close()
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Mapping structures:
        #   *_map:      source identifier -> replacement identifier
        #   *_line_map: source identifier -> mapping file line that defined it
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()
        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]

            # Start from the default; an explicit per-row value overrides it.
            mapping_confidence: typing.Optional[float] = default_confidence_value
            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number,
                                                                                                    repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeated source is tolerated only when it maps to the
                    # same target and exact duplicates are allowed.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue

        # Close the mapping file.
        mkr.close()
        if rmkw is not None:
            rmkw.close()

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)

        ikr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                          options=input_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        # A column is required only when the rule mode will rewrite it.
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            ikr.close()
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        def note_activation(rule_line: int) -> None:
            """Remember a mapping row that fired, if activations are being recorded."""
            if amkw is not None and rule_line not in activated_mapping_rows:
                activated_mapping_rows[rule_line] = mapping_rows[rule_line]

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0

        row: typing.List[str]
        for row in ikr:
            input_count += 1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    note_activation(item_line_map[input_node1])

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    note_activation(item_line_map[input_node2])

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    note_activation(property_line_map[input_label])

            # Decide whether this edge counts as modified: "|" patterns need
            # any of the named fields rewritten, "&" patterns need all.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                # Unless the outputs are split, unmodified edges also go to
                # the main output file, in their original form.
                if not split_output_mode:
                    okw.write(row)

        # Done!
        ikr.close()
        okw.close()

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            uekw.close()

        if amkw is not None:
            # Emit the activated rules in mapping-file line order.
            activated_count: int = 0
            for rule_line in sorted(activated_mapping_rows):
                amkw.write(activated_mapping_rows[rule_line])
                activated_count += 1
            amkw.close()

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of an input file by a "subject;predicate;object" pattern.

    `pattern` must contain exactly three semicolon-separated sections; each
    section is a comma-separated set of accepted values for the subject,
    predicate and object columns respectively.  An empty section disables
    the filter for that column.  By default a row is kept only when no
    active filter rejects it; with `or_pattern` a row is kept when at least
    one active filter matches it.  `invert` swaps the kept and rejected sets.
    Kept rows are written to the output file; rejected rows are written to
    the reject file when one was requested.

    Returns:
        0 on success, 1 when an exception was handed to
        `kgtk_exception_auto_handler`.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject path must come from `reject_file`; deriving it from
    # `output_file` would aim the reject writer at the main output.
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one comma-separated pattern section into a set of non-empty, stripped targets."""
        return {target.strip() for target in pattern.split(",") if target.strip()}

    try:
        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print("Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                  file=error_file, flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])
        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        if verbose and not (apply_subj_filter or apply_pred_filter or apply_obj_filter):
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         error_file=error_file,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and apply_subj_filter:
            trouble = True
            print("Error: Cannot find the subject column '%s'." % kr.get_node1_canonical_name(subj_col), file=error_file, flush=True)
        if pred_idx < 0 and apply_pred_filter:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." % kr.get_label_canonical_name(pred_col), file=error_file, flush=True)
        if obj_idx < 0 and apply_obj_filter:
            trouble = True
            print("Error: Cannot find the object column '%s'." % kr.get_node2_canonical_name(obj_col), file=error_file, flush=True)
        if trouble:
            kr.close()
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file), file=error_file, flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # `keep` records that some active filter matched the row;
            # `reject` records that some active filter did not.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # OR mode: reject unless something matched.  AND mode: reject when
            # anything failed.  `invert` flips the decision in both modes.
            rejected: bool
            if or_pattern:
                rejected = not (keep ^ invert)
            else:
                rejected = reject ^ invert

            if rejected:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            # Statistics go to the error stream, like every other diagnostic,
            # so they cannot be mixed into data written to standard output.
            print("Read %d rows, rejected %d rows, wrote %d rows." % (input_line_count, reject_line_count, output_line_count),
                  file=error_file, flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." % (subj_filter_keep_count, pred_filter_keep_count, obj_filter_keep_count),
                  file=error_file, flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." % (subj_filter_reject_count, pred_filter_reject_count, obj_filter_reject_count),
                  file=error_file, flush=True)

        kr.close()
        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        list_output_file: KGTKFiles,
        key_column_names: typing.List[str],
        keep_first_names: typing.List[str],
        compact_id: bool,
        deduplicate: bool,
        sorted_input: bool,
        verify_sort: bool,
        lists_in_input: bool,
        report_lists: bool,
        exclude_lists: bool,
        output_only_lists: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkCompact instance from the command options and run it.

    Resolves the input, output and optional list-output paths, builds the ID
    builder, reader and value option structures from `kwargs`, optionally
    echoes every option to the error stream, and then delegates all of the
    work to `KgtkCompact.process()`.

    Returns:
        0 on success.

    Raises:
        KGTKException: when both `exclude_lists` and `output_only_lists` are
            set, or when processing raises (the original message is kept).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    list_output_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        list_output_file, who="KGTK list output file")

    # Diagnostics go to stdout only on request; stderr is the default.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        def echo(option: str, value: typing.Any, flush: bool = False) -> None:
            """Print one "--option=value" line to the error stream."""
            print("--%s=%s" % (option, str(value)), file=error_file, flush=flush)

        echo("input-file", input_kgtk_file)
        echo("output-file", output_kgtk_file)
        if list_output_kgtk_file is not None:
            echo("list-output-file", list_output_kgtk_file, flush=True)
        echo("columns", " ".join(key_column_names))
        echo("keep-first", " ".join(keep_first_names))
        echo("compact-id", compact_id, flush=True)
        echo("deduplicate", deduplicate, flush=True)
        echo("presorted", sorted_input, flush=True)
        echo("verify-sort", verify_sort, flush=True)
        echo("lists-in-input", lists_in_input, flush=True)
        echo("report-lists", report_lists, flush=True)
        echo("exclude-lists", exclude_lists, flush=True)
        echo("output-only-lists", output_only_lists, flush=True)
        echo("build-id", build_id, flush=True)
        for option_group in (idbuilder_options, reader_options, value_options):
            option_group.show(out=error_file)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=error_file)
        print("=======", file=error_file, flush=True)

    # The two list-handling switches contradict each other.
    if output_only_lists and exclude_lists:
        raise KGTKException("--exclude-lists and --output-only-lists may not be used together.")

    try:
        compactor: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            list_output_file_path=list_output_kgtk_file,
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        compactor.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as problem:
        raise KGTKException(str(problem))
    else:
        return 0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        filter_column_names: typing.List[str],
        all_are: bool = False,
        only_count: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkIfEmpty instance in "not empty" mode and run it.

    Resolves the input, output and optional reject paths, builds the reader
    and value option structures from `kwargs`, optionally echoes the options
    to the error stream, and then delegates to `KgtkIfEmpty.process()` with
    `notempty=True`.

    Returns:
        0 on success.

    Raises:
        KGTKException: when processing raises (the original message is kept)
            or requests an exit.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifempty import KgtkIfEmpty
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")

    # Diagnostics go to stdout only on request; stderr is the default.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        def echo(option: str, value: typing.Any) -> None:
            """Print one "--option=value" line to the error stream."""
            print("--%s=%s" % (option, str(value)), file=error_file)

        echo("input-file", input_kgtk_file)
        echo("output-file", output_kgtk_file)
        if reject_kgtk_file is not None:
            echo("reject-file", reject_kgtk_file)
        echo("columns", " ".join(filter_column_names))
        echo("count", only_count)
        echo("all", all_are)
        for option_group in (reader_options, value_options):
            option_group.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        row_filter: KgtkIfEmpty = KgtkIfEmpty(
            input_file_path=input_kgtk_file,
            filter_column_names=filter_column_names,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            all_are=all_are,
            notempty=True,
            only_count=only_count,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        row_filter.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as problem:
        raise KGTKException(str(problem))
    else:
        return 0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file into a graph and optionally save the graph.

    The input file is opened with a KgtkReader, the node1, label and node2
    columns are located, and the graph is built by `load_graph_from_kgtk`
    using the node1 and node2 columns as the edge endpoints.  In verbose mode
    the graph size and the relations returned by
    `gtanalysis.get_topN_relations` are reported on the error stream.  When
    an output file was given, the graph is saved there.

    Raises:
        KGTKException: when a required column is missing or any other
            exception occurs; the message is prefixed with "Error: ".
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    try:
        # Diagnostics go to stdout only on request; stderr is the default.
        if errors_to_stdout:
            error_file: typing.TextIO = sys.stdout
        else:
            error_file = sys.stderr

        def report(message: str) -> None:
            """Write one flushed diagnostic line to the error stream."""
            print(message, file=error_file, flush=True)

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_gt_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(output_file)

        if verbose:
            report('loading the KGTK input file...\n')
        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        # Locate the three edge columns, complaining about each missing one
        # before giving up so that the user sees every problem at once.
        sub: int = kr.get_node1_column_index()
        if sub < 0:
            report("Missing node1 (subject) column.")
        pred: int = kr.get_label_column_index()
        if pred < 0:
            report("Missing label (predicate) column.")
        obj: int = kr.get_node2_column_index()
        if obj < 0:
            report("Missing node2 (object) column")
        if min(sub, pred, obj) < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        graph = load_graph_from_kgtk(kr, directed=not undirected, ecols=(sub, obj), verbose=verbose, out=error_file)

        if verbose:
            report('graph loaded! It has %d nodes and %d edges.' % (graph.num_vertices(), graph.num_edges()))
            report('\n###Top relations:')
            top_relations = gtanalysis.get_topN_relations(graph, pred_property=kr.column_names[pred])
            for rel, freq in top_relations:
                report('%s\t%d' % (rel, freq))

        if output_gt_file is None:
            # Nothing to save.
            return

        if verbose:
            report('\nNow saving the graph to %s' % str(output_gt_file))
        graph.save(str(output_gt_file))
        if verbose:
            report('Done saving the graph.')
    except Exception as problem:
        raise KGTKException('Error: ' + str(problem))
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn selected columns of a KGTK file into new edges.

    Each selected column is dropped from the output header.  For every input
    row, each non-empty value in a selected column (split as a KGTK list)
    becomes a new edge whose node1 is the row's value in the paired base
    column and whose label is the label chosen for that column.  New edges go
    to the new-edges file when one is given, otherwise to the main output.

    The columns are chosen by one of three patterns:

    1. ``columns_to_lower`` and ``base_columns`` both supplied: they are
       paired positionally.
    2. only ``columns_to_lower`` supplied: each name is split once at
       ``lift_separator`` into base name and label (when ``lower``), or is
       attached to the id column under its own name (when ``normalize``).
    3. ``columns_to_lower`` not supplied: every non-key column of the input
       is examined with the same rules as pattern 2, and columns that do not
       qualify are skipped instead of raising.

    Returns:
        0 on success; 1 after ``kgtk_exception_auto_handler`` has processed an
        exception raised while processing.

    Raises:
        KGTKException: Directly, when neither ``lower`` nor ``normalize`` is
            requested.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(new_edges_file, who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    def report(text: str) -> None:
        # Progress/diagnostic line: always flushed, always to error_file.
        print(text, file=error_file, flush=True)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        option_lines: typing.List[str] = [
            "--input-file=%s" % str(input_kgtk_file),
            "--output-file=%s" % str(output_kgtk_file),
        ]
        if new_edges_kgtk_file is not None:
            option_lines.append("--label-file=%s" % str(new_edges_kgtk_file))
        if base_columns is not None:
            option_lines.append("--base-columns=%s" % " ".join(base_columns))
        if columns_to_lower is not None:
            option_lines.append("--columns-to-lower=%s" % " ".join(columns_to_lower))
        option_lines.append("--label-value=%s" % label_value)
        option_lines.append("--lift-separator=%s" % lift_separator)
        option_lines.append("--lower=%s" % lower)
        option_lines.append("--normalize=%s" % normalize)
        option_lines.append("--deduplicate-labels=%s" % deduplicate_new_edges)
        for option_line in option_lines:
            print(option_line, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not (lower or normalize):
        raise KGTKException("One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            report("Opening the input file: %s" % str(input_kgtk_file))
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        # Collect the key columns that are actually present.  The reader
        # attribute holding each index is only read when the name is present.
        key_column_names: typing.List[str] = []
        key_column_idxs: typing.Set[int] = set()
        key_candidates = (
            ("Node1 column name: %s", node1_column_name, "node1_column_idx"),
            ("Label column name: %s", label_column_name, "label_column_idx"),
            ("Node2 column name: %s", node2_column_name, "node2_column_idx"),
            ("Id column name: %s", id_column_name, "id_column_idx"),
        )
        for announcement, key_name, idx_attr in key_candidates:
            if key_name == "":
                continue
            if verbose:
                report(announcement % key_name)
            key_column_names.append(key_name)
            key_column_idxs.add(getattr(kr, idx_attr))
        if id_column_name == "" and normalize:
            raise KGTKException("--normalize was requested but the ID column was not found.")

        def check_removable(candidate: str) -> None:
            # A column named explicitly must exist and must not be a key column.
            if candidate not in kr.column_names:
                raise KGTKException("Column %s is an unknown column, cannot remove it." % repr(candidate))
            if candidate in key_column_names:
                raise KGTKException("Column %s is a key column, cannot remove it." % repr(candidate))

        # Map the index of a column being removed to (index of the base column
        # that supplies its node1 value, label for the new edges).
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = {}

        base_name: str
        new_label_value: str
        column_name: str
        idx: int

        have_columns: bool = columns_to_lower is not None and len(columns_to_lower) > 0
        have_bases: bool = base_columns is not None and len(base_columns) > 0

        # There are three option patterns.
        if have_columns and have_bases:
            # Pattern 1: columns_to_lower and base_columns are paired.
            # New records use label_value unless normalizing against the id column.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException("There are %d columns to remove but only %d base columns."
                                    % (len(columns_to_lower), len(base_columns)))
            if len(label_value) == 0:
                raise KGTKException("The --label-value must not be empty.")

            for column_name, base_name in zip(columns_to_lower, base_columns):
                check_removable(column_name)
                if base_name not in kr.column_names:
                    raise KGTKException("For column name %s, base name %s is unknown"
                                        % (repr(column_name), repr(base_name)))
                if normalize and base_name == id_column_name:
                    new_label_value = column_name
                elif lower:
                    new_label_value = label_value
                else:
                    raise KGTKException("--lower is not enabled for column %s, base name %s"
                                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = (kr.column_name_map[base_name], new_label_value)

        elif have_columns:
            # Pattern 2: only the columns to lower were given.  Each column
            # name is split at the lift separator to determine the base name
            # and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for column_name in columns_to_lower:
                check_removable(column_name)
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException("For column name %s, base name %s is not known"
                                            % (repr(column_name), repr(base_name)))
                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name
                else:
                    raise KGTKException("Unable to parse column name %s, no separator (%s)."
                                        % (repr(column_name), repr(lift_separator)))
                lower_map[kr.column_name_map[column_name]] = (kr.column_name_map[base_name], new_label_value)

        else:
            # Pattern 3: no columns to lower were given.  Any column that
            # matches a lift pattern against one of the base columns will be
            # lowered; the rest may be normalized.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if have_bases:
                if verbose:
                    report("Using these base columns: %s" % " ".join(base_columns))
            else:
                # The base name list wasn't supplied.  Use the key columns found above.
                base_columns = list(key_column_names)
                if verbose:
                    report("Using the default base columns: %s" % " ".join(base_columns))

            for idx, column_name in enumerate(kr.column_names):
                # Skip the key columns.
                if idx in key_column_idxs:
                    if verbose:
                        report("column %s is a key column, skipping." % repr(column_name))
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(lift_separator, 1)
                    if base_name not in base_columns:
                        if verbose:
                            report("Column %s contains base name %s, which is not a base column."
                                   % (repr(column_name), repr(base_name)))
                        continue
                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name
                else:
                    if verbose:
                        report("Column %s does not contain the separator %s and not normalizing, skipping."
                               % (repr(column_name), repr(lift_separator)))
                    continue

                # This test should be redundant.
                if base_name not in kr.column_names:
                    raise KGTKException("Base name %s was unexpectedly not found." % repr(base_name))
                lower_map[idx] = (kr.column_name_map[base_name], new_label_value)

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            report("The following columns will be lowered or normalized")
            for idx in sorted(lower_map.keys()):
                base_idx, new_label_value = lower_map[idx]
                report(" %s from %s (label %s)"
                       % (kr.column_names[idx], kr.column_names[base_idx], repr(new_label_value)))

        # The output keeps every column that is not being lowered.
        output_column_names: typing.List[str] = [
            kept_name for kept_idx, kept_name in enumerate(kr.column_names) if kept_idx not in lower_map
        ]
        if verbose:
            report("The output columns are: %s" % " ".join(output_column_names))

        if verbose:
            report("Opening the output file: %s" % str(output_kgtk_file))
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=False,  # Simplifies writing the labels
                                         verbose=verbose,
                                         very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                report("Opening the label output file: %s" % str(new_edges_kgtk_file))
            label_column_names = [node1_column_name, label_column_name, node2_column_name]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Keys of the new edges already written (used only when deduplicating).
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            for column_idx, (node1_idx, new_label_value) in lower_map.items():
                node1_value: str = row[node1_idx]
                if len(node1_value) == 0:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if len(item) == 0:
                    continue  # Ignore empty node2 values.

                # The item might be a KGTK list.  Split it, because lists
                # aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        continue  # Ignore empty node2 values.

                    if deduplicate_new_edges:
                        label_key = KgtkFormat.KEY_FIELD_SEPARATOR.join(
                            (node1_value, new_label_value, node2_value))
                        if label_key in label_set:
                            continue
                        label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        node1_column_name: node1_value,
                        label_column_name: new_label_value,
                        node2_column_name: node2_value,
                    }
                    if lkw is None:
                        # No separate file: the new edge joins the main output.
                        kw.writemap(output_map)
                        label_line_count += 1
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)
                        label_line_count += 1

        if verbose:
            report("Read %d rows, wrote %d rows with %d labels."
                   % (input_line_count, output_line_count, label_line_count))

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_file: KGTKFiles, output_file: KGTKFiles, directed, log_file):
    """Load a tab-separated edge file into a graph-tool graph.

    The header line is inspected to find the subject column (``node1`` or
    ``subject``), the object column (``node2``, ``object`` or ``value``) and
    the predicate column (``relation``, ``predicate``, ``label`` or
    ``relationship``).  Every other header column is passed to
    ``load_graph_from_csv`` as an edge property name.  Progress and the top
    relations are written to ``log_file``; the graph is saved when an output
    file was given.

    Args:
        input_file: The input file argument.
        output_file: Optional output file for the saved graph.
        directed: Passed to ``load_graph_from_csv`` as ``directed``.
        log_file: Path of the log file, opened for writing.

    Raises:
        KGTKException: On any failure, with the message prefixed by
            ``'Error: '`` and the original exception attached as the cause.
    """
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=()):
        # Index in header h of the first option that is present, else -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        # The first option present in header h, else the empty string.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from pathlib import Path
        import sys
        import typing
        from graph_tool import load_graph_from_csv
        # NOTE(review): centrality is not referenced below; the import is kept
        # in case loading the submodule matters to graph_tool -- confirm.
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        import csv
        csv.field_size_limit(sys.maxsize)

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        output: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(output_file)

        with open(filename, 'r') as f:
            # Strip the line terminator before splitting: otherwise the last
            # column name carries a trailing newline and can never match.
            header = next(f, '').rstrip('\r\n').split('\t')

        subj_index = infer_index(header, options=['node1', 'subject'])
        obj_index = infer_index(header, options=['node2', 'object', 'value'])
        predicate = infer_predicate(header, options=['relation', 'predicate', 'label', 'relationship'])

        # A -1 index would be passed on as an edge column instead of failing.
        if subj_index < 0:
            raise KGTKException('Cannot find a subject column (node1 or subject) in the header.')
        if obj_index < 0:
            raise KGTKException('Cannot find an object column (node2, object or value) in the header.')

        # Every column other than the two endpoints becomes an edge property.
        p = [header_col for i, header_col in enumerate(header) if i not in (subj_index, obj_index)]

        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(str(filename),
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})

            writer.write('graph loaded! It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if output:
                writer.write('now saving the graph to %s\n' % str(output))
                G2.save(str(output))
    except Exception as e:
        # Keep the original exception as __cause__ so the traceback is not lost.
        raise KGTKException('Error: ' + str(e)) from e
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    reject_file: KGTKFiles,
    pattern: str,
    subj_col: typing.Optional[str],
    pred_col: typing.Optional[str],
    obj_col: typing.Optional[str],
    or_pattern: bool,
    invert: bool,
    show_version: bool,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of a KGTK file by subject, predicate and object.

    ``pattern`` has three sections separated by semicolons
    (``subjects;predicates;objects``); each section is a comma-separated list
    of accepted values and may be empty.  Rows that pass are written to the
    output file; the others are written to the reject file when one is given.

    With ``or_pattern`` a row passes when any non-empty section matches;
    otherwise every non-empty section must match.  ``invert`` swaps the
    passing and rejected rows.

    All diagnostics, including the verbose row-count summaries, are written
    to the error file so that they cannot mix with data sent to stdout.

    Returns:
        0 on success; 1 after ``kgtk_exception_auto_handler`` has processed an
        exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    UPDATE_VERSION: str = "2020-08-06T17:06:06.829542+00:00#Mu9vz3KEPh+beQeSwZ8qGMKrTJzHWZFfZFXY6UrYXJAnNpPSin+5NvkSfxKLMkyJtGyeavgGAz8+74bup7eYaQ=="
    if show_version or verbose:
        print("kgtk filter version: %s" % UPDATE_VERSION, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        # Split one pattern section on commas, dropping blanks and whitespace.
        return {target.strip() for target in pattern.split(",") if len(target.strip()) > 0}

    def report_counts(input_line_count: int, reject_line_count: int, output_line_count: int):
        # Summaries go to error_file, never stdout, which may carry the data.
        print("Read %d rows, rejected %d rows, wrote %d rows."
              % (input_line_count, reject_line_count, output_line_count),
              file=error_file, flush=True)

    def single_value_filter(kr: KgtkReader,
                            kw: KgtkWriter,
                            rw: typing.Optional[KgtkWriter],
                            idx: int,
                            values: typing.Set[str],
                            announcement: str,
                            inverted: bool):
        # Fast path for a pattern holding exactly one value in one section:
        # a single string comparison per row.
        if verbose:
            print(announcement, file=error_file, flush=True)

        target: str = next(iter(values))

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            # Keep matching rows normally, non-matching rows when inverted.
            if (row[idx] == target) != inverted:
                kw.write(row)
                output_line_count += 1
            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            report_counts(input_line_count, reject_line_count, output_line_count)

    def general_filter(kr: KgtkReader,
                       kw: KgtkWriter,
                       rw: typing.Optional[KgtkWriter],
                       subj_idx: int,
                       subj_filter: typing.Set[str],
                       pred_idx: int,
                       pred_filter: typing.Set[str],
                       obj_idx: int,
                       obj_filter: typing.Set[str]):
        # General path: any combination of subject/predicate/object filters.
        if verbose:
            print("Applying a general filter", file=error_file, flush=True)

        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # keep: at least one active filter matched.
            # reject: at least one active filter failed.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # "or" mode rejects unless something matched; "and" mode rejects
            # when anything failed.  invert flips either decision.
            if or_pattern:
                rejected = not (keep ^ invert)
            else:
                rejected = reject ^ invert

            if rejected:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            report_counts(input_line_count, reject_line_count, output_line_count)
            print("Keep counts: subject=%d, predicate=%d, object=%d."
                  % (subj_filter_keep_count, pred_filter_keep_count, obj_filter_keep_count),
                  file=error_file, flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d."
                  % (subj_filter_reject_count, pred_filter_reject_count, obj_filter_reject_count),
                  file=error_file, flush=True)

    try:
        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print("Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                  file=error_file, flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])

        if verbose and len(subj_filter) == 0 and len(pred_filter) == 0 and len(obj_filter) == 0:
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and len(subj_filter) > 0:
            trouble = True
            print("Error: Cannot find the subject column '%s'." % kr.get_node1_canonical_name(subj_col),
                  file=error_file, flush=True)
        if pred_idx < 0 and len(pred_filter) > 0:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." % kr.get_label_canonical_name(pred_col),
                  file=error_file, flush=True)
        if obj_idx < 0 and len(obj_filter) > 0:
            trouble = True
            print("Error: Cannot find the object column '%s'." % kr.get_node2_canonical_name(obj_col),
                  file=error_file, flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file), file=error_file, flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        if len(subj_filter) == 0 and len(pred_filter) == 1 and len(obj_filter) == 0:
            single_value_filter(
                kr, kw, rw, pred_idx, pred_filter,
                "Applying a single predicate filter inverted" if invert else "Applying a single predicate filter",
                invert)
        elif len(subj_filter) == 0 and len(pred_filter) == 0 and len(obj_filter) == 1:
            single_value_filter(
                kr, kw, rw, obj_idx, obj_filter,
                "Applying a single object filter inverted" if invert else "Applying a single object filter",
                invert)
        else:
            general_filter(kr, kw, rw, subj_idx, subj_filter, pred_idx, pred_filter, obj_idx, obj_filter)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    reject_file: KGTKFiles,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = False,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file, keeping only the rows the reader delivers.

    Every row yielded by ``KgtkReader`` is written unchanged to the output
    file.  When a reject file is given it is handed to the reader as its
    ``reject_file``; a reject path of ``-`` selects stdout, as the verbose
    message announces.

    Returns:
        0 on success.

    Raises:
        KGTKException: Wrapping any exception raised while copying, with the
            original exception attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file_path: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file_path: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(reject_file, who="Reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file_path), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file_path), file=error_file)
        if reject_kgtk_file_path is not None:
            print("--reject-file=%s" % str(reject_kgtk_file_path), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    reject_to_stdout: bool = reject_kgtk_file_path is not None and str(reject_kgtk_file_path) == "-"

    if verbose:
        if str(input_kgtk_file_path) == "-":
            print("Cleaning data from stdin", file=error_file, flush=True)
        else:
            print("Cleaning data from '%s'" % str(input_kgtk_file_path), file=error_file, flush=True)
        if str(output_kgtk_file_path) == "-":
            print("Writing data to stdout", file=error_file, flush=True)
        else:
            print("Writing data to '%s'" % str(output_kgtk_file_path), file=error_file, flush=True)
        # Say nothing about rejects when no reject file was requested.
        if reject_to_stdout:
            print("Writing reject data to stdout", file=error_file, flush=True)
        elif reject_kgtk_file_path is not None:
            print("Writing reject data to '%s'" % str(reject_kgtk_file_path), file=error_file, flush=True)

    reject_kgtk_file: typing.Optional[typing.TextIO] = None
    owns_reject_file: bool = False  # True only for a file opened here.
    if reject_to_stdout:
        # "-" means stdout; do not create a file literally named "-".
        reject_kgtk_file = sys.stdout
    elif reject_kgtk_file_path is not None:
        reject_kgtk_file = open(reject_kgtk_file_path, mode="wt")
        owns_reject_file = True

    try:
        kr: KgtkReader = KgtkReader.open(input_kgtk_file_path,
                                         error_file=error_file,
                                         reject_file=reject_kgtk_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file_path,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        line_count: int = 0
        row: typing.List[str]
        for row in kr:
            kw.write(row)
            line_count += 1

        kw.close()

        if verbose:
            print("Copied %d clean data lines" % line_count, file=error_file, flush=True)
        return 0

    except Exception as e:
        # Keep the original exception as __cause__ so the traceback is not lost.
        raise KGTKException(e) from e

    finally:
        # Close the reject file on every path, but never close stdout.
        if owns_reject_file and reject_kgtk_file is not None:
            reject_kgtk_file.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_remove: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX,
        deduplicate_labels: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn lifted columns of a KGTK file back into label edges.

    Each selected column is removed from the output header.  For every input
    row, each non-empty value in a selected column (split as a KGTK list)
    becomes an edge ``(base column value, label_value, value)``.  These edges
    are written to the label file when one is given, otherwise to the main
    output.

    The columns are chosen by one of three patterns:

    1. ``columns_to_remove`` and ``base_columns`` both supplied: they are
       paired positionally.
    2. only ``columns_to_remove`` supplied: the base name is the column name
       with ``lift_suffix`` stripped from the end.
    3. ``columns_to_remove`` not supplied: every non-key column named
       ``<base><lift_suffix>`` for one of the base columns (by default the
       key columns present in the input) is selected.

    Returns:
        0 on success; 1 after ``kgtk_exception_auto_handler`` has processed an
        exception raised while processing.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(label_file, who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    def report(text: str) -> None:
        # Progress/diagnostic line: always flushed, always to error_file.
        print(text, file=error_file, flush=True)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        option_lines: typing.List[str] = [
            "--input-file=%s" % str(input_kgtk_file),
            "--output-file=%s" % str(output_kgtk_file),
        ]
        if label_kgtk_file is not None:
            option_lines.append("--label-file=%s" % str(label_kgtk_file))
        if base_columns is not None:
            option_lines.append("--base-columns=%s" % " ".join(base_columns))
        if columns_to_remove is not None:
            option_lines.append("--columns-to-lower=%s" % " ".join(columns_to_remove))
        option_lines.append("--label-value=%s" % label_value)
        option_lines.append("--lift-suffix=%s" % lift_suffix)
        option_lines.append("--deduplicate-labels=%s" % deduplicate_labels)
        for option_line in option_lines:
            print(option_line, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            report("Opening the input file: %s" % str(input_kgtk_file))
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to the index of the base
        # column that supplies its node1 value.
        lower_map: typing.MutableMapping[int, int] = {}

        # These columns will never be removed (-1 marks an absent column):
        key_column_idxs: typing.Set[int] = {
            kr.node1_column_idx,
            kr.label_column_idx,
            kr.node2_column_idx,
            kr.id_column_idx,
        }
        key_column_idxs.discard(-1)
        key_column_names: typing.Set[str] = {kr.column_names[key_idx] for key_idx in key_column_idxs}

        def check_removable(candidate: str) -> None:
            # A column named explicitly must exist and must not be a key column.
            if candidate not in kr.column_names:
                raise KGTKException("Column %s is an unknown column, cannot remove it." % repr(candidate))
            if candidate in key_column_names:
                raise KGTKException("Column %s is a key column, cannot remove it." % repr(candidate))

        base_name: str
        column_name: str
        idx: int

        have_columns: bool = columns_to_remove is not None and len(columns_to_remove) > 0
        have_bases: bool = base_columns is not None and len(base_columns) > 0

        # There are three option patterns.
        if have_columns and have_bases:
            # Pattern 1: columns_to_remove and base_columns are paired.
            if len(columns_to_remove) != len(base_columns):
                raise KGTKException("There are %d columns to remove but only %d base columns."
                                    % (len(columns_to_remove), len(base_columns)))

            for column_name, base_name in zip(columns_to_remove, base_columns):
                check_removable(column_name)
                if base_name not in kr.column_names:
                    raise KGTKException("For column name %s, base name %s is unknown"
                                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = kr.column_name_map[base_name]

        elif have_columns:
            # Pattern 2: only the columns to remove were given.  Each column
            # name is stripped of the lift suffix to determine the base name.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            for column_name in columns_to_remove:
                check_removable(column_name)
                if not column_name.endswith(lift_suffix):
                    raise KGTKException("Unable to parse column name %s." % repr(column_name))
                base_name = column_name[:-len(lift_suffix)]
                if base_name not in kr.column_names:
                    raise KGTKException("For column name %s, base name %s is not known"
                                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = kr.column_name_map[base_name]

        else:
            # Pattern 3: no columns to remove were given.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            if not have_bases:
                # The base name list wasn't supplied.  Use the key columns found above.
                base_columns = list(key_column_names)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the key columns.
                if idx in key_column_idxs:
                    continue

                # Does this column match a lifting pattern?
                for base_name in base_columns:
                    if len(base_name) == 0:
                        continue
                    if column_name == base_name + lift_suffix:
                        lower_map[idx] = kr.column_name_map[base_name]

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower.")

        if verbose:
            report("The following columns will be lowered")
            for idx in sorted(lower_map.keys()):
                report(" %s from %s" % (kr.column_names[idx], kr.column_names[lower_map[idx]]))

        # The output keeps every column that is not being lowered.
        output_column_names: typing.List[str] = [
            kept_name for kept_idx, kept_name in enumerate(kr.column_names) if kept_idx not in lower_map
        ]
        if verbose:
            report("The output columns are: %s" % " ".join(output_column_names))

        if verbose:
            report("Opening the output file: %s" % str(output_kgtk_file))
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=False,  # Simplifies writing the labels
                                         verbose=verbose,
                                         very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if label_kgtk_file is not None:
            if verbose:
                report("Opening the label output file: %s" % str(label_kgtk_file))
            label_column_names = [KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2]
            lkw = KgtkWriter.open(label_column_names,
                                  label_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Keys (node1 value + separator + node2 value) of the label edges
        # already seen; used only when deduplicating.
        label_set: typing.Set[str] = set()
        label_key: str

        # If labels will be written to the output file and deduplication is
        # enabled, label rows already present in the input also take part.
        check_existing_labels: bool = (
            deduplicate_labels
            and lkw is None
            and kr.node1_column_idx >= 0
            and kr.label_column_idx >= 0
            and kr.node2_column_idx >= 0
        )

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if check_existing_labels and row[kr.label_column_idx] == label_value:
                label_key = row[kr.node1_column_idx] + KgtkFormat.COLUMN_SEPARATOR + row[kr.node2_column_idx]
                if label_key in label_set:
                    continue  # A duplicate input label row is dropped entirely.
                label_set.add(label_key)

            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            for column_idx, base_idx in lower_map.items():
                node1_value: str = row[base_idx]
                if len(node1_value) == 0:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if len(item) == 0:
                    continue  # Ignore empty node2 values.

                # The item might be a KGTK list.  Split it, because lists
                # aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        continue  # Ignore empty node2 values.

                    if deduplicate_labels:
                        label_key = node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        KgtkFormat.NODE1: node1_value,
                        KgtkFormat.LABEL: label_value,
                        KgtkFormat.NODE2: node2_value,
                    }
                    if lkw is None:
                        # No separate label file: the label joins the main output.
                        kw.writemap(output_map)
                        label_line_count += 1
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)
                        label_line_count += 1

        if verbose:
            report("Read %d rows, wrote %d rows with %d labels."
                   % (input_line_count, output_line_count, label_line_count))

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1