예제 #1
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Read a KGTK file, run it through the ID builder, and write the result.

    Args:
        input_file: Parsed input file argument, resolved through
            ``KGTKArgumentParser.get_input_file``.
        output_file: Parsed output file argument, resolved through
            ``KGTKArgumentParser.get_output_file``.
        errors_to_stdout: When true, diagnostics go to stdout; otherwise they
            go to stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not consult it (stderr is used whenever ``errors_to_stdout``
            is false).
        show_options: When true, print the resolved options to the error file
            before processing.
        verbose: Passed through to the reader and writer.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source for the ID builder, reader and value option
            structures (each is built with ``from_dict(kwargs)``).

    Returns:
        0 on success.

    Raises:
        KGTKException: Wraps any ``SystemExit`` or ``Exception`` raised while
            opening, processing or closing the files.  The original exception
            is attached as ``__cause__``.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(
                idb.column_names,
                output_kgtk_file,
                mode=KgtkWriter.Mode[kr.mode.name],
                require_all_columns=True,
                prohibit_extra_columns=True,
                fill_missing_columns=False,
                gzip_in_parallel=False,
                verbose=verbose,
                very_verbose=very_verbose)
            try:
                # Process the input file, building IDs.
                idb.process(kr, ew)
            finally:
                # Close the writer even when processing fails, so the output
                # handle is not leaked.
                ew.close()
        finally:
            # Close the reader after the writer, matching the success-path
            # order, and also when an earlier step raised.
            kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise KGTKException(str(e)) from e
예제 #2
0
    def process(self):
        """Read the input KGTK file, group rows by key, and write the results.

        Steps, in order:

        1. Open the input with ``KgtkReader`` and remember its ID column index.
        2. Optionally create a ``KgtkIdBuilder``; it then supplies the output
           column names.
        3. Work out the key column indexes, either from the required columns
           of an edge/node file or from ``self.key_column_names``.
        4. Resolve ``self.keep_first_names`` into ``self.keep_first_idx_list``.
        5. When ``self.deduplicate`` is set, extend the key with every column
           that is not a keep-first column.
        6. Open the output writer and the optional list-output writer.
        7. Feed every row to ``self.process_row`` in key order, then make one
           final call with ``flush=True``.

        Raises:
            ValueError: If ID building is requested without options, if the
                file is neither an edge nor a node file and no key columns
                were given, if a key or keep-first column name is unknown, if
                a keep-first column is also a key column, or if
                ``self.verify_sort`` detects an out-of-order key.

        NOTE(review): the reader ``kr`` is never closed in this method, and
        the writers are closed only on the success path.
        """

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        # Cache the ID column index on the instance (presumably for use by
        # process_row; that method is not visible here).
        self.id_column_idx = kr.id_column_idx

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError(
                    "ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column indexes:
        key_idx_list: typing.List[int] = []

        if len(self.key_column_names) == 0:
            # No explicit key columns: fall back to the file type's required
            # columns.
            if kr.is_edge_file:
                # Add the KGTK edge file required columns.
                key_idx_list.append(kr.node1_column_idx)
                key_idx_list.append(kr.label_column_idx)
                key_idx_list.append(kr.node2_column_idx)
                # The id column joins the key only when IDs are not being
                # compacted and the file actually has an id column.
                if not self.compact_id and kr.id_column_idx >= 0:
                    key_idx_list.append(kr.id_column_idx)

            elif kr.is_node_file:
                # Add the KGTK node file required column:
                key_idx_list.append(kr.id_column_idx)

            else:
                raise ValueError(
                    "The input file is neither an edge nor a node file.  Key columns must be supplied."
                )

        else:
            # Append columns to the list of key column indices,
            # silently removing duplicates, but complaining about unknown names.
            #
            # TODO: warn about duplicates?
            column_name: str
            for column_name in self.key_column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" %
                                     (repr(column_name)))
                key_idx: int = kr.column_name_map[column_name]
                if key_idx not in key_idx_list:
                    key_idx_list.append(key_idx)

        if self.verbose:
            print("key indexes: %s" %
                  " ".join([str(idx) for idx in key_idx_list]),
                  file=self.error_file,
                  flush=True)

        # Resolve the keep-first column names into indexes.  The list lives on
        # the instance and is rebuilt from scratch on every call.
        self.keep_first_idx_list.clear()
        if len(self.keep_first_names) > 0:
            keep_first_name: str
            for keep_first_name in self.keep_first_names:
                if keep_first_name not in kr.column_name_map:
                    raise ValueError(
                        "Keep first column %s is not in the input file" %
                        (repr(keep_first_name)))
                keep_first_idx: int = kr.column_name_map[keep_first_name]
                # A column cannot be both part of the key and a keep-first
                # column.
                if keep_first_idx in key_idx_list:
                    raise ValueError(
                        "Keep first column %s may not be a key column" %
                        (repr(keep_first_name)))
                self.keep_first_idx_list.append(keep_first_idx)
            if self.verbose:
                print("keep first indexes: %s" %
                      " ".join([str(idx) for idx in self.keep_first_idx_list]),
                      file=self.error_file,
                      flush=True)

        if self.deduplicate:
            # When compacting IDs, the id column (if present) is treated as a
            # keep-first column so that it does not become part of the key.
            if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
                self.keep_first_idx_list.append(kr.id_column_idx)

            # Any columns that aren't in the keep_first list and aren't
            # already in key_idx_list will be appended to key_idx_list:
            idx: int
            for idx in range(kr.column_count):
                if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                    key_idx_list.append(idx)

            if self.verbose:
                print("revised key indexes: %s" %
                      " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

        # NOTE(review): this prints the key indexes a second time under the
        # same "key indexes:" label as the report above.
        if self.verbose:
            key_idx_list_str: typing.List[str] = []
            for key_idx in key_idx_list:
                key_idx_list_str.append(str(key_idx))
            print("key indexes: %s" % " ".join(key_idx_list_str),
                  file=self.error_file,
                  flush=True)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(
            output_column_names,
            self.output_file_path,
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack! Borrows the reader's setting.
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack! Borrows the reader's setting.
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

        # Open the optional list output file.  It uses the same column names
        # and writer settings as the main output file.
        lew: typing.Optional[KgtkWriter] = None
        if self.list_output_file_path is not None:
            lew = KgtkWriter.open(
                output_column_names,
                self.list_output_file_path,
                mode=kr.mode,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack! Borrows the reader's setting.
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack! Borrows the reader's setting.
                gzip_in_parallel=False,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

        input_line_count: int = 0
        # Initialised to an empty list so the final flush call below has a
        # row to pass even when the input contains no data rows.
        row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        # Sort direction: None until two differing keys have been seen, then
        # True for ascending or False for descending.
        going_up: typing.Optional[bool] = None
        if self.sorted_input:
            # Streaming path: the caller asserts the input is already grouped
            # by key, so each row is processed as it is read.
            if self.verbose:
                print("Reading the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if self.verify_sort:
                    if prev_input_key is None:
                        # First row: nothing to compare against yet.
                        prev_input_key = input_key
                    else:
                        if going_up is None:
                            # Direction not yet known: the first pair of
                            # unequal keys decides it.
                            if prev_input_key < input_key:
                                going_up = True
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                going_up = False
                                prev_input_key = input_key
                            else:
                                pass  # No change in input key
                        elif going_up:
                            if prev_input_key < input_key:
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                # The field separator inside the keys is
                                # replaced with the list separator for the
                                # error message only.
                                raise ValueError(
                                    "Line %d sort violation going up: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key
                        else:
                            if prev_input_key > input_key:
                                prev_input_key = input_key
                            elif prev_input_key < input_key:
                                raise ValueError(
                                    "Line %d sort violation going down: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key

                self.process_row(input_key, row, input_line_count, idb, ew,
                                 lew)

        else:
            # In-memory path: read every row, bucket by key, then process the
            # buckets in sorted key order.
            if self.verbose:
                print("Sorting the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            # Map key values to lists of input and output data.
            input_map: typing.MutableMapping[
                str, typing.List[typing.List[str]]] = {}

            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if input_key in input_map:
                    # Append the row to an existing list for that key.
                    input_map[input_key].append(row)
                else:
                    # Create a new list of rows for this key.
                    input_map[input_key] = [row]

            if self.verbose:
                print("Processing the sorted input data",
                      file=self.error_file,
                      flush=True)

            # NOTE(review): input_line_count is already the total number of
            # rows read at this point, so every process_row call in this
            # branch receives the same line number rather than the row's own.
            for input_key in sorted(input_map.keys()):
                for row in input_map[input_key]:
                    self.process_row(input_key, row, input_line_count, idb, ew,
                                     lew)

        # Flush the final row, if any.  We pass the last row read for
        # feedback, such as an ID uniqueness violation.
        self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

        if self.verbose:
            print("Read %d records, excluded %d records, wrote %d records." %
                  (input_line_count, self.excluded_row_count,
                   self.output_line_count),
                  file=self.error_file,
                  flush=True)
            if lew is not None:
                print("Wrote %d list ouput records." %
                      (self.list_output_line_count),
                      file=self.error_file,
                      flush=True)

        ew.close()
        if lew is not None:
            lew.close()
예제 #3
0
    def process(self):
        """Convert every configured input file into rows of one KGTK edge file.

        Each input line is parsed with ``self.parse``.  The first three fields
        of a valid row are passed through ``self.convert_and_validate`` and,
        when all three are accepted, written with ``self.write_row``.  Lines
        that fail parsing or validation are counted and, when a reject file is
        configured, copied to it verbatim.

        An input path of "-" reads from stdin and a reject path of "-" writes
        to stdout.  Those standard streams are never closed by this method.
        Files opened here are closed even when processing raises.
        """
        output_column_names: typing.List[str]
        if self.build_id and self.idbuilder_options is not None:
            self.idbuilder = KgtkIdBuilder.from_column_names(
                self.COLUMN_NAMES, self.idbuilder_options)
            output_column_names = self.idbuilder.column_names
        else:
            output_column_names = self.COLUMN_NAMES

        if self.verbose:
            print("Opening output file %s" % str(self.output_file_path),
                  file=self.error_file,
                  flush=True)
        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        rw: typing.Optional[typing.TextIO] = None
        try:
            if self.reject_file_path is not None:
                if self.verbose:
                    print("Opening reject file %s" %
                          str(self.reject_file_path),
                          file=self.error_file,
                          flush=True)
                # Open the reject file. Since the input data is not in KGTK
                # format, we use an ordinary file here.
                if str(self.reject_file_path) == "-":
                    rw = sys.stdout
                else:
                    rw = open(self.reject_file_path, "wt")

            total_input_line_count: int = 0
            reject_line_count: int = 0

            namespace_line_count: int = self.get_initial_namespaces()

            input_file_path: str
            for input_file_path in self.input_file_paths:
                input_line_count: int = 0

                if self.local_namespace_use_uuid or self.namespace_id_use_uuid or self.newnode_use_uuid:
                    if self.override_uuid is not None:
                        self.local_namespace_uuid = self.override_uuid  # for debugging
                    else:
                        # Generate a new local namespace UUID.
                        self.local_namespace_uuid = shortuuid.uuid()

                # Open the input file.
                if self.verbose:
                    print("Opening the input file: %s" % input_file_path,
                          file=self.error_file,
                          flush=True)
                infile: typing.TextIO
                if str(input_file_path) == "-":
                    infile = sys.stdin
                else:
                    infile = open(input_file_path, 'rt')

                try:
                    line: str
                    for line in infile:
                        input_line_count += 1
                        total_input_line_count += 1

                        row: typing.List[str]
                        valid: bool
                        row, valid = self.parse(line, input_line_count)
                        if not valid:
                            if rw is not None:
                                rw.write(line)
                            reject_line_count += 1
                            continue

                        node1: str
                        ok_1: bool
                        node1, ok_1 = self.convert_and_validate(
                            row[0], input_line_count, ew)

                        label: str
                        ok_2: bool
                        label, ok_2 = self.convert_and_validate(
                            row[1], input_line_count, ew)

                        node2: str
                        ok_3: bool
                        node2, ok_3 = self.convert_and_validate(
                            row[2], input_line_count, ew)

                        if ok_1 and ok_2 and ok_3:
                            self.write_row(ew, node1, label, node2)
                        else:
                            if rw is not None:
                                rw.write(line)
                            reject_line_count += 1
                finally:
                    # Decide by identity, not by comparing the path with "-":
                    # the open logic above uses str(path), so a Path("-")
                    # would otherwise cause stdin to be closed.
                    if infile is not sys.stdin:
                        infile.close()

                # NOTE(review): namespaces are saved only under this
                # condition, so a plain "-" (stdin) input never triggers
                # save_namespaces.  This is preserved from the original;
                # confirm whether it should run for every input.
                if input_file_path != "-":
                    self.save_namespaces(ew)

            if self.verbose:
                print("Processed %d known namespaces." %
                      (namespace_line_count),
                      file=self.error_file,
                      flush=True)
                print("Processed %d records." % (total_input_line_count),
                      file=self.error_file,
                      flush=True)
                print("Rejected %d records." % (reject_line_count),
                      file=self.error_file,
                      flush=True)
                print("Wrote %d records." % (self.output_line_count),
                      file=self.error_file,
                      flush=True)
        finally:
            # Always release the writer and the reject file, but never close
            # stdout when it is standing in for the reject file.
            ew.close()
            if rw is not None and rw is not sys.stdout:
                rw.close()
예제 #4
0
파일: kgtkimplode.py 프로젝트: usbader/kgtk
    def process(self):
        """Implode prefixed exploded-field columns into one value column.

        The fields to implode are gathered from
        ``KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS`` for each name in
        ``self.type_names``.  Each field maps to the input column named
        ``self.prefix + field`` unless it is listed in
        ``self.without_fields``.  For every input row ``self.implode``
        produces ``(value, valid)``.  Invalid rows go to the reject file when
        one is configured.  All other rows are written to the output file with
        the imploded value appended or overwritten in ``self.column_name``.

        Raises:
            ValueError: If the target column name is empty, a type name is
                unknown, no fields are selected, the data type field is not
                selected, the target column exists and overwriting is not
                allowed, a field maps to a duplicated name, an exploded
                column is missing, or ID building is requested without
                options.
        """
        if len(self.column_name) == 0:
            raise ValueError("The name of the column to implode is empty.")

        selected_field_names: typing.List[str] = []
        field_name: str

        if self.type_names is not None:
            if self.verbose:
                print("Validate the names of the data types to extract.",
                      file=self.error_file,
                      flush=True)
            type_name: str
            for type_name in self.type_names:
                if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                    raise ValueError("Unknown data type name '%s'." %
                                     type_name)
                # Merge this KGTK data type's fields into the list of selected fields:
                for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[
                        type_name]:
                    if field_name == KgtkValueFields.VALID_FIELD_NAME:
                        continue  # We don't need the valid field.
                    if field_name == KgtkValueFields.LIST_LEN_FIELD_NAME:
                        continue  # We don't need the list length field.
                    if field_name not in selected_field_names:
                        selected_field_names.append(field_name)

        if len(selected_field_names) == 0:
            raise ValueError("The list of fields to implode is empty.")

        if KgtkValueFields.DATA_TYPE_FIELD_NAME not in selected_field_names:
            raise ValueError(
                "The data type field '%s' has not been selected." %
                KgtkValueFields.DATA_TYPE_FIELD_NAME)

        # Open the input file.
        if self.verbose:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        output_column_names = kr.column_names.copy()
        new_column: bool  # True ==> adding the imploded column, False ==> using an existing column
        column_idx: int  # The index of the imploded column (new or old).
        if self.column_name in kr.column_name_map:
            column_idx = kr.column_name_map[self.column_name]
            new_column = False
            if not self.overwrite_column:
                raise ValueError(
                    "Imploded column '%s' (idx %d) already exists and overwrite not allowed."
                    % (self.column_name, column_idx))
            if self.verbose:
                print("Overwriting existing imploded column '%s' (idx %d)." %
                      (self.column_name, column_idx),
                      file=self.error_file,
                      flush=True)
        else:
            column_idx = len(output_column_names)
            new_column = True
            output_column_names.append(self.column_name)
            if self.verbose:
                print("Imploded column '%s' will be created (idx %d)." %
                      (self.column_name, column_idx),
                      file=self.error_file,
                      flush=True)

        if self.verbose:
            print("Build the map of field names to exploded columns",
                  file=self.error_file,
                  flush=True)
        # Maps each selected field name to the index of its exploded input
        # column, or -1 for a field we were told to do without.
        implosion: typing.MutableMapping[str, int] = {}
        missing_columns: typing.List[str] = []
        for field_name in selected_field_names:
            if field_name in self.without_fields:
                if self.verbose:
                    print("We can do without field '%s'." % field_name,
                          file=self.error_file,
                          flush=True)
                implosion[field_name] = -1
                continue
            exploded_name: str = self.prefix + field_name
            if self.verbose:
                print("Field '%s' becomes '%s'" % (field_name, exploded_name),
                      file=self.error_file,
                      flush=True)
            if exploded_name in implosion:
                # The message previously contained a bare '%s' because the
                # format argument was missing.
                raise ValueError(
                    "Field name '%s' is duplicated in the field list." %
                    exploded_name)
            # Test the same mapping that is indexed on the next line.
            if exploded_name in kr.column_name_map:
                exploded_idx = kr.column_name_map[exploded_name]
                implosion[field_name] = exploded_idx
                if self.verbose:
                    print("Field '%s' is in column '%s' (idx=%d)" %
                          (field_name, exploded_name, exploded_idx),
                          file=self.error_file,
                          flush=True)
            else:
                if self.verbose:
                    print("Field '%s' exploded column '%s' not found." %
                          (field_name, exploded_name),
                          file=self.error_file,
                          flush=True)
                missing_columns.append(exploded_name)
        if len(missing_columns) > 0:
            raise ValueError("Missing columns: %s" % " ".join(missing_columns))

        data_type_idx = implosion[KgtkValueFields.DATA_TYPE_FIELD_NAME]

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError(
                    "ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.from_column_names(output_column_names,
                                                  self.idbuilder_options)
            id_output_column_names = idb.column_names.copy()
        else:
            id_output_column_names = output_column_names.copy()

        trimmed_output_column_names: typing.List[str]
        if self.remove_prefixed_columns and len(self.prefix) > 0:
            trimmed_output_column_names = []
            if self.verbose:
                print("Removing columns with names that start with '%s'." %
                      self.prefix,
                      file=self.error_file,
                      flush=True)
            column_name: str
            for column_name in id_output_column_names:
                if column_name.startswith(self.prefix):
                    if self.verbose:
                        print("Removing column '%s." % column_name,
                              file=self.error_file,
                              flush=True)
                else:
                    trimmed_output_column_names.append(column_name)
        else:
            trimmed_output_column_names = id_output_column_names

        shuffle_list: typing.List[int] = [
        ]  # Easier to init than deal with typing.Optional.
        ew: typing.Optional[KgtkWriter] = None
        rw: typing.Optional[KgtkWriter] = None
        try:
            if self.output_file_path is not None:
                if self.verbose:
                    print("Opening output file %s" %
                          str(self.output_file_path),
                          file=self.error_file,
                          flush=True)
                # Open the output file.
                ew = KgtkWriter.open(trimmed_output_column_names,
                                     self.output_file_path,
                                     mode=kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)
                # Rows are built in the untrimmed column order; the shuffle
                # list lets the writer map them onto the trimmed output.
                shuffle_list = ew.build_shuffle_list(id_output_column_names)

            if self.reject_file_path is not None:
                if self.verbose:
                    print("Opening reject file %s" %
                          str(self.reject_file_path),
                          file=self.error_file,
                          flush=True)
                # Open the reject file.
                rw = KgtkWriter.open(kr.column_names,
                                     self.reject_file_path,
                                     mode=kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

            if self.verbose:
                print("Imploding records from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            input_line_count: int = 0
            imploded_value_count: int = 0
            invalid_value_count: int = 0

            # -1 tells implode() that there is no pre-existing target column.
            existing_column_idx: int = -1 if new_column else column_idx

            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                value: str
                valid: bool
                value, valid = self.implode(input_line_count, row, implosion,
                                            data_type_idx,
                                            existing_column_idx)
                if valid:
                    imploded_value_count += 1
                else:
                    invalid_value_count += 1

                if rw is not None and not valid:
                    # Reject the row before implosion.
                    rw.write(row)
                elif ew is not None:
                    output_row: typing.List[str] = row.copy()
                    if new_column:
                        output_row.append(value)
                    else:
                        output_row[column_idx] = value
                    if idb is not None:
                        output_row = idb.build(output_row, input_line_count)
                    ew.write(output_row, shuffle_list=shuffle_list)

            if self.verbose:
                print(
                    "Processed %d records, imploded %d values, %d invalid values."
                    % (input_line_count, imploded_value_count,
                       invalid_value_count),
                    file=self.error_file,
                    flush=True)
        finally:
            # Close whichever writers were opened, including on error.
            if ew is not None:
                ew.close()

            if rw is not None:
                rw.close()
예제 #5
0
파일: kgtkcompact.py 프로젝트: usbader/kgtk
    def process(self):
        """Read the input file, group its rows by key, and write the results.

        The key for each row is built by ``self.build_key`` from the file's
        required columns (node1/label/node2, plus the ID column when it
        exists and ``self.compact_id`` is false, for edge files; the ID
        column for node files) followed by any extra columns named in
        ``self.key_column_names``.

        When ``self.sorted_input`` is true, rows are streamed to
        ``self.process_row`` in input order, optionally checking
        (``self.verify_sort``) that the keys never change direction.
        Otherwise every row is buffered in memory, grouped by key, and the
        groups are handed to ``self.process_row`` in sorted key order.

        Raises:
            ValueError: if an ID build was requested without ID builder
                options, if a key column name is not in the input file, or
                if a sort violation is found while verifying sorted input.
        """

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
            else:
                print("Reading the input data from stdin", file=self.error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                         error_file=self.error_file,
                                         options=self.reader_options,
                                         value_options=self.value_options,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose,
        )

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError("ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column indexes:
        key_idx_list: typing.List[int] = []
        if kr.is_edge_file:
            # Add the KGTK edge file required columns.
            key_idx_list.append(kr.node1_column_idx)
            key_idx_list.append(kr.label_column_idx)
            key_idx_list.append(kr.node2_column_idx)
            if not self.compact_id and kr.id_column_idx >= 0:
                key_idx_list.append(kr.id_column_idx)

        elif kr.is_node_file:
            # Add the KGTK node file required column:
            key_idx_list.append(kr.id_column_idx)

        # Append additional columns to the list of key column indexes,
        # silently removing duplicates, but complaining about unknown names.
        #
        # TODO: warn about duplicates?
        column_name: str
        for column_name in self.key_column_names:
            if column_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" % (column_name))
            key_idx: int = kr.column_name_map[column_name]
            if key_idx not in key_idx_list:
                key_idx_list.append(key_idx)

        if self.verbose:
            # Diagnostics go to the error file, never to stdout, so they
            # cannot be mixed into data written to standard output.
            key_idx_list_str: typing.List[str] = [str(key_idx) for key_idx in key_idx_list]
            print("key indexes: %s" % " ".join(key_idx_list_str), file=self.error_file, flush=True)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         mode=kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        input_line_count: int = 0
        row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        # None until the first key change reveals the sort direction.
        going_up: typing.Optional[bool] = None

        # Make sure the writer is closed even when processing fails.
        try:
            if self.sorted_input:
                if self.verbose:
                    print("Reading the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    if self.verify_sort:
                        if prev_input_key is None:
                            prev_input_key = input_key
                        elif prev_input_key != input_key:
                            # The key changed: the first change fixes the
                            # direction, later changes must agree with it.
                            step_is_up: bool = prev_input_key < input_key
                            if going_up is None:
                                going_up = step_is_up
                            elif going_up != step_is_up:
                                raise ValueError("Line %d sort violation going %s: prev='%s' curr='%s'" % (input_line_count,
                                                                                                         "up" if going_up else "down",
                                                                                                         prev_input_key,
                                                                                                         input_key))
                            prev_input_key = input_key

                    self.process_row(input_key, row, input_line_count, idb, ew)

            else:
                if self.verbose:
                    print("Sorting the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
                # Map key values to lists of input rows sharing that key.
                input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}

                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    input_map.setdefault(input_key, []).append(row)

                if self.verbose:
                    print("Processing the sorted input data", file=self.error_file, flush=True)

                # NOTE: input_line_count is the total number of rows read at
                # this point, not the line number of the individual row.
                for input_key in sorted(input_map):
                    for row in input_map[input_key]:
                        self.process_row(input_key, row, input_line_count, idb, ew)

            # Flush the final row, if any.  We pass the last row read for
            # feedback, such as an ID uniqueness violation.
            self.process_row("", row, input_line_count, idb, ew, flush=True)

            if self.verbose:
                print("Read %d records, wrote %d records." % (input_line_count, self.output_line_count), file=self.error_file, flush=True)
        finally:
            ew.close()
예제 #6
0
파일: normalize.py 프로젝트: nicklein/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_values: typing.Optional[typing.List[str]] = None,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        ignore_empty_node1: bool = False,
        ignore_empty_node2: bool = False,
        add_id: bool = False,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Lower and/or normalize additional columns of a KGTK file into new edges.

    Every selected column is dropped from the output columns.  For each input
    row and each selected column, one new edge is produced per list item in
    that column: node1 is the row's value in the column's base column, the
    label is the configured label value, and node2 is the list item.  The new
    edges go to the new-edges file when one was given, otherwise to the main
    output file.

    Column selection follows one of three patterns:

    1. ``columns_to_lower`` and ``base_columns`` are both given and paired
       one-to-one; ``label_values`` optionally supplies the labels.
    2. Only ``columns_to_lower`` is given; each name is split at
       ``lift_separator`` into base name and label (when lowering), or is
       attached to the ID column (when normalizing).
    3. ``columns_to_lower`` is empty; every non-key column is examined and
       matched the same way against ``base_columns`` (default: the key
       columns found in the input file).

    Args:
        input_file: Input file specification.
        output_file: Output file specification.
        new_edges_file: Optional separate output file for the new edges.
        base_columns: Base column names (see the patterns above).
        columns_to_lower: Names of the columns to lower or normalize.
        label_values: Label values paired with ``columns_to_lower``.
        lift_separator: Separator between base name and label in column names.
        ignore_empty_node1: Skip, instead of failing on, empty base values.
        ignore_empty_node2: Skip, instead of failing on, empty column values.
        add_id: Build ID values for the new edges with a KgtkIdBuilder.
        lower: Enable lowering.
        normalize: Enable normalizing (requires an ID column in the input).
        deduplicate_new_edges: Emit each (node1, label, node2) edge only once.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Unused in this function; stderr is the default.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and the writers.
        **kwargs: Options consumed by the ID builder, reader, and value
            option structures.

    Returns:
        0 on success, 1 if an exception was raised during processing and
        handed to ``kgtk_exception_auto_handler``.

    Raises:
        KGTKException: if neither ``lower`` nor ``normalize`` is requested
            (this check runs before the exception handler is installed).
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalue import KgtkValue
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns %s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower %s" % " ".join(columns_to_lower),
                  file=error_file)
        if label_values is not None:
            print("--label-values %s" % " ".join(label_values),
                  file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--add-id=%s" % add_id, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--ignore-empty-node1=%s" % ignore_empty_node1, file=error_file)
        print("--ignore-empty-node2=%s" % ignore_empty_node2, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to a pair: the index of
        # the base column that supplies the node1 value of the new edges,
        # and the label value used for those edges.
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # column_names and base_columns are paired. New records use label_values if specified.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to lower but only %d base columns." %
                    (len(columns_to_lower), len(base_columns)))

            if label_values is not None and len(label_values) > 0 and len(
                    label_values) != len(columns_to_lower):
                raise KGTKException(
                    "There are %d columns to lower but only %d label values." %
                    (len(columns_to_lower), len(label_values)))

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    # Normalizing against the ID column: the column name
                    # itself becomes the label.
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    if label_values is not None and len(
                            label_values) > 0 and len(label_values[idx]) > 0:
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], label_values[idx])
                    else:
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], column_name)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for column_name in columns_to_lower:
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being lowered.
        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)

        # Create the ID builder.
        idb: typing.Optional[KgtkIdBuilder] = None
        if add_id:
            idb = KgtkIdBuilder.from_column_names(output_column_names,
                                                  idbuilder_options)
            output_column_names = idb.column_names.copy()

        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(new_edges_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the new edges.  The set holds keys of the form
        #  node1_value + KEY_FIELD_SEPARATOR + label_value + KEY_FIELD_SEPARATOR + node2_value
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            output_row: typing.List[str] = kw.shuffle(
                row, shuffle_list=shuffle_list)
            kw.write(output_row)
            output_line_count += 1

            # Sequence number handed to the ID builder for each new edge.
            # NOTE(review): the counter restarts for every input row; confirm
            # that the configured ID style still yields unique IDs.
            id_seq_num: int = 0
            column_idx: int
            node1_idx: int
            for column_idx, (node1_idx, new_label_value) in lower_map.items():
                node1_value: str = row[node1_idx]
                if len(node1_value) == 0:
                    if ignore_empty_node1:
                        continue  # Ignore empty node1 values.
                    else:
                        raise KGTKException(
                            "Empty node1 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                item: str = row[column_idx]
                if len(item) == 0:
                    if ignore_empty_node2:
                        continue  # Ignore empty node2 values.
                    else:
                        raise KGTKException(
                            "Empty node2 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                # This item might be a KGTK list.  Let's split it, because
                # lists aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        if ignore_empty_node2:
                            continue  # Ignore empty node2 values in a list.
                        else:
                            raise KGTKException(
                                "Empty node2 value in a list when lowering %d to %d: %s in input line %d"
                                % (column_idx, node1_idx, new_label_value,
                                   input_line_count))

                    if deduplicate_new_edges:
                        label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    # Build the new edge in input column order, then reorder
                    # it into output column order.
                    lowered_input_row: typing.List[str] = [""] * kr.column_count
                    lowered_input_row[kr.node1_column_idx] = node1_value
                    lowered_input_row[kr.label_column_idx] = new_label_value
                    lowered_input_row[kr.node2_column_idx] = node2_value

                    lowered_output_row: typing.List[str] = kw.shuffle(
                        lowered_input_row, shuffle_list=shuffle_list)
                    if idb is not None:
                        # Advance the sequence number so that each new edge
                        # built from this input row gets a distinct value.
                        id_seq_num += 1
                        lowered_output_row = idb.build(lowered_output_row,
                                                       id_seq_num,
                                                       already_added=True)
                    if lkw is not None:
                        lkw.write(lowered_output_row)
                        label_line_count += 1
                    else:
                        kw.write(lowered_output_row)
                        label_line_count += 1
                        output_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1