Пример #1
0
 def check_column_name(cls,
                       column_name: str,
                       header_line: str,
                       error_action: ValidationAction,
                       error_file: typing.TextIO = sys.stderr)->typing.List[str]:
     """Check a single column name and return a list of complaints about it.

     The checks, in the order in which their complaints are appended:

     1) leading white space
     2) trailing white space
     3) internal white space (skipped entirely when the name starts with a
        double or single quote character)
     4) commas (reported as a warning)
     5) vertical bars (reported as a warning)
     6) anything rejected by KgtkValue(column_name).is_valid(), reported
        using KgtkValue.describe()

     A check for semicolons exists but is disabled.

     Args:
         column_name: the column name to check.
         header_line: not used by the checks in this method.
         error_action: not used by the checks in this method.
         error_file: not used by the checks in this method.

     Returns:
         The list of complaint strings; empty when no check complained.
     """
     # TODO: It might be possible to make some of these checks more efficient.
     results: typing.List[str] = [ ]
     if column_name.lstrip() != column_name:
         results.append("Column name '%s' starts with leading white space" % column_name)
     if column_name.rstrip() != column_name:
         # The white space found here is at the end of the name, so the
         # complaint must say "trailing", not "leading".
         results.append("Column name '%s' ends with trailing white space" % column_name)
     if not column_name.startswith(('"', "'")):
         # Removing every run of white space must give the same text as
         # trimming only the two ends; otherwise there is white space inside.
         if ''.join(column_name.split()) != column_name.strip():
             results.append("Column name '%s' contains internal white space" % column_name)
     if "," in column_name:
         results.append("Warning: Column name '%s' contains a comma (,)" % column_name)
     if "|" in column_name:
         results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name)
     # The semicolon check is intentionally disabled:
     # if ";" in column_name:
     #    results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name)
     kv: KgtkValue = KgtkValue(column_name)
     if not kv.is_valid():
         results.append(kv.describe())
     return results
Пример #2
0
 def explode(self, value: KgtkValue, row: typing.List[str],
             explosion: typing.Mapping[str, int],
             new_column_count: int) -> typing.List[str]:
     """Return a copy of `row` with the fields of `value` written into it.

     Args:
         value: the value whose get_field_map() supplies the field values.
         row: the input row; it is copied, never modified.
         explosion: maps a field name to the index of the column that
             receives that field.
         new_column_count: the number of empty columns appended to the copy
             before the fields are stored.

     Returns:
         The new row.  A column named in `explosion` whose field is missing
         from the field map is set to the empty string.
     """
     exploded: typing.List[str] = row.copy()
     if new_column_count > 0:
         # Pad the copy with the new, initially empty, columns.
         exploded.extend([""] * new_column_count)

     fields: typing.Mapping[str, typing.Union[str, int, float, bool]] = value.get_field_map()
     name: str
     column_idx: int
     for name, column_idx in explosion.items():
         if name not in fields:
             # In case we are overwriting an existing column.
             exploded[column_idx] = ""
             continue

         if KgtkValueFields.FIELD_NAME_FORMATS[name] == "str":
             # Format this as a KGTK string.
             exploded[column_idx] = '"' + str(fields[name]) + '"'
         else:
             # Convert everything else to a KGTK number or symbol.
             exploded[column_idx] = str(fields[name])
     return exploded
Пример #3
0
    def implode_date_and_times(self,
                               input_line_count: int,
                               row: typing.List[str],
                               implosion: typing.Mapping[str, int],
                               type_name: str,
    )->typing.Tuple[str, bool]:
        """Build a date-and-times value from the exploded columns of a row.

        The result is "^" followed by the unwrapped date-and-times column
        and, when the precision column is available and not empty, "/"
        followed by the unwrapped precision.

        Args:
            input_line_count: the input line number, used in messages.
            row: the row holding the exploded columns.
            implosion: maps a field name to its column index; a negative
                precision index means there is no precision column.
            type_name: the data type name, used in messages.

        Returns:
            (value, valid).  `valid` is False when the date-and-times column
            is empty, or when self.validate is set and KgtkValue rejects the
            imploded value.
        """
        dt_text: str = self.unwrap(row[implosion[KgtkValueFields.DATE_AND_TIMES_FIELD_NAME]])
        is_valid: bool = len(dt_text) != 0
        if not is_valid and self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.DATE_AND_TIMES_FIELD_NAME),
                  file=self.error_file, flush=True)

        # The precision is optional.
        precision_column: int = implosion[KgtkValueFields.PRECISION_FIELD_NAME]
        precision_text: str = ""
        if precision_column >= 0:
            precision_text = self.unwrap(row[precision_column])

        value: str = "^" + dt_text
        if len(precision_text) > 0:
            value = value + "/" + precision_text

        if is_valid and self.validate:
            checker: KgtkValue = KgtkValue(value, options=self.value_options)
            is_valid = checker.is_date_and_times(validate=True)
            if not is_valid and self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid date and time." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)
        return value, is_valid
Пример #4
0
    def implode_symbol(self,
                       input_line_count: int,
                       row: typing.List[str],
                       implosion: typing.Mapping[str, int],
                       type_name: str,
    )->typing.Tuple[str, bool]:
        """Build a symbol value from the exploded symbol column of a row.

        Args:
            input_line_count: the input line number, used in messages.
            row: the row holding the exploded columns.
            implosion: maps a field name to its column index.
            type_name: the data type name, used in messages.

        Returns:
            (value, valid).  `value` is the unwrapped symbol column, with
            each list separator preceded by a backslash when
            self.escape_pipes is set.  `valid` is False when the column is
            empty, or when self.validate is set and KgtkValue rejects the
            value as a symbol.
        """
        value: str = self.unwrap(row[implosion[KgtkValueFields.SYMBOL_FIELD_NAME]])
        is_valid: bool = len(value) != 0
        if not is_valid and self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.SYMBOL_FIELD_NAME),
                  file=self.error_file, flush=True)

        if self.escape_pipes:
            # Put a backslash in front of every list separator.
            value = value.replace(KgtkFormat.LIST_SEPARATOR, "\\" + KgtkFormat.LIST_SEPARATOR)

        if is_valid and self.validate:
            checker: KgtkValue = KgtkValue(value, options=self.value_options)
            is_valid = checker.is_symbol(validate=True)
            if not is_valid and self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid symbol." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)

        return value, is_valid
Пример #5
0
    def convert_and_validate(self, item: str, line_number: int,
                             ew: KgtkWriter) -> typing.Tuple[str, bool]:
        """Convert an item with self.convert() and optionally validate it.

        Args:
            item: the input item to convert.
            line_number: the input line number, used in messages and passed
                on to self.convert().
            ew: a writer that is passed on to self.convert().

        Returns:
            (result, ok).  `result` is the converted value with tab, newline
            and carriage return characters replaced by the two-character
            sequences backslash-t, backslash-n and backslash-r.  `ok` is
            False when self.convert() reported a failure, or when
            self.validate is set and KgtkValue.validate() rejects the result.
        """
        result: str
        is_ok: bool
        result, is_ok = self.convert(item, line_number, ew)

        # Just a little bit of paranoia here regarding tabs and ends-of-lines:
        #
        # TODO: perform these checks (and repairs!) in KgtkValue.
        result = result.replace("\t", "\\t").replace("\n", "\\n").replace("\r", "\\r")

        if is_ok and self.validate:
            kv: KgtkValue = KgtkValue(result, options=self.value_options)
            if not kv.validate():
                if self.verbose:
                    print(
                        "Input line %d: imported value '%s' (from '%s') is invalid."
                        % (line_number, result, item),
                        file=self.error_file,
                        flush=True)
                return result, False

        # Propagate the status reported by self.convert(): a failed
        # conversion must not be reported to the caller as a success.
        return result, is_ok
Пример #6
0
    def implode_location_coordinates(self,
                                     input_line_count: int,
                                     row: typing.List[str],
                                     implosion: typing.Mapping[str, int],
                                     type_name: str,
    )->typing.Tuple[str, bool]:
        """Build a location coordinates value from the exploded columns.

        The result is "@" + latitude + "/" + longitude, using the unwrapped
        latitude and longitude columns of the row.

        Args:
            input_line_count: the input line number, used in messages.
            row: the row holding the exploded columns.
            implosion: maps a field name to its column index.
            type_name: the data type name, used in messages.

        Returns:
            (value, valid).  `valid` is False when either column is empty,
            or when self.validate is set and KgtkValue rejects the imploded
            value.
        """
        is_valid: bool = True
        coordinates: typing.List[str] = []
        field_name: str
        # The latitude is processed first, then the longitude.
        for field_name in (KgtkValueFields.LATITUDE_FIELD_NAME, KgtkValueFields.LONGITUDE_FIELD_NAME):
            text: str = self.unwrap(row[implosion[field_name]])
            if len(text) == 0:
                is_valid = False
                if self.verbose:
                    print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, field_name),
                          file=self.error_file, flush=True)
            coordinates.append(text)

        value: str = "@" + "/".join(coordinates)

        if is_valid and self.validate:
            checker: KgtkValue = KgtkValue(value, options=self.value_options)
            is_valid = checker.is_location_coordinates(validate=True)
            if not is_valid and self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid location coordinates." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)
        return value, is_valid
Пример #7
0
 def verify_uniqueness(self, id_value: str, row: typing.List[str],
                       line_number, who: str):
     """
     Verify that ID values are not repeated.  This is OK for the output
     of `kgtk compact`, but is a little too strong for general use.
     The weaker constraint should be that the ID values don't repeat
     with different (node1, label, node2) tuples in an edge file.

     Every ID that has not been seen before is added to self.id_set.
     The `row` argument is not used.

     Raises:
         ValueError: when an ID is already present in self.id_set.
     """
     if KgtkFormat.LIST_SEPARATOR in id_value:
         # The ID value might be a list.
         candidates = KgtkValue.split_list(id_value)
     else:
         # Not a list: there is exactly one ID to check.
         candidates = [id_value]

     seen = self.id_set
     candidate: str
     for candidate in candidates:
         if candidate in seen:
             # TODO: Probably want more error handling options, such as
             # printing the offending row and choosing to continue.
             raise ValueError(
                 "Line %d: %s ID '%s' duplicates a previous ID '%s'." %
                 (line_number, who, id_value, candidate))
         seen.add(candidate)
Пример #8
0
    def compact_row(self) -> bool:
        """Compact the pending item lists into self.current_row.

        When self.current_row_lists is None nothing is changed and False is
        returned.  Otherwise self.current_row receives one value per column:
        an empty string for a None entry, else the item list joined with
        KgtkValue.join_sorted_list().  For columns whose index is in
        self.keep_first_idx_list only the first item is kept.  Finally
        self.current_row_lists is reset to None.

        Returns:
            True if at least one joined column held more than one item,
            otherwise False.
        """
        pending = self.current_row_lists
        if pending is None:
            return False

        # Start from a row of empty values and fill in the non-empty columns.
        compacted: typing.List[str] = [""] * len(pending)
        self.current_row = compacted
        has_list: bool = False
        position: int
        members: typing.Optional[typing.List[str]]
        for position, members in enumerate(pending):
            if members is None:
                continue

            if position in self.keep_first_idx_list:
                # Keep only the first item.  Sorting a list of at most one
                # item is kept from the original code.
                members = sorted(members[:1])

            has_list = has_list or len(members) > 1

            # KgtkValue.join_unique_list(...) is not needed here because
            # self.merge_row(...) and self.expand_row(...) ensure that there
            # are no duplicates.
            #
            # TODO: run timing studies to determine which approach is more efficient.
            compacted[position] = KgtkValue.join_sorted_list(members)

        self.current_row_lists = None
        return has_list
Пример #9
0
    def expand_row(self, row: typing.List[str], force: bool = False):
        """Expand a row into per-column lists of unique items.

        When self.lists_in_input is false and `force` is false, the row is
        stored unchanged in self.current_row and nothing else happens.
        Otherwise self.current_row_lists receives one entry per column:
        None when the column holds no non-empty item, else the non-empty
        items produced by KgtkValue.split_list(), in order of first
        appearance and without duplicates.
        """
        if not (self.lists_in_input or force):
            self.current_row = row  # Optimization: leave the row alone if possible.
            return

        # Preallocate the list of lists; columns without items stay None.
        expanded: typing.List[typing.Optional[typing.List[str]]] = [None] * len(row)
        self.current_row_lists = expanded

        position: int
        cell: str
        for position, cell in enumerate(row):
            if len(cell) > 0:
                unique_items: typing.Optional[typing.List[str]] = None

                # The row item might itself be a list.
                piece: str
                for piece in KgtkValue.split_list(cell):
                    if len(piece) == 0:
                        continue  # Ignore empty items.
                    if unique_items is None:
                        unique_items = [piece]  # This is the first item.
                    elif piece not in unique_items:
                        unique_items.append(piece)  # Add unique items only.

                expanded[position] = unique_items
Пример #10
0
    def implode_boolean(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        """Build a boolean value from the exploded truth column of a row.

        Args:
            input_line_count: the input line number, used in messages.
            row: the row holding the exploded columns.
            implosion: maps a field name to its column index.
            type_name: the data type name, used in messages.

        Returns:
            (value, valid).  `value` is the unwrapped truth column.  `valid`
            is False when the column is empty, or when self.validate is set
            and KgtkValue rejects the value as a boolean.
        """
        value: str = self.unwrap(row[implosion[KgtkValueFields.TRUTH_FIELD_NAME]])
        is_valid: bool = len(value) != 0
        if not is_valid and self.verbose:
            print("Input line %d: data type '%s': %s field is empty" %
                  (input_line_count, type_name, KgtkValueFields.TRUTH_FIELD_NAME),
                  file=self.error_file,
                  flush=True)

        if is_valid and self.validate:
            checker: KgtkValue = KgtkValue(value, options=self.value_options)
            is_valid = checker.is_boolean(validate=True)
            if not is_valid and self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid boolean."
                      % (input_line_count, type_name, value),
                      file=self.error_file,
                      flush=True)

        return value, is_valid
Пример #11
0
    def compact_row(self):
        """Join the pending item lists into self.current_row.

        Does nothing when self.current_row_lists is None.  Otherwise each
        entry of self.current_row_lists is stored in the matching position
        of self.current_row: a None entry becomes an empty string and any
        other entry is joined with KgtkValue.join_sorted_list().  Finally
        self.current_row_lists is reset to None.
        """
        if self.current_row_lists is None:
            return

        # Start from a row of empty values and fill in the non-empty columns.
        self.current_row = [""] * len(self.current_row_lists)
        idx: int
        item_list: typing.Optional[typing.List[str]]
        for idx, item_list in enumerate(self.current_row_lists):
            # A None entry holds no items, so it stays an empty value
            # instead of being passed on to join_sorted_list().
            # NOTE(review): presumably None marks a column for which no
            # items were collected -- confirm against expand_row/merge_row.
            if item_list is not None:
                self.current_row[idx] = KgtkValue.join_sorted_list(item_list)
        self.current_row_lists = None
Пример #12
0
    def implode_quantity(self,
                         input_line_count: int,
                         row: typing.List[str],
                         implosion: typing.Mapping[str, int],
                         type_name: str,
    )->typing.Tuple[str, bool]:
        """Build a quantity value from the exploded columns of a row.

        The result is the unwrapped number, followed by
        "[" + low tolerance + "," + high tolerance + "]" when at least one
        tolerance is present, followed by the SI units and the units node.

        Args:
            input_line_count: the input line number, used in messages.
            row: the row holding the exploded columns.
            implosion: maps a field name to its column index; a negative
                index for a tolerance, SI units or units node column means
                that the column is treated as empty.
            type_name: the data type name, used in messages.

        Returns:
            (value, valid).  `valid` is False when the number column is
            empty, when exactly one of the two tolerances is present, or
            when self.validate is set and KgtkValue rejects the imploded
            value (as a number or quantity when
            self.quantities_include_numbers is set, else as a quantity).
        """
        valid: bool = True
        num_idx: int = implosion[KgtkValueFields.NUMBER_FIELD_NAME]
        num_val: str = self.unwrap(row[num_idx])
        if len(num_val) == 0:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.NUMBER_FIELD_NAME),
                      file=self.error_file, flush=True)

        lt_idx: int = implosion[KgtkValueFields.LOW_TOLERANCE_FIELD_NAME]
        lt: str = self.unwrap(row[lt_idx]) if lt_idx >= 0 else ""

        ht_idx: int = implosion[KgtkValueFields.HIGH_TOLERANCE_FIELD_NAME]
        ht: str = self.unwrap(row[ht_idx]) if ht_idx >= 0 else ""

        # Exactly one tolerance present is an error.  The two tests must be
        # parenthesized: "^" binds more tightly than ">", so the unparenthesized
        # form "len(lt) > 0 ^ len(ht) > 0" is the chained comparison
        # "len(lt) > len(ht) > 0", which is a different condition.
        if (len(lt) > 0) != (len(ht) > 0):
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': low and high tolerance must both be present or absent." % (input_line_count, type_name),
                      file=self.error_file, flush=True)

        si_idx: int = implosion[KgtkValueFields.SI_UNITS_FIELD_NAME]
        si: str = self.unwrap(row[si_idx]) if si_idx >= 0 else ""

        un_idx: int = implosion[KgtkValueFields.UNITS_NODE_FIELD_NAME]
        un: str = self.unwrap(row[un_idx]) if un_idx >= 0 else ""

        value: str = num_val
        if len(lt) > 0 or len(ht) > 0:
            value += "[" + lt + "," + ht + "]"
        value += si + un

        if valid and self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            if self.quantities_include_numbers:
                valid = kv.is_number_or_quantity(validate=True)
                if not valid:
                    if self.verbose:
                        print("Input line %d: data type '%s': imploded value '%s' is not a valid quantity or number." % (input_line_count, type_name, value),
                              file=self.error_file, flush=True)
            else:
                valid = kv.is_quantity(validate=True)
                if not valid:
                    if self.verbose:
                        print("Input line %d: data type '%s': imploded value '%s' is not a valid quantity." % (input_line_count, type_name, value),
                              file=self.error_file, flush=True)
        return value, valid
Пример #13
0
    def build_attr_map(
        self,
        result: typing.MutableMapping[str, typing.Any],
        attr_list: str,
        who: str,
    ):
        """Store the attribute map built from `attr_list` in result[who].

        Each item produced by KgtkValue.split_list(attr_list) is handed to
        self.add_attr_to_map().  `result` is left untouched when `attr_list`
        is empty or when the resulting map is empty.
        """
        if len(attr_list) > 0:
            collected: typing.MutableMapping[str, typing.Mapping[str, str]] = {}
            entry: str
            for entry in KgtkValue.split_list(attr_list):
                self.add_attr_to_map(collected, entry, who)

            # Only record a map that actually holds something.
            if collected:
                result[who] = collected
Пример #14
0
    def load_labels(
        self,
        kr: KgtkReader,
        path: Path,
    ) -> typing.Tuple[typing.Mapping[str, str], typing.List[typing.List[str]]]:
        """Read every row from `kr`, collecting label definitions.

        A row is a label definition when its label column equals
        self.label_column_value.  Its node1 column is the label key and its
        node2 column the label value; empty label values are not recorded.
        When a key occurs more than once, the values are combined with
        KgtkValue.merge_values() if self.suppress_duplicate_labels is set,
        otherwise they are joined with KgtkFormat.LIST_SEPARATOR.

        Args:
            kr: the reader supplying the rows.
            path: used only in the verbose progress message.

        Returns:
            (labels, input_rows).  `input_rows` holds every row that is not
            a label definition, plus the label definition rows unless
            self.remove_label_records is set.
        """
        node1_idx: int
        label_idx: int
        node2_idx: int
        node1_idx, label_idx, node2_idx = self.lookup_label_table_idxs(kr)

        if self.verbose:
            print("Loading labels from %s" % path,
                  file=self.error_file,
                  flush=True)

        labels: typing.MutableMapping[str, str] = {}
        kept_rows: typing.List[typing.List[str]] = []
        row: typing.List[str]
        for row in kr:
            if row[label_idx] != self.label_column_value:
                # Not a label definition row: always keep it.
                kept_rows.append(row)
                continue

            name: str = row[node1_idx]
            text: str = row[node2_idx]
            if len(text) > 0:
                if name not in labels:
                    # This is the first instance of this label definition.
                    labels[name] = text
                elif self.suppress_duplicate_labels:
                    # Build a list eliminating duplicate elements.
                    labels[name] = KgtkValue.merge_values(labels[name], text)
                else:
                    labels[name] = KgtkFormat.LIST_SEPARATOR.join(
                        (labels[name], text))

            if not self.remove_label_records:
                kept_rows.append(row)

        return labels, kept_rows
Пример #15
0
    def compact_row(self):
        """Join the pending item lists into self.current_row.

        Does nothing when self.current_row_lists is None.  Otherwise each
        entry of self.current_row_lists is stored in the matching position
        of self.current_row: a None entry becomes an empty string and any
        other entry is joined with KgtkValue.join_sorted_list().  Finally
        self.current_row_lists is reset to None.
        """
        pending = self.current_row_lists
        if pending is None:
            return

        # Preallocate the row, then fill it in position by position.
        compacted: typing.List[typing.Optional[str]] = [None] * len(pending)
        self.current_row = compacted
        position: int
        members: typing.Optional[typing.List[str]]
        for position, members in enumerate(pending):
            # KgtkValue.join_unique_list(...) is not needed here because
            # self.merge_row(...) and self.expand_row(...) ensure that there
            # are no duplicates.
            #
            # TODO: run timing studies to determine which approach is more efficient.
            compacted[position] = "" if members is None else KgtkValue.join_sorted_list(members)
        self.current_row_lists = None
Пример #16
0
    def merge_row(self, row: typing.List[str]):
        """Merge the items of `row` into self.current_row_lists, in place.

        When self.current_row_lists is None, self.current_row is first
        expanded with self.expand_row(..., force=True).  Then, for every
        non-empty column of `row`, each non-empty item produced by
        KgtkValue.split_list() is appended to that column's list unless it
        is already present; a column without a list gets a new one.

        Raises:
            ValueError: when both self.current_row_lists and
                self.current_row are None, or when self.current_row_lists is
                still None after the expansion.
        """
        if self.current_row_lists is None:
            if self.current_row is None:
                # TODO: raise a better error
                raise ValueError("Inconsistent state #1 in merge_row.")

            # We deferred expanding the previous row, but we must do so now:
            self.expand_row(self.current_row, force=True)
            if self.current_row_lists is None:
                # Keep mypy happy by ensuring that self.current_row_lists is not None.
                #
                # TODO: raise a better error.
                raise ValueError("Inconsistent state #2 in merge_row.")

        row_lists = self.current_row_lists
        position: int
        cell: str
        for position, cell in enumerate(row):
            if len(cell) > 0:
                # This list is modified in place!
                merged: typing.Optional[typing.List[str]] = row_lists[position]

                # The row item might itself be a list.
                piece: str
                for piece in KgtkValue.split_list(cell):
                    if len(piece) == 0:
                        continue  # Ignore empty items.
                    if merged is None:
                        # This is the first item for this column.
                        merged = [piece]
                        row_lists[position] = merged
                    elif piece not in merged:
                        merged.append(piece)  # Add unique items only.
Пример #17
0
    def add_attr_to_map(
        self,
        attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
        attr: str,
        who: str,
    ):
        """Add one language-qualified string attribute to `attr_map`.

        The attribute is split with KgtkFormat.destringify() into its text,
        language and language suffix.  The entry is stored under the key
        language + suffix as {"language": key, "value": text}, replacing
        any entry already stored under that key.

        Args:
            attr_map: the map that receives the entry.
            attr: the attribute to add.
            who: a name for the attribute, used only in error messages.

        Raises:
            ValueError: when `attr` is not a valid language-qualified
                string, or when its language is empty.
        """
        kv: KgtkValue = KgtkValue(attr,
                                  options=self.value_options,
                                  parse_fields=False,
                                  error_file=self.error_file,
                                  verbose=self.verbose)
        if not kv.is_language_qualified_string(validate=True):
            raise ValueError("Invalid attr %s for %s" % (attr, who))

        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(kv.value)
        if len(language) == 0:
            raise ValueError("No attr language in %s for %s" % (attr, who))
        lang: str = language + language_suffix
        attr_map[lang] = {"language": lang, "value": text}
Пример #18
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_remove: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX,
        deduplicate_labels: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(label_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if label_kgtk_file is not None:
            print("--label-file=%s" % str(label_kgtk_file), file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_remove is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_remove),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-suffix=%s" % lift_suffix, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_labels, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to the index of the base column that supplies its node1 value.
        lower_map: typing.MutableMapping[int, int] = dict()

        # These columns will never be removed:
        key_column_idxs: typing.Set[int] = set(
            (kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx,
             kr.id_column_idx))
        key_column_idxs.discard(-1)
        key_column_names: typing.Set[str] = set(
            (kr.column_names[idx] for idx in key_column_idxs))

        base_name: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_remove is not None and len(
                columns_to_remove) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_remove) > 0 and len(base_columns) == len(columns_to_remove)
            # column_names and base_columns are paired.
            if len(columns_to_remove) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_remove), len(base_columns)))

            for idx, column_name in enumerate(columns_to_remove):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                lower_map[kr.column_name_map[
                    column_name]] = kr.column_name_map[base_name]

        elif columns_to_remove is not None and len(columns_to_remove) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_remove) > 0 and len(base_columns) == 0
            # Each column name is stripped of the lift suffix to determine the base name.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            for idx, column_name in enumerate(columns_to_remove):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if not column_name.endswith(lift_suffix):
                    raise KGTKException("Unable to parse column name %s." %
                                        repr(column_name))

                base_name = column_name[:-len(lift_suffix)]

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is not known" %
                        (repr(column_name), repr(base_name)))

                lower_map[kr.column_name_map[
                    column_name]] = kr.column_name_map[base_name]

        elif columns_to_remove is None or len(columns_to_remove) == 0:
            # Pattern 3: len(columns_to_remove) == 0.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node12, and id columns
                if idx in key_column_idxs:
                    continue

                # Does this column match a lifting pattern?
                for base_name in base_columns:
                    if len(base_name) == 0:
                        continue
                    if column_name == base_name + lift_suffix:
                        lower_map[idx] = kr.column_name_map[base_name]

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower.")

        if verbose:
            print("The following columns will be lowered",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_name = kr.column_names[lower_map[idx]]
                print(" %s from %s" % (column_name, base_name),
                      file=error_file,
                      flush=True)

        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if label_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(label_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  label_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the labels
        #  set(node1_value + KgtkFormat.SEPARATOR + node2_value)
        label_set: typing.Set[str] = set()
        label_key: str

        # If labels will be written to the output file and deduplication is enabled:
        check_existing_labels: bool = \
            deduplicate_labels and \
            lkw is None and \
            kr.node1_column_idx >= 0 and \
            kr.label_column_idx >= 0 and \
            kr.node2_column_idx >= 0

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if check_existing_labels and row[
                    kr.label_column_idx] == label_value:
                label_key = row[
                    kr.node1_column_idx] + KgtkFormat.COLUMN_SEPARATOR + row[
                        kr.node2_column_idx]
                if label_key in label_set:
                    continue
                else:
                    label_set.add(label_key)

            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            column_idx: int
            for column_idx in lower_map.keys():
                node1_value: str = row[lower_map[column_idx]]
                if len(node1_value) == 0:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if len(item) == 0:
                    continue  # Ignore empty node2 values.

                # Ths item might be a KGTK list.  Let's split it, because
                # lists aren't allow in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        continue  # Ignore empty node2 values.

                    if deduplicate_labels:
                        label_key = node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        KgtkFormat.NODE1: node1_value,
                        KgtkFormat.LABEL: label_value,
                        KgtkFormat.NODE2: node2_value,
                    }
                    if lkw is None:
                        kw.writemap(output_map)
                        label_line_count += 1
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)
                        label_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #19
0
    def load_labels(
        self,
        kr: KgtkReader,
        path: Path,
        save_input: bool = True,
        labels_needed: typing.Optional[typing.Set[str]] = None,
    ) -> typing.Tuple[typing.Mapping[str, str], typing.List[typing.List[str]]]:
        """Read every row from a reader, building a label table on the way.

        A row is a label definition when the value in its select column
        equals self.label_select_column_value.  For such rows, the match
        column supplies the key and the value column supplies the label.
        Empty labels are ignored.  When a key is seen again, the new label is
        combined with the stored one: through KgtkValue.merge_values() when
        self.suppress_duplicate_labels is set, otherwise by joining with
        KgtkFormat.LIST_SEPARATOR.

        Args:
            kr: the reader to iterate over.
            path: used only in the verbose "Loading labels" message.
            save_input: when true, rows are collected and returned.  Label
                definition rows are collected only if
                self.remove_label_records is false.
            labels_needed: when not None, a key is entered into the table
                only if it is a member of this set.

        Returns:
            A (labels, input_rows) pair: the label table and the collected
            rows (empty when save_input is false).
        """
        match_idx: int
        select_idx: int
        value_idx: int
        match_idx, select_idx, value_idx = self.lookup_label_table_idxs(kr)

        if self.verbose:
            print("Loading labels from %s" % path,
                  file=self.error_file,
                  flush=True)
            if labels_needed is not None:
                print("Filtering for needed labels",
                      file=self.error_file,
                      flush=True)
            # Report each of the three column indexes with its column name.
            for tag, idx in (("label_match_column_idx", match_idx),
                             ("label_select_column_idx", select_idx),
                             ("label_value_column_idx", value_idx)):
                print("%s=%d (%s)." % (tag, idx, kr.column_names[idx]),
                      file=self.error_file,
                      flush=True)
            print("label_select_column_value='%s'." %
                  self.label_select_column_value,
                  file=self.error_file,
                  flush=True)

        found_labels: typing.MutableMapping[str, str] = {}
        kept_rows: typing.List[typing.List[str]] = []

        record: typing.List[str]
        for record in kr:
            if record[select_idx] != self.label_select_column_value:
                # An ordinary row: keep it if the caller wants the input back.
                if save_input:
                    kept_rows.append(record)
                continue

            # This is a label definition row.
            node: str = record[match_idx]
            text: str = record[value_idx]
            if text:
                if node in found_labels:
                    # The key is already in the table: combine the labels.
                    if self.suppress_duplicate_labels:
                        found_labels[node] = KgtkValue.merge_values(
                            found_labels[node], text)
                    else:
                        found_labels[node] = KgtkFormat.LIST_SEPARATOR.join(
                            (found_labels[node], text))
                elif labels_needed is None or node in labels_needed:
                    # First definition of this key (and it passed the filter).
                    found_labels[node] = text

            if save_input and not self.remove_label_records:
                kept_rows.append(record)

        return found_labels, kept_rows
Пример #20
0
    def process_qual_datavalue(self, value: str, qual_row: typing.List[str],
                               datatype: str):
        """Convert a qualifier's KGTK value into a Wikidata-style datavalue map.

        The result has the shape ``{"type": <value type>, "value": {...}}``.
        The value type is copied from qual_row; the inner map depends on the
        kind of value: entity reference, number, quantity, language-qualified
        string, string, date and time, location coordinates, or (as a
        fallback) a symbol rendered as a string.

        Args:
            value: the KGTK-formatted value to convert.
            qual_row: the qualifier row.  The value type, entity type,
                calendar and precision are read from it through the
                self.qual_*_idx column indexes.
            datatype: not used by this method.

        Returns:
            The datavalue mapping described above.

        Raises:
            ValueError: when the parsed value has no fields, when a field
                required by the value's data type is missing, when a date's
                time zone is not "Z", or when the numeric part of an entity
                ID cannot be converted to an int.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = qual_row[self.qual_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        # A non-empty entity type marks an entity reference; no KGTK value
        # parsing is needed for those.
        entity_type: str = qual_row[self.qual_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # Drop the one-character prefix, and anything from the first "-"
            # onward, to obtain the numeric ID.
            # TODO: Is this the right thing to do for Q16097-F1?
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but processing continues.
            # raise ValueError("Invalid KGTK value '%s'" % value)
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value %s is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr for %s." % value)

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr for %s." %
                                 value)
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: Research this further.  Why did we get here?  Is it because import_wikidata
                # dropped the units?
                #
                # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row)))
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            # NOTE(review): the Wikidata JSON format names this key
            # "upperBound"; "higherBound" is kept because consumers of this
            # output may depend on it -- confirm.
            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing from %s." % value)
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported, error in %s." %
                                 value)

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing from %s." % value)
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError(
                    "date_and_time precision is missing from %s." % value)
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[
                    self.qual_calendar_idx]
            return datavalue

        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing from %s" % value)
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing from %s" % value)
            valuemap["longitude"] = kv.fields.longitude

            valuemap["altitude"] = None  # deprecated

            # The precision column may be empty or malformed.  Rather than
            # failing the whole conversion, leave "precision" out (and report
            # a malformed value).
            # TODO: Validate that it's OK to have location coordinates without precision.
            precision: str = qual_row[self.qual_precision_idx]
            if len(precision) > 0:
                try:
                    valuemap["precision"] = float(precision)
                except ValueError:
                    print("Invalid precision '%s'" % precision,
                          file=self.error_file,
                          flush=True)

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.  The symbol is wrapped in
        # double quotes before being handed to unstringify().
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
Пример #21
0
    def process_edge_datavalue(self, value: str, edge_row: typing.List[str],
                               datatype: str):
        """Convert an edge's KGTK value into a Wikidata-style datavalue map.

        The result has the shape ``{"type": <value type>, "value": {...}}``.
        The value type is copied from edge_row; the inner map depends on the
        kind of value: entity reference, number, quantity, language-qualified
        string, string, date and time, location coordinates, or (as a
        fallback) a symbol rendered as a string.

        Args:
            value: the KGTK-formatted value to convert.
            edge_row: the edge row.  The value type, entity type, calendar
                and precision are read from it through the self.edge_*_idx
                column indexes.
            datatype: not used by this method.

        Returns:
            The datavalue mapping described above.

        Raises:
            ValueError: when the parsed value has no fields, when a field
                required by the value's data type is missing, when a date's
                time zone is not "Z", or when the numeric part of an entity
                ID cannot be converted to an int.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = edge_row[self.edge_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        # A non-empty entity type marks an entity reference; no KGTK value
        # parsing is needed for those.
        entity_type: str = edge_row[self.edge_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # Drop the one-character prefix, and anything from the first "-"
            # onward, to obtain the numeric ID.
            # TODO: Is this the right thing to do?
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but processing continues.
            # raise ValueError("Invalid KGTK value '%s'" % value)
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value '%s' is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr.")

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr.")
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: research this further.
                #
                # raise ValueError("quantity is missing units_node for %s." % value)
                # The key must be "unit" (it was misspelled "init"), so that
                # a quantity always carries a unit entry.
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            # NOTE(review): the Wikidata JSON format names this key
            # "upperBound"; "higherBound" is kept because consumers of this
            # output may depend on it -- confirm.
            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing.")
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported.")

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing.")
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError("date_and_time precision is missing.")
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + edge_row[
                    self.edge_calendar_idx]
            return datavalue

        # The predicate must be *called*: a bare bound method is always
        # truthy, which would send every remaining value down this branch and
        # make the default case below unreachable.
        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing")
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing")
            valuemap["longitude"] = kv.fields.longitude

            valuemap["altitude"] = None  # deprecated

            # The precision column may be empty or malformed; in either case
            # "precision" is left out of the result.
            # TODO: Validate that it's OK to have location coordinates without precision.
            precision: str = edge_row[self.edge_precision_idx]
            if len(precision) > 0:
                try:
                    valuemap["precision"] = float(precision)
                except ValueError:
                    print("Invalid precision '%s'" % precision,
                          file=self.error_file,
                          flush=True)

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: treat as string.  The value reaching this point is not a
        # KGTK string, so it is wrapped in double quotes first -- presumably
        # unstringify() expects a quoted KGTK string; TODO confirm.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
Пример #22
0
    def process_as_merge(self, ikr: KgtkReader, lkr: KgtkReader):
        """
        Process the lift as a merge between two sorted files.

        Each input row is copied to the output with one lifted label value
        stored at the lifted output column (appending a new column first if
        the output has one more column than the input).  The label value is
        found by advancing through the label file in step with the input:
        label rows whose node1 value sorts below the value being lifted are
        skipped, and label rows whose node1 value equals it (and whose label
        column equals self.label_column_value) contribute their node2 value.
        Multiple labels for the same value are combined into one value.

        This presumably requires both files to be sorted on the compared
        columns in plain string order, since the code compares them with
        "<" and "==" -- verify against the caller.

        Args:
            ikr: reader for the input file.
            lkr: reader for the label file.

        Raises:
            ValueError: if there is not exactly one column to lift, or if
                the output does not have either zero or one new columns.
        """
        if self.verbose:
            print("Merging sorted input and label files.",
                  file=self.error_file,
                  flush=True)
        lift_column_idxs: typing.List[int] = self.build_lift_column_idxs(ikr)
        if len(lift_column_idxs) != 1:
            raise ValueError("Expecting exactly one lift_column_idxs, got %d" %
                             len(lift_column_idxs))

        ew: KgtkWriter
        lifted_output_column_idxs: typing.List[int]
        ew, lifted_output_column_idxs = self.open_output_writer(
            ikr, lift_column_idxs)

        new_columns: int = len(ew.column_names) - len(ikr.column_names)
        if new_columns not in (0, 1):
            raise ValueError("Expecting zero or one new columns, got %d" %
                             new_columns)

        lift_column_idx: int = lift_column_idxs[0]  # For convenience
        lifted_output_column_idx: int = lifted_output_column_idxs[
            0]  # For convenience

        node1_column_idx: int
        label_column_idx: int
        node2_column_idx: int
        node1_column_idx, label_column_idx, node2_column_idx = self.lookup_label_table_idxs(
            lkr)

        current_label_row: typing.Optional[typing.List[str]] = None
        more_labels: bool = True
        # Read the first label record.
        try:
            current_label_row = lkr.nextrow()
        except StopIteration:
            more_labels = False

        input_line_count: int = 0

        # We carry last_value_to_lift and lifted_label_value over
        # iterations in case the input file has multiple records with
        # the same value to lift.
        last_value_to_lift: typing.Optional[str] = None
        lifted_label_value: str = ""

        if self.verbose:
            print("Processing the input records.",
                  file=self.error_file,
                  flush=True)

        row: typing.List[str]
        for row in ikr:
            input_line_count += 1
            value_to_lift: str = row[lift_column_idx]
            if last_value_to_lift is None or value_to_lift != last_value_to_lift:
                # A new value: forget the previous label and look it up.
                last_value_to_lift = value_to_lift
                lifted_label_value = ""

                # Read label records until we come to the first record that
                # has a node1 value equal to or greater than the value we want to lift.
                while more_labels and current_label_row is not None and current_label_row[
                        node1_column_idx] < value_to_lift:
                    try:
                        current_label_row = lkr.nextrow()
                    except StopIteration:
                        more_labels = False
                        break

                # While the label records have node1 values equal to the value we are trying to lift,
                # look for label values from the label file.
                while more_labels and current_label_row is not None and current_label_row[
                        node1_column_idx] == value_to_lift:
                    if current_label_row[
                            label_column_idx] == self.label_column_value:
                        label_value: str = current_label_row[node2_column_idx]
                        if len(label_value) > 0:
                            if len(lifted_label_value) > 0:
                                # A second (or later) label for this value:
                                # combine it with what we already have.
                                if self.suppress_duplicate_labels:
                                    lifted_label_value = KgtkValue.merge_values(
                                        lifted_label_value, label_value)
                                else:
                                    lifted_label_value = KgtkFormat.LIST_SEPARATOR.join(
                                        (lifted_label_value, label_value))
                            else:
                                lifted_label_value = label_value

                    try:
                        current_label_row = lkr.nextrow()
                    except StopIteration:
                        more_labels = False
                        break

            output_row: typing.List[str] = row.copy()
            if new_columns > 0:
                output_row.append("")
            output_row[lifted_output_column_idx] = lifted_label_value
            ew.write(output_row)

        # The input ran out before the label file did: close the label
        # reader explicitly since it was not read to the end.
        if more_labels:
            lkr.close()

        if self.verbose:
            print("Read %d input records." % (input_line_count),
                  file=self.error_file,
                  flush=True)

        # The writer was opened by this method and is not handed to anyone
        # else, so it must be closed here to make sure the output is complete.
        ew.close()
Пример #23
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Rewrite the columns of each input row as (node1, label, node2) edges.

    For every input row, the ID column supplies node1.  Each other column
    (restricted to `columns` when that is given) with a non-empty value
    produces output edges: the label is the column name, or its replacement
    from `labels`, and node2 is each non-empty element of the column value
    after splitting it as a KGTK list.

    Args:
        input_file: the input file selection.
        output_file: the output file selection.
        columns: optional list of column names to convert; all non-ID
            columns are converted when this is None.
        labels: optional replacement labels, paired position by position
            with `columns`.
        id_column_name: optional name of the ID column.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: not read by this function.
        show_options: print the options before processing.
        verbose: print progress messages.
        very_verbose: passed through to the reader and the writer.
        **kwargs: used to build the reader and value option structures.

    Returns:
        0 on success; 1 after an exception raised during processing has been
        passed to kgtk_exception_auto_handler().

    Raises:
        KGTKException: when `labels` is given without `columns`, or when
            the two lists differ in length.
    """
    # import modules locally
    import os

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages go to stderr unless stdout was asked for.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)

        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file,
              flush=True)

    # Map column names to their replacement labels, pairing the two lists
    # position by position.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." %
                                (len(columns), len(labels)))
        label_map.update(zip(columns, labels))

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_column_idx: int = kr.get_id_column_index(id_column_name)
        if id_column_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            [KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2],
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            verbose=verbose,
            very_verbose=very_verbose)

        # Decide once which columns to convert and which label each one
        # gets: everything but the ID column, limited to `columns` if given.
        selected: typing.List[typing.Tuple[int, str]] = [
            (idx, label_map.get(name, name))
            for idx, name in enumerate(kr.column_names)
            if idx != id_column_idx and (columns is None or name in columns)
        ]

        rows_read: int = 0
        edges_written: int = 0
        record: typing.List[str]
        for record in kr:
            rows_read += 1
            subject: str = record[id_column_idx]

            for source_idx, predicate in selected:
                cell: str = record[source_idx]
                if len(cell) == 0:
                    continue  # ignore empty values.

                # The column value might contain a KGTK list.  Since node2 isn't supposed
                # to contain lists, we'll split it.
                for piece in KgtkValue.split_list(cell):
                    if len(piece) == 0:
                        continue  # node2 shouldn't contain empty values
                    kw.write([subject, predicate, piece])
                    edges_written += 1

        if verbose:
            print("Read %d node rows, wrote %d edge rows." %
                  (rows_read, edges_written),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #24
0
    def process(self):
        if len(self.column_name) == 0:
            raise ValueError("The name of the column to explode is empty.")

        selected_field_names: typing.List[str] = []
        field_name: str

        if self.type_names is not None:
            if self.verbose:
                print("Validate the names of the data types to extract.",
                      file=self.error_file,
                      flush=True)
            type_name: str
            for type_name in self.type_names:
                if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                    raise ValueError("Unknown data type name '%s'." %
                                     type_name)
                # Merge this KGTK data type's fields into the list of selected fields:
                for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[
                        type_name]:
                    if field_name not in selected_field_names:
                        selected_field_names.append(field_name)

        if self.field_names is not None:
            # Forget the fields selected above, choose these instead:
            selected_field_names = []
            if self.verbose:
                print("Validate the names of the fields to extract.",
                      file=self.error_file,
                      flush=True)
            for field_name in self.field_names:
                if field_name not in KgtkValueFields.FIELD_NAMES:
                    raise ValueError("Unknown field name '%s'." % field_name)
                # Merge this field into the list of selected fields:
                if field_name not in selected_field_names:
                    selected_field_names.append(field_name)

        if len(selected_field_names) == 0:
            raise ValueError("The list of fields to explode is empty.")

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Check that the source column '%s' is present." %
                  self.column_name,
                  file=self.error_file,
                  flush=True)
        if self.column_name not in kr.column_name_map:
            raise ValueError("Column name '%s' not found in the input file." %
                             self.column_name)
        column_idx: int = kr.column_name_map[self.column_name]

        if self.verbose:
            print(
                "Build the map of exploded columns and list of new column names",
                file=self.error_file,
                flush=True)
        explosion: typing.MutableMapping[str, idx] = {}
        column_names: typing.List[str] = kr.column_names.copy()
        for field_name in selected_field_names:
            exploded_name: str = self.prefix + field_name
            if self.verbose:
                print("Field '%s' becomes '%s'" % (field_name, exploded_name),
                      file=self.error_file,
                      flush=True)
            if exploded_name in explosion:
                raise ValueError(
                    "Field name '%s' is duplicated in the field list.")
            if exploded_name in kr.column_names:
                if self.overwrite_columns:
                    existing_idx = kr.column_name_map[exploded_name]
                    explosion[field_name] = existing_idx
                    if self.verbose:
                        print(
                            "Field '%s' is overwriting existing column '%s' (idx=%d)"
                            % (field_name, exploded_name, existing_idx),
                            file=self.error_file,
                            flush=True)
                else:
                    raise ValueError(
                        "Exploded column '%s' already exists and not allowed to overwrite"
                        % exploded_name)
            else:
                column_names.append(exploded_name)
                exploded_idx: int = len(column_names) - 1
                explosion[field_name] = exploded_idx
                if self.verbose:
                    print("Field '%s' becomes new column '%s' (idx=%d)" %
                          (field_name, exploded_name, exploded_idx),
                          file=self.error_file,
                          flush=True)
        new_column_count: int = len(column_names) - kr.column_count
        if self.verbose:
            print("%d columns + %d columns = %d columns" %
                  (kr.column_count, new_column_count, len(column_names)))
            print("Explosion length: %d" % len(explosion))

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(column_names,
                                         self.output_file_path,
                                         mode=kr.mode,
                                         output_format=self.output_format,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        if self.verbose:
            print("Expanding records from %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # Parse the value for the column being exploded:
            item_to_explode: str = row[column_idx]
            value: KgtkValue = KgtkValue(item_to_explode,
                                         options=self.value_options,
                                         parse_fields=True)
            value.validate()
            if not value.is_valid():
                if self.verbose:
                    print("Not exploding invalid item '%s' in input line %d" %
                          (item_to_explode, input_line_count),
                          file=self.error_file,
                          flush=True)
                ew.write(row)  # This will be filled to the proper length
                output_line_count += 1
                continue

            if self.expand_list and value.is_list():
                if self.verbose:
                    print("Expanding a list: '%s'" % item_to_explode,
                          file=self.error_file,
                          flush=True)
                subvalue: KgtkValue
                for subvalue in value.get_list_items():
                    if self.very_verbose:
                        print("Exploding '%s'" % subvalue.value)
                    ew.write(
                        self.explode(subvalue, row, explosion,
                                     new_column_count))
                    output_line_count += 1
            else:
                if self.very_verbose:
                    print("Exploding '%s'" % value.value)
                ew.write(self.explode(value, row, explosion, new_column_count))
                output_line_count += 1

        if self.verbose:
            print("Read %d records, wrote %d records." %
                  (input_line_count, output_line_count),
                  file=self.error_file,
                  flush=True)

        ew.close()
Пример #25
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],

        column_names_list: typing.List[typing.List[str]],
        into_column_names_list: typing.List[typing.List[str]],
        operation: str,
        values_list: typing.List[typing.List[str]],
        with_values_list: typing.List[typing.List[str]],
        limit: typing.Optional[int],
        format_string: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    # import modules locally
    import datetime as dt
    from pathlib import Path
    import re
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions
    from kgtk.value.kgtkvalue import KgtkValue

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Flatten the input lists.
    column_names: typing.List[str] = flatten_arg_list(column_names_list)
    into_column_names: typing.List[str] = flatten_arg_list(into_column_names_list)
    values: typing.List[str] = flatten_arg_list(values_list)
    with_values: typing.List[str] = flatten_arg_list(with_values_list)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format, file=error_file, flush=True)
        if len(column_names) > 0:
            print("--columns %s" % " ".join(column_names), file=error_file, flush=True)
        if len(into_column_names) > 0:
            print("--into %s" % " ".join(into_column_names), file=error_file, flush=True)
        print("--operation=%s" % str(operation), file=error_file, flush=True)
        if len(values) > 0:
            print("--values %s" % " ".join(values), file=error_file, flush=True)
        if len(with_values) > 0:
            print("--with-values %s" % " ".join(with_values), file=error_file, flush=True)
        if limit is not None:
            print("--limit %d" % limit, file=error_file, flush=True)
        if format_string is not None:
            print("--format=%s" % format_string, file=error_file, flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr = KgtkReader.open(input_kgtk_file,
                             options=reader_options,
                             value_options = value_options,
                             error_file=error_file,
                             verbose=verbose,
                             very_verbose=very_verbose,
        )

        remaining_names: typing.List[str] = kr.column_names.copy()
        selected_names: typing.List[str] = [ ]
        save_selected_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..." # All unmentioned columns
        ranger: str = ".." # All columns between two columns.

        idx: int

        saw_ranger: bool = False
        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_selected_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException("Elipses may not appear directly after a range operator ('..').")

                save_selected_names = selected_names
                selected_names = [ ]
                continue

            if column_name == ranger:
                if len(selected_names) == 0:
                    raise KGTKException("The column range operator ('..') may not appear without a preceeding column name.")
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException("Column name '%s' was duplicated in the list." % column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = selected_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]
                start_idx: int
                end_idx: int
                idx_inc: int
                if column_name_idx > prior_column_idx:
                    start_idx = prior_column_idx + 1
                    end_idx = column_name_idx - 1
                    idx_inc = 1
                else:
                    start_idx = prior_column_idx - 1
                    end_idx = column_name_idx + 1
                    idx_inc = -1

                idx = start_idx
                while idx <= end_idx:
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        raise KGTKException("Column name '%s' (%s .. %s) was duplicated in the list." % (column_name, prior_column_name, column_name))
                   
                    selected_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)
                    idx += idx_inc

            selected_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException("The column ranger operator ('..') may not end the list of column names.")

        if len(remaining_names) > 0 and save_selected_names is None:
            if verbose:
                print("Omitting the following columns: %s" % " ".join(remaining_names), file=error_file, flush=True)
        if save_selected_names is not None:
            if len(remaining_names) > 0:
                save_selected_names.extend(remaining_names)
            if len(selected_names) > 0:
                save_selected_names.extend(selected_names)
            selected_names = save_selected_names

        sources: typing.List[int] = [ ]
        name: str
        for name in selected_names:
            sources.append(kr.column_name_map[name])

        new_column_count: int = 0
        into_column_idxs: typing.List[int] = [ ]
        into_column_idx: int
        output_column_names: typing.List[str] = kr.column_names.copy()
        into_column_name: str
        for idx, into_column_name in enumerate(into_column_names):
            if into_column_name in kr.column_name_map:
                into_column_idx = kr.column_name_map[into_column_name]
                into_column_idxs.append(into_column_idx)
                if verbose:
                    print("Putting result %d of the calculation into old column %d (%s)." % (idx + 1, into_column_idx, into_column_name), file=error_file, flush=True)
            else:
                new_column_count += 1
                into_column_idx = len(output_column_names)
                into_column_idxs.append(into_column_idx)
                output_column_names.append(into_column_name)
                if verbose:
                    print("Putting result %d of the calculation into new column %d (%s)." % (idx + 1, into_column_idx, into_column_name), file=error_file, flush=True)

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         gzip_in_parallel=False,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         output_format=output_format,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
        )

        if limit is None:
            limit = 0

        substitute_re: typing.Optional[typing.Pattern] = None

        if operation == AND_OP:
            if len(sources) == 0:
                raise KGTKException("And needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("And needs 1 destination column, got %d" % len(into_column_idxs))

        elif operation == AVERAGE_OP:
            if len(sources) == 0:
                raise KGTKException("Average needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Average needs 1 destination column, got %d" % len(into_column_idxs))

        elif operation == CAPITALIZE_OP:
            if len(sources) == 0:
                raise KGTKException("Capitalize needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Capitalize needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == CASEFOLD_OP:
            if len(sources) == 0:
                raise KGTKException("Casefold needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Casefold needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == COPY_OP:
            if len(sources) == 0:
                raise KGTKException("Copy needs at least one source, got %d" % len(sources))
            if len(selected_names) != len(into_column_idxs):
                raise KGTKException("Copy needs the same number of input columns and into columns, got %d and %d" % (len(selected_names), len(into_column_idxs)))

        elif operation == EQ_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Eq needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Eq needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == FROMISOFORMAT_OP:
            if len(sources) != 1:
                raise KGTKException("Fromisoformat needs one source, got %d" % len(sources))
            if len(values) != len(into_column_idxs):
                raise KGTKException("Fromisoformat needs the same number of values and into columns, got %d and %d" % (len(values), len(into_column_idxs)))

        elif operation == GE_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Ge needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Ge needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == GT_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Gt needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Gt needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == IS_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Is needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Is needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == IS_IN_OP:
            if len(sources) != 1:
                raise KGTKException("Is in needs one source, got %d" % len(sources))
            if len(values) == 0:
                raise KGTKException("Is in needs at least one value, got %d" % len(values))
            if len(into_column_idxs) != 1:
                raise KGTKException("Is in needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == IS_NOT_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Is not needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Is not needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == JOIN_OP:
            if len(sources) == 0:
                raise KGTKException("Join needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Join needs 1 destination columns, got %d" % len(into_column_idxs))
            if len(values) != 1:
                raise KGTKException("Join needs 1 value, got %d" % len(values))

        elif operation == LE_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Le needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Le needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == LT_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Lt needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Lt needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == LOWER_OP:
            if len(sources) == 0:
                raise KGTKException("Lower needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Lower needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == MAX_OP:
            if len(sources) == 0:
                raise KGTKException("Max needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Max needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == MIN_OP:
            if len(sources) == 0:
                raise KGTKException("Min needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Min needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == NE_OP:
            if (len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1):
                raise KGTKException("Ne needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
            if len(into_column_idxs) != 1:
                raise KGTKException("Ne needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == NOR_OP:
            if len(sources) == 0:
                raise KGTKException("Nor needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Nor needs 1 destination column, got %d" % len(into_column_idxs))

        elif operation == NOT_OP:
            if len(sources) == 0:
                raise KGTKException("Not needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != len(sources):
                raise KGTKException("Nand needs the same number of input columns and into colums, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == OR_OP:
            if len(sources) == 0:
                raise KGTKException("Or needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Or needs 1 destination column, got %d" % len(into_column_idxs))

        elif operation == PERCENTAGE_OP:
            if len(into_column_idxs) != 1:
                raise KGTKException("Percent needs 1 destination columns, got %d" % len(into_column_idxs))
            if len(selected_names) != 2:
                raise KGTKException("Percent needs 2 input columns, got %d" % len(selected_names))

        elif operation == REPLACE_OP:
            if len(into_column_idxs) != 1:
                raise KGTKException("Replace needs 1 destination column, got %d" % len(into_column_idxs))
            if len(selected_names) != 1:
                raise KGTKException("Replace needs 1 input column, got %d" % len(selected_names))
            if len(values) != 1:
                raise KGTKException("Replace needs one value, got %d" % len(values))
            if len(with_values) != 1:
                raise KGTKException("Replace needs one with-value, got %d" % len(with_values))

        elif operation == SET_OP:
            if len(sources) != 0:
                raise KGTKException("Set needs no sources, got %d" % len(sources))
            if len(into_column_idxs) == 0:
                raise KGTKException("Set needs at least one destination column, got %d" % len(into_column_idxs))
            if len(values) == 0:
                raise KGTKException("Set needs at least one value, got %d" % len(values))
            if len(into_column_idxs) != len(values):
                raise KGTKException("Set needs the same number of destination columns and values, got %d and %d" % (len(into_column_idxs), len(values)))

        elif operation == SUBSTITUTE_OP:
            if len(into_column_idxs) != 1:
                raise KGTKException("Substitute needs 1 destination column, got %d" % len(into_column_idxs))
            if len(selected_names) != 1:
                raise KGTKException("Substitute needs 1 input column, got %d" % len(selected_names))
            if len(values) != 1:
                raise KGTKException("Substitute needs one value, got %d" % len(values))
            if len(with_values) != 1:
                raise KGTKException("Substitute needs one with-value, got %d" % len(with_values))
            substitute_re = re.compile(values[0])

        elif operation == SUM_OP:
            if len(sources) == 0:
                raise KGTKException("Sum needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Sum needs 1 destination columns, got %d" % len(into_column_idxs))

        elif operation == SWAPCASE_OP:
            if len(sources) == 0:
                raise KGTKException("Swapcase needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Swapcase needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == TITLE_OP:
            if len(sources) == 0:
                raise KGTKException("Title needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Title needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == UPPER_OP:
            if len(sources) == 0:
                raise KGTKException("Upper needs at least one source, got %d" % len(sources))
            if len(sources) != len(into_column_idxs):
                raise KGTKException("Upper needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

        elif operation == XOR_OP:
            if len(sources) == 0:
                raise KGTKException("Xor needs at least one source, got %d" % len(sources))
            if len(into_column_idxs) != 1:
                raise KGTKException("Xor needs 1 destination column, got %d" % len(into_column_idxs))


        fs: str = format_string if format_string is not None else "%5.2f"
        item: str
        item2: str
        kv: KgtkValue
        bresult: bool

        into_column_idx = into_column_idxs[0] # for convenience

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1

            output_row: typing.List[str] = row.copy()
            for idx in range(new_column_count):
                output_row.append("") # Easiest way to add a new column.

            if operation == AND_OP:
                bresult = True
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        bresult = bresult and kv.is_true()

                output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

            elif operation == AVERAGE_OP:
                atotal: float = 0
                acount: int = 0
                for idx in sources:
                    item = row[idx]
                    if len(item) > 0:
                        atotal += float(item)
                        acount += 1
                output_row[into_column_idx] = (fs % (atotal / float(acount))) if acount > 0 else ""                

            elif operation == CAPITALIZE_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].capitalize()

            elif operation == CASEFOLD_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].casefold()

            elif operation == COPY_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]]

            elif operation == EQ_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) == float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) == float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == FROMISOFORMAT_OP:
                dtval: str = row[sources[0]]
                if dtval.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
                    kgtkdatestr: str = row[sources[0]][1:] # Strip the leading ^
                    isodatestr: str
                    precisionstr: str
                    if "/" in kgtkdatestr:
                        isodatestr, precisionstr = kgtkdatestr.split("/")
                    else:
                        isodatestr = kgtkdatestr
                        precisionstr = ""
                    if isodatestr.endswith("Z"):
                        isodatestr = isodatestr[:-1]

                    into_idx: int
                    value_name: str
                    try:
                        dtvar: dt.datetime = dt.datetime.fromisoformat(isodatestr)
                        for idx in range(len(values)):
                            value_name = values[idx]
                            into_idx = into_column_idxs[idx]
                            
                            if value_name == "year":
                                output_row[into_idx] = str(dtvar.year)

                            elif value_name == "month":
                                output_row[into_idx] = str(dtvar.month)
                    
                            elif value_name == "day":
                                output_row[into_idx] = str(dtvar.day)

                            elif value_name == "hour":
                                output_row[into_idx] = str(dtvar.hour)
                    
                            elif value_name == "minute":
                                output_row[into_idx] = str(dtvar.minute)
                    
                            elif value_name == "second":
                                output_row[into_idx] = str(dtvar.second)
                    
                            elif value_name == "microsecond":
                                output_row[into_idx] = str(dtvar.microsecond)

                            elif value_name == "error":
                                output_row[into_idx] = ""

                            else:
                                raise KGTKException("Unknown date component %s" % repr(value_name))

                    except ValueError as e:
                        print("Error parsing %s in [%s]: %s" % (repr(isodatestr), "|".join([repr(x) for x in row]), str(e)),
                              file=error_file, flush=True)

                        for idx in range(len(values)):
                            value_name = values[idx]
                            into_idx = into_column_idxs[idx]
                            if value_name == "error":
                                output_row[into_idx] = str(e)
                            else:
                                output_row[into_idx] = ""

                else:
                    # Not a date/time value, clear the result columns.
                    for idx in range(len(values)):
                        output_row[into_column_idxs[idx]] = ""
                    
            elif operation == GE_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) >= float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) >= float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == GT_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) > float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) > float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == IS_OP:
                if len(sources) == 1:
                    output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] == row[sources[1]])
                else:
                    output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] == values[0])

            elif operation == IS_IN_OP:
                bresult = False
                item = row[sources[0]]
                for item2 in values:
                    if item == item2:
                        bresult = True
                        break
                output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

            elif operation == IS_NOT_OP:
                if len(sources) == 1:
                    output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] != row[sources[1]])
                else:
                    output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] != values[0])

            elif operation == JOIN_OP:
                output_row[into_column_idx] = values[0].join((row[sources[idx]] for idx in range(len(sources))))

            elif operation == LE_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) <= float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) <= float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == LT_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) < float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) < float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == LOWER_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].lower()

            elif operation == MAX_OP:
                max_result: typing.Optional[float] = None
                for idx in sources:
                    item = row[idx]
                    if len(item) > 0:
                        max_value: float = float(item)
                        if max_result is None or max_value > max_result:
                            max_result = max_value
                output_row[into_column_idx] = (fs % max_result) if max_result is not None else ""

            elif operation == MIN_OP:
                min_result: typing.Optional[float] = None
                for idx in sources:
                    item = row[idx]
                    if len(item) > 0:
                        min_value: float = float(item)
                        if min_result is None or min_value < min_result:
                            min_result = min_value
                output_row[into_column_idx] = (fs % min_result) if min_result is not None else ""

            elif operation == NAND_OP:
                bresult = True
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        bresult = bresult and kv.is_true()

                output_row[into_column_idx] = KgtkValue.to_boolean(not bresult)

            elif operation == NE_OP:
                if len(sources) == 1:
                    if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) != float(row[sources[1]]))
                    else:
                        output_row[into_column_idx] = ""
                else:
                    if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                        output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) != float(values[0]))
                    else:
                        output_row[into_column_idx] = ""

            elif operation == NOR_OP:
                bresult = False
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        bresult = bresult or kv.is_true()

                output_row[into_column_idx] = KgtkValue.to_boolean(not bresult)

            elif operation == NOT_OP:
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        output_row[into_column_idxs[idx]] = KgtkValue.to_boolean(not kv.is_true())
                    else:
                        output_row[into_column_idxs[idx]] = ""

            elif operation == OR_OP:
                bresult = False
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        bresult = bresult or kv.is_true()

                output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

            elif operation == PERCENTAGE_OP:
                output_row[into_column_idx] = fs % (float(row[sources[0]]) * 100 / float(row[sources[1]]))

            elif operation == REPLACE_OP:
                if limit == 0:
                    output_row[into_column_idx] = row[sources[0]].replace(values[0], with_values[0])
                else:
                    output_row[into_column_idx] = row[sources[0]].replace(values[0], with_values[0], limit)

            elif operation == SET_OP:
                for idx in range(len(values)):
                    output_row[into_column_idxs[idx]] = values[idx]

            elif operation == SUBSTITUTE_OP and substitute_re is not None:
                output_row[into_column_idx] = substitute_re.sub(with_values[0], row[sources[0]], count=limit)

            elif operation == SUM_OP:
                total: float = 0
                for idx in sources:
                    item = row[idx]
                    if len(item) > 0:
                        total += float(item)
                for item in values:
                    if len(item) > 0:
                        total += float(item)
                output_row[into_column_idx] = fs % total
                
            elif operation == SWAPCASE_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].swapcase()

            elif operation == TITLE_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].title()

            elif operation == UPPER_OP:
                for idx in range(len(sources)):
                    output_row[into_column_idxs[idx]] = row[sources[idx]].upper()

            elif operation == XOR_OP:
                bresult = False
                for idx in sources:
                    kv = KgtkValue(row[idx])
                    if kv.is_boolean():
                        bresult = bresult != kv.is_true()

                output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

            kw.write(output_row)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" % (input_data_lines, input_kgtk_file), file=error_file, flush=True)

        kw.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
Пример #26
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Lower and/or normalize extra columns of a KGTK file into new edges.

    Every input row is copied to the output file without the selected
    columns.  For each selected column holding a non-empty value, one new
    edge per list item is written: node1 comes from the column's base
    column, the label is the column's label value, and node2 is the item.
    New edges go to the new edges file when one is given, otherwise to the
    main output file.

    The selected columns are determined by one of three option patterns:

    1. `columns_to_lower` and `base_columns` both given: they are paired
       by position.  A pair whose base is the id column (with `normalize`)
       uses the column name as the label; other pairs require `lower` and
       use `label_value`.
    2. Only `columns_to_lower` given: with `lower`, a column name that
       contains `lift_separator` is split into base name and label;
       otherwise, with `normalize`, the id column is the base and the
       column name is the label.
    3. `columns_to_lower` omitted: every non-key column is examined using
       the same rules as pattern 2, with `base_columns` (default: the key
       columns found in the input) limiting the acceptable base names.

    Args:
        input_file: The input KGTK file specification.
        output_file: The output KGTK file specification.
        new_edges_file: Optional separate output file for the new edges.
        base_columns: Base column names (see the patterns above).
        columns_to_lower: Names of the columns to remove.
        label_value: Label for new edges in pattern 1.
        lift_separator: Separator between base name and label in a
            lifted column name.
        lower: Enable lowering of lifted columns.
        normalize: Enable normalization against the id column.
        deduplicate_new_edges: Write each distinct (node1, label, node2)
            new edge only once.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; it is not
            read by this function.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and the writers.
        **kwargs: Used to build the reader and value option structures.

    Returns:
        0 on success, or 1 after an exception raised during processing has
        been passed to kgtk_exception_auto_handler().

    Raises:
        KGTKException: If neither `lower` nor `normalize` is requested
            (this check runs before the handled section).
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_lower),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of each column being removed to a pair: the index
        # of the base column that supplies the node1 value, and the label
        # value to use for the new edges.
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        # The key columns present in the input; they can never be removed.
        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            # Normalization uses the id column as the base column.
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # column_names and base_columns are paired. New records use label_value.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_lower), len(base_columns)))

            if len(label_value) == 0:
                raise KGTKException("The --label-value must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    # Normalizing: the column name itself becomes the label.
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], label_value)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being removed.
        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)

        lkw: typing.Optional[KgtkWriter] = None

        # From here on at least one writer is open.  The finally clause
        # closes the writers on every path, so a failure while opening the
        # new edges file or while processing rows does not leave an output
        # file open and unflushed.
        try:
            shuffle_list: typing.List[int] = kw.build_shuffle_list(
                kr.column_names)

            if new_edges_kgtk_file is not None:
                if verbose:
                    print("Opening the label output file: %s" %
                          str(new_edges_kgtk_file),
                          file=error_file,
                          flush=True)

                label_column_names = [
                    node1_column_name, label_column_name, node2_column_name
                ]
                lkw = KgtkWriter.open(label_column_names,
                                      new_edges_kgtk_file,
                                      mode=KgtkWriter.Mode.EDGE,
                                      verbose=verbose,
                                      very_verbose=very_verbose)

            # Optionally deduplicate the new edges.  The set holds
            # node1, label, and node2 joined by the key field separator.
            label_set: typing.Set[str] = set()
            label_key: str

            input_line_count: int = 0
            output_line_count: int = 0
            label_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                # Copy the row itself, minus the columns being removed.
                kw.write(row, shuffle_list=shuffle_list)
                output_line_count += 1

                for column_idx, (node1_idx,
                                 new_label_value) in lower_map.items():
                    node1_value: str = row[node1_idx]
                    if len(node1_value) == 0:
                        continue  # TODO: raise an exception

                    item: str = row[column_idx]
                    if len(item) == 0:
                        continue  # Ignore empty node2 values.

                    # This item might be a KGTK list.  Let's split it, because
                    # lists aren't allowed in the node2 values we'll generate.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(item):
                        if len(node2_value) == 0:
                            continue  # Ignore empty node2 values.

                        if deduplicate_new_edges:
                            label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                            if label_key in label_set:
                                continue
                            label_set.add(label_key)

                        output_map: typing.Mapping[str, str] = {
                            node1_column_name: node1_value,
                            label_column_name: new_label_value,
                            node2_column_name: node2_value,
                        }
                        label_line_count += 1
                        if lkw is None:
                            # No separate file: the new edge goes to the
                            # main output and counts as an output row.
                            kw.writemap(output_map)
                            output_line_count += 1
                        else:
                            lkw.writemap(output_map)

            if verbose:
                print("Read %d rows, wrote %d rows with %d labels." %
                      (input_line_count, output_line_count, label_line_count),
                      file=error_file,
                      flush=True)

        finally:
            # Close the new edges writer even if closing the main one fails.
            try:
                kw.close()
            finally:
                if lkw is not None:
                    lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Пример #27
0
    def implode_language_qualified_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        """Build a language-qualified string from its exploded fields.

        The quoted text, the language, and the optional language suffix are
        read from `row` at the column indexes found in `implosion` and
        combined with KgtkFormat.stringify().

        Args:
            input_line_count: Input line number, used only in diagnostics.
            row: The current input row.
            implosion: Maps exploded field names to column indexes in
                `row`.  A negative language suffix index means that there
                is no suffix column.
            type_name: The data type name, used only in diagnostics.

        Returns:
            A (value, valid) pair.  `value` is "" when a field check or
            the parse of the quoted text fails.  When `self.validate` is
            set and the imploded value fails validation, the imploded
            value is still returned, with `valid` False.
        """

        def complain(field_name: str, problem: str) -> None:
            # Field diagnostics are emitted only in verbose mode.
            if self.verbose:
                print("Input line %d: data type '%s': %s field %s" %
                      (input_line_count, type_name, field_name, problem),
                      file=self.error_file,
                      flush=True)

        valid: bool = True

        # The text field must be a double-quoted string.
        text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
        text_val: str = row[text_idx]
        if len(text_val) == 0:
            valid = False
            complain(KgtkValueFields.TEXT_FIELD_NAME, "is empty")
        elif len(text_val) == 1:
            valid = False
            complain(KgtkValueFields.TEXT_FIELD_NAME, "is too short")
        else:
            if not text_val.startswith('"'):
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "does not start with a double quote")
            if not text_val.endswith('"'):
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "does not end with a double quote")

        # The language is required.
        language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
        language_val: str = self.unwrap(row[language_idx])
        if len(language_val) == 0:
            valid = False
            complain(KgtkValueFields.LANGUAGE_FIELD_NAME, "is empty")

        # The language suffix is optional.
        suf_idx: int = implosion[KgtkValueFields.LANGUAGE_SUFFIX_FIELD_NAME]
        suf: str = self.unwrap(row[suf_idx]) if suf_idx >= 0 else ""
        if len(suf) > 0 and not suf.startswith("-"):
            # As a special favor, we'll accept language suffixes that do not
            # start with a dash.  We'll prepend the dash.
            suf = "-" + suf

        value: str = ""
        if valid:
            # This subterfuge uses Python's literal parser to parse the string.
            if not self.escape_pipes:
                # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|).
                # (this is documented behavior) so we will remove escaped pipes manually.
                text_val = text_val.replace('\\|', '|')

            # Text that merely starts and ends with a double quote is not
            # necessarily one string literal: '"a"b"' does not parse, and
            # '"a","b"' parses to a tuple.  Report both as invalid fields
            # instead of raising or stringifying a non-string.
            text: typing.Any
            try:
                text = ast.literal_eval(text_val)
            except (ValueError, TypeError, SyntaxError):
                text = None

            if isinstance(text, str):
                value = KgtkFormat.stringify(text,
                                             language=language_val,
                                             language_suffix=suf)
            else:
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "is not a valid quoted string")

        if valid and self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            valid = kv.is_language_qualified_string(validate=True)
            if not valid:
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': imploded value '%s' is not a valid language qualified string."
                        % (input_line_count, type_name, value),
                        file=self.error_file,
                        flush=True)
        return value, valid
Пример #28
0
    def implode_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        """Build a KGTK string from its exploded text field.

        If `implosion` has a language column (index >= 0) and the row has
        a non-empty language there, the row is handed to
        implode_language_qualified_string() when `self.general_strings`
        is set; otherwise the non-empty language makes the row invalid.

        Args:
            input_line_count: Input line number, used only in diagnostics.
            row: The current input row.
            implosion: Maps exploded field names to column indexes in
                `row`.  The language entry is optional.
            type_name: The data type name, used only in diagnostics.

        Returns:
            A (value, valid) pair.  `value` is "" when a field check or
            the parse of the quoted text fails.  When `self.validate` is
            set and the imploded value fails validation, the imploded
            value is still returned, with `valid` False.
        """

        def complain(field_name: str, problem: str) -> None:
            # Field diagnostics are emitted only in verbose mode.
            if self.verbose:
                print("Input line %d: data type '%s': %s field %s" %
                      (input_line_count, type_name, field_name, problem),
                      file=self.error_file,
                      flush=True)

        valid: bool = True
        if KgtkValueFields.LANGUAGE_FIELD_NAME in implosion:
            language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
            if language_idx >= 0:
                language_val: str = self.unwrap(row[language_idx])
                if len(language_val) > 0:
                    if self.general_strings:
                        # A string with a language is really a
                        # language-qualified string.
                        return self.implode_language_qualified_string(
                            input_line_count, row, implosion, type_name)
                    valid = False
                    complain(KgtkValueFields.LANGUAGE_FIELD_NAME,
                             "is not empty")

        # The text field must be a double-quoted string.
        text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
        text_val: str = row[text_idx]
        if len(text_val) == 0:
            valid = False
            complain(KgtkValueFields.TEXT_FIELD_NAME, "is empty")
        elif len(text_val) == 1:
            valid = False
            complain(KgtkValueFields.TEXT_FIELD_NAME, "is too short")
        else:
            if not text_val.startswith('"'):
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "does not start with a double quote")
            if not text_val.endswith('"'):
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "does not end with a double quote")

        value: str = ""
        if valid:
            # This subterfuge uses Python's literal parser to parse the string.
            if not self.escape_pipes:
                # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|).
                # (this is documented behavior) so we will remove escaped pipes manually.
                text_val = text_val.replace('\\|', '|')

            # Text that merely starts and ends with a double quote is not
            # necessarily one string literal: '"a"b"' does not parse, and
            # '"a","b"' parses to a tuple.  Report both as invalid fields
            # instead of raising or stringifying a non-string.
            text: typing.Any
            try:
                text = ast.literal_eval(text_val)
            except (ValueError, TypeError, SyntaxError):
                text = None

            if isinstance(text, str):
                value = KgtkFormat.stringify(text)
            else:
                valid = False
                complain(KgtkValueFields.TEXT_FIELD_NAME,
                         "is not a valid quoted string")

        if valid and self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            valid = kv.is_string(validate=True)
            if not valid:
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': imploded value '%s' is not a valid string."
                        % (input_line_count, type_name, value),
                        file=self.error_file,
                        flush=True)
        return value, valid