def check_column_name(cls,
                      column_name: str,
                      header_line: str,
                      error_action: ValidationAction,
                      error_file: typing.TextIO = sys.stderr) -> typing.List[str]:
    # Returns a list of complaints.
    # Check for valid column names.
    # 1) Check for leading white space
    # 2) Check for trailing white space
    # 3) Check for internal white space
    #    except inside "" and '' quoted strings
    # 4) Check for commas
    # 5) Check for vertical bars
    # 6) Check for semicolons (disabled)
    #
    # TODO: It might be possible to make some of these checks more efficient.
    results: typing.List[str] = []

    if column_name.lstrip() != column_name:
        results.append("Column name '%s' starts with leading white space" % column_name)

    if column_name.rstrip() != column_name:
        results.append("Column name '%s' ends with trailing white space" % column_name)

    if not (column_name.startswith('"') or column_name.startswith("'")):
        if ''.join(column_name.split()) != column_name.strip():
            results.append("Column name '%s' contains internal white space" % column_name)

    if "," in column_name:
        results.append("Warning: Column name '%s' contains a comma (,)" % column_name)

    if "|" in column_name:
        results.append("Warning: Column name '%s' contains a vertical bar (|)" % column_name)

    # if ";" in column_name:
    #     results.append("Warning: Column name '%s' contains a semicolon (;)" % column_name)

    kv: KgtkValue = KgtkValue(column_name)
    if not kv.is_valid():
        results.append(kv.describe())

    return results
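# A minimal sketch (plain Python, independent of KgtkValue) showing why the
# checks above work: lstrip()/rstrip() comparisons catch edge whitespace, and
# comparing the joined split() pieces against the stripped name catches
# internal whitespace.  The column names below are invented examples.
def _demo_column_name_checks() -> None:
    for name in [" node1", "label ", "node 2", "node1"]:
        leading = name.lstrip() != name
        trailing = name.rstrip() != name
        internal = ''.join(name.split()) != name.strip()
        print("%-10r leading=%s trailing=%s internal=%s" % (name, leading, trailing, internal))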
def explode(self,
            value: KgtkValue,
            row: typing.List[str],
            explosion: typing.Mapping[str, int],
            new_column_count: int) -> typing.List[str]:
    newrow: typing.List[str] = row.copy()
    if new_column_count > 0:
        # Would it be better to do:
        #
        # newrow.extend([""] * new_column_count)
        i: int
        for i in range(new_column_count):
            newrow.append("")

    field_map: typing.Mapping[str, typing.Union[str, int, float, bool]] = value.get_field_map()

    field_name: str
    idx: int
    for field_name, idx in explosion.items():
        if field_name in field_map:
            newvalue: str
            if KgtkValueFields.FIELD_NAME_FORMATS[field_name] == "str":
                # Format this as a KGTK string.
                newvalue = '"' + str(field_map[field_name]) + '"'
            else:
                # Convert everything else to a KGTK number or symbol.
                newvalue = str(field_map[field_name])
            newrow[idx] = newvalue
        else:
            newrow[idx] = ""  # In case we are overwriting an existing column.

    return newrow
def implode_date_and_times(self,
                           input_line_count: int,
                           row: typing.List[str],
                           implosion: typing.Mapping[str, int],
                           type_name: str,
                           ) -> typing.Tuple[str, bool]:
    valid: bool = True

    date_and_times_idx: int = implosion[KgtkValueFields.DATE_AND_TIMES_FIELD_NAME]
    date_and_times_val: str = self.unwrap(row[date_and_times_idx])
    if len(date_and_times_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.DATE_AND_TIMES_FIELD_NAME),
                  file=self.error_file, flush=True)

    precision_idx: int = implosion[KgtkValueFields.PRECISION_FIELD_NAME]
    precision_val: str = self.unwrap(row[precision_idx]) if precision_idx >= 0 else ""

    value: str = "^" + date_and_times_val
    if len(precision_val) > 0:
        value += "/" + precision_val

    if valid and self.validate:
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_date_and_times(validate=True)
        if not valid:
            if self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid date and time." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)

    return value, valid
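# Hedged sketch of the string assembled above: a KGTK date-and-times value is
# the date/time text prefixed with "^", optionally followed by "/" and a
# precision, exactly as the concatenation in implode_date_and_times shows.
# The sample field values below are invented for illustration only.
def _demo_implode_date(date_and_times_val: str, precision_val: str = "") -> str:
    value = "^" + date_and_times_val
    if len(precision_val) > 0:
        value += "/" + precision_val
    return value

# _demo_implode_date("1960-01-01T00:00:00Z", "9") -> "^1960-01-01T00:00:00Z/9"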
def implode_symbol(self,
                   input_line_count: int,
                   row: typing.List[str],
                   implosion: typing.Mapping[str, int],
                   type_name: str,
                   ) -> typing.Tuple[str, bool]:
    valid: bool = True

    symbol_idx: int = implosion[KgtkValueFields.SYMBOL_FIELD_NAME]
    symbol_val: str = self.unwrap(row[symbol_idx])
    if len(symbol_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.SYMBOL_FIELD_NAME),
                  file=self.error_file, flush=True)

    if self.escape_pipes:
        symbol_val = symbol_val.replace(KgtkFormat.LIST_SEPARATOR, "\\" + KgtkFormat.LIST_SEPARATOR)

    value: str = symbol_val

    if valid and self.validate:
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_symbol(validate=True)
        if not valid:
            if self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid symbol." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)

    return value, valid
def convert_and_validate(self, item: str, line_number: int, ew: KgtkWriter) -> typing.Tuple[str, bool]:
    result: str
    is_ok: bool
    result, is_ok = self.convert(item, line_number, ew)

    # Just a little bit of paranoia here regarding tabs and ends-of-lines:
    #
    # TODO: perform these checks (and repairs!) in KgtkValue.
    if "\t" in result:
        result = result.replace("\t", "\\t")
    if "\n" in result:
        result = result.replace("\n", "\\n")
    if "\r" in result:
        result = result.replace("\r", "\\r")

    if is_ok and self.validate:
        kv: KgtkValue = KgtkValue(result, options=self.value_options)
        if not kv.validate():
            if self.verbose:
                print("Input line %d: imported value '%s' (from '%s') is invalid." % (line_number, result, item),
                      file=self.error_file, flush=True)
            return result, False

    return result, True
def implode_location_coordinates(self,
                                 input_line_count: int,
                                 row: typing.List[str],
                                 implosion: typing.Mapping[str, int],
                                 type_name: str,
                                 ) -> typing.Tuple[str, bool]:
    valid: bool = True

    latitude_idx: int = implosion[KgtkValueFields.LATITUDE_FIELD_NAME]
    latitude_val: str = self.unwrap(row[latitude_idx])
    if len(latitude_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.LATITUDE_FIELD_NAME),
                  file=self.error_file, flush=True)

    longitude_idx: int = implosion[KgtkValueFields.LONGITUDE_FIELD_NAME]
    longitude_val: str = self.unwrap(row[longitude_idx])
    if len(longitude_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.LONGITUDE_FIELD_NAME),
                  file=self.error_file, flush=True)

    value: str = "@" + latitude_val + "/" + longitude_val

    if valid and self.validate:
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_location_coordinates(validate=True)
        if not valid:
            if self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid location coordinates value." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)

    return value, valid
def verify_uniqueness(self, id_value: str, row: typing.List[str], line_number, who: str):
    """
    Verify that ID values are not repeated.  This is OK for the output of
    `kgtk compact`, but is a little too strong for general use.  The weaker
    constraint should be that the ID values don't repeat with different
    (node1, label, node2) tuples in an edge file.
    """
    if KgtkFormat.LIST_SEPARATOR in id_value:
        # The ID value might be a list.
        id_v: str
        for id_v in KgtkValue.split_list(id_value):
            if id_v in self.id_set:
                # TODO: Probably want more error handling options, such as
                # printing the offending row and choosing to continue.
                raise ValueError("Line %d: %s ID '%s' duplicates a previous ID '%s'." % (line_number, who, id_value, id_v))
            else:
                self.id_set.add(id_v)
    else:
        # Not a list, we can process this faster.
        if id_value in self.id_set:
            # TODO: Probably want more error handling options, such as
            # printing the offending row and choosing to continue.
            raise ValueError("Line %d: %s ID '%s' duplicates a previous ID '%s'." % (line_number, who, id_value, id_value))
        else:
            self.id_set.add(id_value)
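# Rough stand-in for the uniqueness logic above, using a plain set and
# splitting on "|" (the KGTK list separator used elsewhere in this code).
# KgtkValue.split_list presumably also handles escaped separators; this
# sketch does not, and the IDs are invented examples.
def _demo_verify_uniqueness(id_values: typing.List[str], seen: typing.Set[str]) -> None:
    for id_value in id_values:
        for id_v in id_value.split("|"):
            if id_v in seen:
                raise ValueError("ID '%s' duplicates a previous ID '%s'." % (id_value, id_v))
            seen.add(id_v)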
def compact_row(self) -> bool:
    """Compact the current row.  Return True if there is at least one list
    in the result, otherwise return False.
    """
    if self.current_row_lists is None:
        return False

    # Preallocate the row, this might be more efficient than appending to it..
    self.current_row = [""] * len(self.current_row_lists)

    idx: int
    item_list: typing.Optional[typing.List[str]]
    saw_list: bool = False
    for idx, item_list in enumerate(self.current_row_lists):
        if item_list is not None:
            if idx in self.keep_first_idx_list:
                item_list = sorted(item_list[:1])  # Ensure sorting. Is this redundant?
            if len(item_list) > 1:
                saw_list = True
            # We don't need to use KgtkValue.join_unique_list(item_list)
            # because self.merge_row(...) and self.expand_row(...) ensure that
            # there are no duplicates.
            #
            # TODO: run timing studies to determine which approach is more efficient.
            self.current_row[idx] = KgtkValue.join_sorted_list(item_list)

    self.current_row_lists = None
    return saw_list
def expand_row(self, row: typing.List[str], force: bool = False):
    if not self.lists_in_input and not force:
        self.current_row = row  # Optimization: leave the row alone if possible.
        return

    # Preallocate the list, this might be more efficient than appending to it..
    self.current_row_lists = [None] * len(row)

    idx: int
    item: str
    for idx, item in enumerate(row):
        if len(item) == 0:
            continue  # Ignore empty items.

        # Start the new current item list:
        current_item_list: typing.Optional[typing.List[str]] = None

        # The row item might itself be a list.
        item2: str
        for item2 in KgtkValue.split_list(item):
            if len(item2) == 0:
                continue  # Ignore empty items.

            if current_item_list is None:
                # This is the first item.
                current_item_list = [item2]
                continue

            # There might be duplicate items in the row item's list.
            if item2 not in current_item_list:
                current_item_list.append(item2)  # Add unique items.

        self.current_row_lists[idx] = current_item_list
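# Illustration of what expand_row does to a single row, using "|" as the list
# separator and a plain split() in place of KgtkValue.split_list (an
# assumption about its behavior; the real method may also handle escapes).
# The sample row is invented.
def _demo_expand_row(row: typing.List[str]) -> typing.List[typing.Optional[typing.List[str]]]:
    row_lists: typing.List[typing.Optional[typing.List[str]]] = [None] * len(row)
    for idx, item in enumerate(row):
        if len(item) == 0:
            continue
        items: typing.List[str] = []
        for item2 in item.split("|"):
            if len(item2) > 0 and item2 not in items:
                items.append(item2)
        row_lists[idx] = items
    return row_lists

# _demo_expand_row(["Q1", "color", "red|red|blue"]) -> [["Q1"], ["color"], ["red", "blue"]]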
def implode_boolean(self,
                    input_line_count: int,
                    row: typing.List[str],
                    implosion: typing.Mapping[str, int],
                    type_name: str,
                    ) -> typing.Tuple[str, bool]:
    valid: bool = True

    truth_idx: int = implosion[KgtkValueFields.TRUTH_FIELD_NAME]
    truth_val: str = self.unwrap(row[truth_idx])
    if len(truth_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.TRUTH_FIELD_NAME),
                  file=self.error_file, flush=True)

    value: str = truth_val

    if valid and self.validate:
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_boolean(validate=True)
        if not valid:
            if self.verbose:
                print("Input line %d: data type '%s': imploded value '%s' is not a valid boolean." % (input_line_count, type_name, value),
                      file=self.error_file, flush=True)

    return value, valid
def compact_row(self):
    if self.current_row_lists is None:
        return

    # Preallocate the list, this might be more efficient than appending to it..
    self.current_row = [None] * len(self.current_row_lists)

    idx: int
    item_list: typing.List[str]
    for idx, item_list in enumerate(self.current_row_lists):
        self.current_row[idx] = KgtkValue.join_sorted_list(item_list)

    self.current_row_lists = None
def implode_quantity(self,
                     input_line_count: int,
                     row: typing.List[str],
                     implosion: typing.Mapping[str, int],
                     type_name: str,
                     ) -> typing.Tuple[str, bool]:
    valid: bool = True

    num_idx: int = implosion[KgtkValueFields.NUMBER_FIELD_NAME]
    num_val: str = self.unwrap(row[num_idx])
    if len(num_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.NUMBER_FIELD_NAME),
                  file=self.error_file, flush=True)

    lt_idx: int = implosion[KgtkValueFields.LOW_TOLERANCE_FIELD_NAME]
    lt: str = self.unwrap(row[lt_idx]) if lt_idx >= 0 else ""

    ht_idx: int = implosion[KgtkValueFields.HIGH_TOLERANCE_FIELD_NAME]
    ht: str = self.unwrap(row[ht_idx]) if ht_idx >= 0 else ""

    # The low and high tolerances must be supplied together or not at all.
    # Note: the parentheses matter, because "^" binds tighter than ">".
    if (len(lt) > 0) ^ (len(ht) > 0):
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': low and high tolerance must both be present or absent." % (input_line_count, type_name),
                  file=self.error_file, flush=True)

    si_idx: int = implosion[KgtkValueFields.SI_UNITS_FIELD_NAME]
    si: str = self.unwrap(row[si_idx]) if si_idx >= 0 else ""

    un_idx: int = implosion[KgtkValueFields.UNITS_NODE_FIELD_NAME]
    un: str = self.unwrap(row[un_idx]) if un_idx >= 0 else ""

    value: str = num_val
    if len(lt) > 0 or len(ht) > 0:
        value += "[" + lt + "," + ht + "]"
    value += si + un

    if valid and self.validate:
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        if self.quantities_include_numbers:
            valid = kv.is_number_or_quantity(validate=True)
            if not valid:
                if self.verbose:
                    print("Input line %d: data type '%s': imploded value '%s' is not a valid quantity or number." % (input_line_count, type_name, value),
                          file=self.error_file, flush=True)
        else:
            valid = kv.is_quantity(validate=True)
            if not valid:
                if self.verbose:
                    print("Input line %d: data type '%s': imploded value '%s' is not a valid quantity." % (input_line_count, type_name, value),
                          file=self.error_file, flush=True)

    return value, valid
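# Sketch of the quantity string assembled above: the number, an optional
# "[low,high]" tolerance pair, then SI units and/or a units node.  Only the
# concatenation order comes from the code above; the example values are
# invented.
def _demo_implode_quantity(num: str, lt: str = "", ht: str = "", si: str = "", un: str = "") -> str:
    if (len(lt) > 0) ^ (len(ht) > 0):
        raise ValueError("low and high tolerance must both be present or absent")
    value = num
    if len(lt) > 0 or len(ht) > 0:
        value += "[" + lt + "," + ht + "]"
    return value + si + un

# _demo_implode_quantity("10.5", "10.0", "11.0", un="Q11573") -> "10.5[10.0,11.0]Q11573"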
def build_attr_map(self,
                   result: typing.MutableMapping[str, typing.Any],
                   attr_list: str,
                   who: str,
                   ):
    if len(attr_list) == 0:
        return

    attr_map: typing.MutableMapping[str, typing.Mapping[str, str]] = {}
    attr: str
    for attr in KgtkValue.split_list(attr_list):
        self.add_attr_to_map(attr_map, attr, who)

    if len(attr_map) > 0:
        result[who] = attr_map
def load_labels(self,
                kr: KgtkReader,
                path: Path,
                ) -> typing.Tuple[typing.Mapping[str, str], typing.List[typing.List[str]]]:
    input_rows: typing.List[typing.List[str]] = []
    labels: typing.MutableMapping[str, str] = {}

    node1_column_idx: int
    label_column_idx: int
    node2_column_idx: int
    node1_column_idx, label_column_idx, node2_column_idx = self.lookup_label_table_idxs(kr)

    if self.verbose:
        print("Loading labels from %s" % path, file=self.error_file, flush=True)

    label_key: str
    row: typing.List[str]
    for row in kr:
        if row[label_column_idx] == self.label_column_value:
            # This is a label definition row.
            label_key = row[node1_column_idx]
            label_value: str = row[node2_column_idx]
            if len(label_value) > 0:
                if label_key in labels:
                    # This label already exists in the table.
                    if self.suppress_duplicate_labels:
                        # Build a list eliminating duplicate elements.
                        # print("Merge '%s' and '%s'" % (label_value, labels[label_key]), file=self.error_file, flush=True)
                        labels[label_key] = KgtkValue.merge_values(labels[label_key], label_value)
                    else:
                        labels[label_key] = KgtkFormat.LIST_SEPARATOR.join((labels[label_key], label_value))
                else:
                    # This is the first instance of this label definition.
                    labels[label_key] = label_value
            if not self.remove_label_records:
                input_rows.append(row)
        else:
            input_rows.append(row)

    return labels, input_rows
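# The two branches above either merge duplicate label definitions into one
# de-duplicated list (suppress_duplicate_labels) or simply join them with the
# KGTK list separator.  A plain-Python approximation, assuming "|" is
# KgtkFormat.LIST_SEPARATOR and that KgtkValue.merge_values sorts and
# de-duplicates; the label values used as examples are invented.
def _demo_merge_labels(existing: str, new: str, suppress_duplicates: bool) -> str:
    if suppress_duplicates:
        return "|".join(sorted(set(existing.split("|")) | {new}))
    return "|".join((existing, new))

# _demo_merge_labels('"rose"@en', '"rose"@en', True)  -> '"rose"@en'
# _demo_merge_labels('"rose"@en', '"rose"@en', False) -> '"rose"@en|"rose"@en'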
def compact_row(self):
    if self.current_row_lists is None:
        return

    # Preallocate the list, this might be more efficient than appending to it..
    self.current_row = [None] * len(self.current_row_lists)

    idx: int
    item_list: typing.Optional[typing.List[str]]
    for idx, item_list in enumerate(self.current_row_lists):
        if item_list is None:
            self.current_row[idx] = ""
        else:
            # We don't need to use KgtkValue.join_unique_list(item_list)
            # because self.merge_row(...) and self.expand_row(...) ensure that
            # there are no duplicates.
            #
            # TODO: run timing studies to determine which approach is more efficient.
            self.current_row[idx] = KgtkValue.join_sorted_list(item_list)

    self.current_row_lists = None
def merge_row(self, row: typing.List[str]):
    if self.current_row_lists is None:
        if self.current_row is None:
            # TODO: raise a better error
            raise ValueError("Inconsistent state #1 in merge_row.")
        else:
            # We deferred expanding the previous row, but we must do so now:
            self.expand_row(self.current_row, force=True)
            if self.current_row_lists is None:
                # Keep mypy happy by ensuring that self.current_row_lists is not None.
                #
                # TODO: raise a better error.
                raise ValueError("Inconsistent state #2 in merge_row.")

    idx: int
    item: str
    for idx, item in enumerate(row):
        if len(item) == 0:
            continue  # Ignore empty items.

        # We will modify the current item list in place!
        current_item_list: typing.Optional[typing.List[str]] = self.current_row_lists[idx]

        # The row item might itself be a list.
        item2: str
        for item2 in KgtkValue.split_list(item):
            if len(item2) == 0:
                continue  # Ignore empty items.

            if current_item_list is None:
                # This is the first item.
                current_item_list = [item2]
                self.current_row_lists[idx] = current_item_list
                continue

            # There might be duplicate items in the row item's list.
            if item2 not in current_item_list:
                current_item_list.append(item2)  # Add unique items.
def add_attr_to_map(self,
                    attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
                    attr: str,
                    who: str,
                    ):
    kv: KgtkValue = KgtkValue(attr,
                              options=self.value_options,
                              parse_fields=False,
                              error_file=self.error_file,
                              verbose=self.verbose)
    if not kv.is_language_qualified_string(validate=True):
        raise ValueError("Invalid attr %s for %s" % (attr, who))

    text: str
    language: str
    language_suffix: str
    text, language, language_suffix = KgtkFormat.destringify(kv.value)
    if len(language) == 0:
        raise ValueError("No attr language in %s for %s" % (attr, who))

    lang: str = language + language_suffix
    attr_map[lang] = {"language": lang, "value": text}
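# add_attr_to_map expects each attr to be a KGTK language-qualified string,
# e.g. '"human"@en', and keys the map by the combined language tag.  A rough
# stand-in for the destringify step, assuming this simple quoting convention;
# KgtkFormat.destringify also handles escapes that this sketch ignores, and
# the sample attr is invented.
def _demo_parse_attr(attr: str) -> typing.Mapping[str, typing.Mapping[str, str]]:
    text, _, lang = attr.rpartition("@")
    text = text.strip('"')
    return {lang: {"language": lang, "value": text}}

# _demo_parse_attr('"human"@en') -> {'en': {'language': 'en', 'value': 'human'}}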
def run( input_file: KGTKFiles, output_file: KGTKFiles, label_file: KGTKFiles, base_columns: typing.Optional[typing.List[str]] = None, columns_to_remove: typing.Optional[typing.List[str]] = None, label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE, lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX, deduplicate_labels: bool = True, errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. ) -> int: # import modules locally from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) label_kgtk_file: typing.Optional[ Path] = KGTKArgumentParser.get_optional_output_file(label_file, who="Label file") # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) # Show the final option structures for debugging and documentation. if show_options: print("--input-file=%s" % str(input_kgtk_file), file=error_file) print("--output-file=%s" % str(output_kgtk_file), file=error_file) if label_kgtk_file is not None: print("--label-file=%s" % str(label_kgtk_file), file=error_file) if base_columns is not None: print("--base-columns=%s" % " ".join(base_columns), file=error_file) if columns_to_remove is not None: print("--columns-to-lower=%s" % " ".join(columns_to_remove), file=error_file) print("--label-value=%s" % label_value, file=error_file) print("--lift-suffix=%s" % lift_suffix, file=error_file) print("--deduplicate-labels=%s" % deduplicate_labels, file=error_file) reader_options.show(out=error_file) value_options.show(out=error_file) print("=======", file=error_file, flush=True) try: if verbose: print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_kgtk_file, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) # Map the index of a column being removed to the index of the base column that supplies its node1 value. lower_map: typing.MutableMapping[int, int] = dict() # These columns will never be removed: key_column_idxs: typing.Set[int] = set( (kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx, kr.id_column_idx)) key_column_idxs.discard(-1) key_column_names: typing.Set[str] = set( (kr.column_names[idx] for idx in key_column_idxs)) base_name: str column_name: str idx: int # There are three option patterns. if columns_to_remove is not None and len( columns_to_remove) > 0 and base_columns is not None and len( base_columns) > 0: # Pattern 1: len(columns_to_remove) > 0 and len(base_columns) == len(columns_to_remove) # column_names and base_columns are paired. if len(columns_to_remove) != len(base_columns): raise KGTKException( "There are %d columns to remove but only %d base columns." % (len(columns_to_remove), len(base_columns))) for idx, column_name in enumerate(columns_to_remove): base_name = base_columns[idx] if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." 
% repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." % repr(column_name)) if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is unknown" % (repr(column_name), repr(base_name))) lower_map[kr.column_name_map[ column_name]] = kr.column_name_map[base_name] elif columns_to_remove is not None and len(columns_to_remove) > 0 and ( base_columns is None or len(base_columns) == 0): # Pattern 2: len(columns_to_remove) > 0 and len(base_columns) == 0 # Each column name is stripped of the lift suffix to determine the base name. if len(lift_suffix) == 0: raise KGTKException("The --lift-suffix must not be empty.") for idx, column_name in enumerate(columns_to_remove): if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." % repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." % repr(column_name)) if not column_name.endswith(lift_suffix): raise KGTKException("Unable to parse column name %s." % repr(column_name)) base_name = column_name[:-len(lift_suffix)] if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is not known" % (repr(column_name), repr(base_name))) lower_map[kr.column_name_map[ column_name]] = kr.column_name_map[base_name] elif columns_to_remove is None or len(columns_to_remove) == 0: # Pattern 3: len(columns_to_remove) == 0. if len(lift_suffix) == 0: raise KGTKException("The --lift-suffix must not be empty.") if base_columns is None or len(base_columns) == 0: # The base name list wasn't supplied. Use [node1, label, node2, id] base_columns = list(key_column_names) for idx, column_name in enumerate(kr.column_names): # Skip the node1, label, node12, and id columns if idx in key_column_idxs: continue # Does this column match a lifting pattern? 
for base_name in base_columns: if len(base_name) == 0: continue if column_name == base_name + lift_suffix: lower_map[idx] = kr.column_name_map[base_name] if len(lower_map) == 0: raise KGTKException("There are no columns to lower.") if verbose: print("The following columns will be lowered", file=error_file, flush=True) for idx in sorted(lower_map.keys()): column_name = kr.column_names[idx] base_name = kr.column_names[lower_map[idx]] print(" %s from %s" % (column_name, base_name), file=error_file, flush=True) output_column_names: typing.List[str] = list() for idx, column_name in enumerate(kr.column_names): if idx not in lower_map: output_column_names.append(column_name) if verbose: print("The output columns are: %s" % " ".join(output_column_names), file=error_file, flush=True) if verbose: print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True) kw: KgtkWriter = KgtkWriter.open( output_column_names, output_kgtk_file, mode=KgtkWriter.Mode.EDGE, require_all_columns=False, # Simplifies writing the labels verbose=verbose, very_verbose=very_verbose) shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names) lkw: typing.Optional[KgtkWriter] = None if label_kgtk_file is not None: if verbose: print("Opening the label output file: %s" % str(label_kgtk_file), file=error_file, flush=True) label_column_names = [ KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2 ] lkw = KgtkWriter.open(label_column_names, label_kgtk_file, mode=KgtkWriter.Mode.EDGE, verbose=verbose, very_verbose=very_verbose) # Optionally deduplicate the labels # set(node1_value + KgtkFormat.SEPARATOR + node2_value) label_set: typing.Set[str] = set() label_key: str # If labels will be written to the output file and deduplication is enabled: check_existing_labels: bool = \ deduplicate_labels and \ lkw is None and \ kr.node1_column_idx >= 0 and \ kr.label_column_idx >= 0 and \ kr.node2_column_idx >= 0 input_line_count: int = 0 output_line_count: int = 0 label_line_count: int = 0 row: typing.List[str] for row in kr: input_line_count += 1 if check_existing_labels and row[ kr.label_column_idx] == label_value: label_key = row[ kr.node1_column_idx] + KgtkFormat.COLUMN_SEPARATOR + row[ kr.node2_column_idx] if label_key in label_set: continue else: label_set.add(label_key) kw.write(row, shuffle_list=shuffle_list) output_line_count += 1 column_idx: int for column_idx in lower_map.keys(): node1_value: str = row[lower_map[column_idx]] if len(node1_value) == 0: continue # TODO: raise an exception item: str = row[column_idx] if len(item) == 0: continue # Ignore empty node2 values. # Ths item might be a KGTK list. Let's split it, because # lists aren't allow in the node2 values we'll generate. node2_value: str for node2_value in KgtkValue.split_list(item): if len(node2_value) == 0: continue # Ignore empty node2 values. if deduplicate_labels: label_key = node1_value + KgtkFormat.COLUMN_SEPARATOR + node2_value if label_key in label_set: continue else: label_set.add(label_key) output_map: typing.Mapping[str, str] = { KgtkFormat.NODE1: node1_value, KgtkFormat.LABEL: label_value, KgtkFormat.NODE2: node2_value, } if lkw is None: kw.writemap(output_map) label_line_count += 1 output_line_count += 1 else: lkw.writemap(output_map) label_line_count += 1 if verbose: print("Read %d rows, wrote %d rows with %d labels." % (input_line_count, output_line_count, label_line_count), file=error_file, flush=True) kw.close() if lkw is not None: lkw.close() return 0 except Exception as e: kgtk_exception_auto_handler(e) return 1
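# End-to-end sketch of what the lowering loop above emits: the node1 value
# comes from the base column, and each "|"-separated item in the lifted
# column becomes its own (node1, label, node2) edge.  The sample values are
# invented; the real code routes rows through KgtkWriter.writemap and may
# deduplicate label edges.
def _demo_lower(base_value: str, lifted_item: str, label_value: str = "label") -> typing.List[typing.List[str]]:
    edges: typing.List[typing.List[str]] = []
    for node2_value in lifted_item.split("|"):
        if len(node2_value) == 0:
            continue
        edges.append([base_value, label_value, node2_value])
    return edges

# _demo_lower("Q5", '"human"@en|"person"@en') ->
#     [['Q5', 'label', '"human"@en'], ['Q5', 'label', '"person"@en']]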
def load_labels( self, kr: KgtkReader, path: Path, save_input: bool = True, labels_needed: typing.Optional[typing.Set[str]] = None, ) -> typing.Tuple[typing.Mapping[str, str], typing.List[typing.List[str]]]: input_rows: typing.List[typing.List[str]] = [] labels: typing.MutableMapping[str, str] = {} label_match_column_idx: int label_select_column_idx: int label_value_column_idx: int label_match_column_idx, label_select_column_idx, label_value_column_idx = self.lookup_label_table_idxs( kr) if self.verbose: print("Loading labels from %s" % path, file=self.error_file, flush=True) if labels_needed is not None: print("Filtering for needed labels", file=self.error_file, flush=True) print("label_match_column_idx=%d (%s)." % (label_match_column_idx, kr.column_names[label_match_column_idx]), file=self.error_file, flush=True) print("label_select_column_idx=%d (%s)." % (label_select_column_idx, kr.column_names[label_select_column_idx]), file=self.error_file, flush=True) print("label_value_column_idx=%d (%s)." % (label_value_column_idx, kr.column_names[label_value_column_idx]), file=self.error_file, flush=True) print("label_select_column_value='%s'." % self.label_select_column_value, file=self.error_file, flush=True) key: str row: typing.List[str] for row in kr: if row[label_select_column_idx] == self.label_select_column_value: # This is a label definition row. label_key = row[label_match_column_idx] label_value: str = row[label_value_column_idx] if len(label_value) > 0: if label_key in labels: # This label already exists in the table. if self.suppress_duplicate_labels: # Build a list eliminating duplicate elements. # print("Merge '%s' and '%s'" % (key_value, labels[key]), file=self.error_file, flush=True) labels[label_key] = KgtkValue.merge_values( labels[label_key], label_value) else: labels[label_key] = KgtkFormat.LIST_SEPARATOR.join( (labels[label_key], label_value)) else: # This is the first instance of this label definition. if labels_needed is not None: if label_key in labels_needed: labels[label_key] = label_value else: labels[label_key] = label_value if save_input and not self.remove_label_records: input_rows.append(row) else: if save_input: input_rows.append(row) return labels, input_rows
def process_qual_datavalue(self, value: str, qual_row: typing.List[str], datatype: str): datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[ str, typing.Optional[typing.Union[str, int, float]]]]] = dict() datavalue["type"] = qual_row[self.qual_val_type_idx] valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[ str, int, float]]] = dict() datavalue["value"] = valuemap entity_type: str = qual_row[self.qual_entity_type_idx] if len(entity_type) > 0: valuemap["entity-type"] = entity_type valuemap["id"] = value # TODO: Is this the right thing to do for Q16097-F1? numeric_id: str = value[1:] if "-" in numeric_id: numeric_id = numeric_id[:numeric_id.index("-")] valuemap["numeric-id"] = int(numeric_id) return datavalue kv = KgtkValue(value, options=self.value_options, parse_fields=True, error_file=self.error_file, verbose=self.verbose) if not kv.validate(): # raise ValueError("Invalid KGTK value '%s'" % value) print("Warning: Invalid KGTK value '%s'" % value, file=self.error_file, flush=True) if kv.fields is None: raise ValueError("KGTK value %s is missing fields." % value) if kv.is_number(): if kv.fields.numberstr is None: raise ValueError("number is missing numberstr for %s." % value) valuemap["amount"] = kv.fields.numberstr # TODO: add plus sign valuemap["unit"] = "1" return datavalue if kv.is_quantity(): if kv.fields.numberstr is None: raise ValueError("quantity is missing numberstr for %s." % value) valuemap["amount"] = kv.fields.numberstr # TODO: add plus sign if kv.fields.units_node is None: # TODO: Research this further. Why did we get here? Is it because import_wikidata # dropped the units? # # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row))) valuemap["unit"] = "undefined" else: valuemap[ "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node if kv.fields.low_tolerancestr is not None and len( kv.fields.low_tolerancestr) > 0: valuemap[ "lowerBound"] = kv.fields.low_tolerancestr # TODO: add plus sign if kv.fields.high_tolerancestr is not None and len( kv.fields.high_tolerancestr) > 0: valuemap[ "higherBound"] = kv.fields.high_tolerancestr # TODO: add plus sign return datavalue if kv.is_language_qualified_string(): text: str language: str language_suffix: str text, language, language_suffix = KgtkFormat.destringify( value) # TODO: KgtkValue should do this to text language += language_suffix valuemap["text"] = text valuemap["language"] = language return datavalue if kv.is_string(): valuemap["type"] = "string" valuemap["value"] = KgtkFormat.unstringify( value) # TODO: KgtkValue should do this to text return datavalue if kv.is_date_and_times(): if kv.fields.zonestr is None: raise ValueError("timezone is missing from %s." % value) if kv.fields.zonestr != "Z": raise ValueError("Only Z-time is supported, error in %s." % value) if kv.fields.date_and_time is None: raise ValueError("date_and_time is missing from %s." % value) valuemap["time"] = kv.fields.date_and_time valuemap["timezone"] = 0 valuemap["before"] = 0 valuemap["after"] = 0 if kv.fields.precision is None: raise ValueError( "date_and_time precision is missing from %s." 
% value) valuemap["precision"] = kv.fields.precision valuemap[ "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[ self.qual_calendar_idx] return datavalue if kv.is_location_coordinates(): if kv.fields.latitude is None: raise ValueError("latitude is missing from %s" % value) valuemap["latitude"] = kv.fields.latitude if kv.fields.longitude is None: raise ValueError("longitude is missing from %s" % value) valuemap["longitude"] = kv.fields.longitude valuemap["altitide"] = None # deprecated valuemap["precision"] = float(qual_row[self.qual_precision_idx]) valuemap["globe"] = "http://www.wikidata.org/entity/Q2" return datavalue # Default: convert the symbol to a string. valuemap["type"] = "string" valuemap["value"] = KgtkFormat.unstringify( '"' + value + '"') # TODO: KgtkValue should do this to text return datavalue
def process_edge_datavalue(self, value: str, edge_row: typing.List[str], datatype: str): datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[ str, typing.Optional[typing.Union[str, int, float]]]]] = dict() datavalue["type"] = edge_row[self.edge_val_type_idx] valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[ str, int, float]]] = dict() datavalue["value"] = valuemap entity_type: str = edge_row[self.edge_entity_type_idx] if len(entity_type) > 0: valuemap["entity-type"] = entity_type valuemap["id"] = value # TODO: Is this the right thing to do? numeric_id: str = value[1:] if "-" in numeric_id: numeric_id = numeric_id[:numeric_id.index("-")] valuemap["numeric-id"] = int(numeric_id) return datavalue kv = KgtkValue(value, options=self.value_options, parse_fields=True, error_file=self.error_file, verbose=self.verbose) if not kv.validate(): # raise ValueError("Invalid KGTK value '%s'" % value) print("Warning: Invalid KGTK value '%s'" % value, file=self.error_file, flush=True) if kv.fields is None: raise ValueError("KGTK value '%s' is missing fields." % value) if kv.is_number(): if kv.fields.numberstr is None: raise ValueError("number is missing numberstr.") valuemap["amount"] = kv.fields.numberstr # TODO: add plus sign valuemap["unit"] = "1" return datavalue if kv.is_quantity(): if kv.fields.numberstr is None: raise ValueError("quantity is missing numberstr.") valuemap["amount"] = kv.fields.numberstr # TODO: add plus sign if kv.fields.units_node is None: # TODO: research this further. # # raise ValueError("quantity is missing units_node for %s." % value) valuemap["init"] = "undefined" else: valuemap[ "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node if kv.fields.low_tolerancestr is not None and len( kv.fields.low_tolerancestr) > 0: valuemap[ "lowerBound"] = kv.fields.low_tolerancestr # TODO: add plus sign if kv.fields.high_tolerancestr is not None and len( kv.fields.high_tolerancestr) > 0: valuemap[ "higherBound"] = kv.fields.high_tolerancestr # TODO: add plus sign return datavalue if kv.is_language_qualified_string(): text: str language: str language_suffix: str text, language, language_suffix = KgtkFormat.destringify( value) # TODO: KgtkValue should do this to text language += language_suffix valuemap["text"] = text valuemap["language"] = language return datavalue if kv.is_string(): valuemap["type"] = "string" valuemap["value"] = KgtkFormat.unstringify( value) # TODO: KgtkValue should do this to text return datavalue if kv.is_date_and_times(): if kv.fields.zonestr is None: raise ValueError("timezone is missing.") if kv.fields.zonestr != "Z": raise ValueError("Only Z-time is supported.") if kv.fields.date_and_time is None: raise ValueError("date_and_time is missing.") valuemap["time"] = kv.fields.date_and_time valuemap["timezone"] = 0 valuemap["before"] = 0 valuemap["after"] = 0 if kv.fields.precision is None: raise ValueError("date_and_time precision is missing.") valuemap["precision"] = kv.fields.precision valuemap[ "calendarmodel"] = "http://www.wikidata.org/entity/" + edge_row[ self.edge_calendar_idx] return datavalue if kv.is_location_coordinates: if kv.fields.latitude is None: raise ValueError("latitude is missing") valuemap["latitude"] = kv.fields.latitude if kv.fields.longitude is None: raise ValueError("longitude is missing") valuemap["longitude"] = kv.fields.longitude valuemap["altitide"] = None # deprecated # TODO: Validate that it's OK to have location coordinates without precision. 
precision: str = edge_row[self.edge_precision_idx] if len(precision) > 0: try: valuemap["precision"] = float( edge_row[self.edge_precision_idx]) except ValueError: print("Invalid precision '%s'" % precision, file=self.error_file, flush=True) valuemap["globe"] = "http://www.wikidata.org/entity/Q2" return datavalue # Default: treat as string. valuemap["type"] = "string" valuemap["value"] = KgtkFormat.unstringify( value) # TODO: KgtkValue should do this to text return datavalue
def process_as_merge(self, ikr: KgtkReader, lkr: KgtkReader): """ Process the lift as a merge between two sorted files. """ if self.verbose: print("Merging sorted input and label files.", file=self.error_file, flush=True) lift_column_idxs: typing.List[int] = self.build_lift_column_idxs(ikr) if len(lift_column_idxs) != 1: raise ValueError("Expecting exactly one lift_column_idxs, got %d" % len(lift_column_idxs)) ew: KgtkWriter lifted_output_column_idxs: typing.List[int] ew, lifted_output_column_idxs = self.open_output_writer( ikr, lift_column_idxs) new_columns: int = len(ew.column_names) - len(ikr.column_names) if new_columns not in (0, 1): raise ValueError("Expecing zero or one new columns, got %d" % new_columns) lift_column_idx: int = lift_column_idxs[0] # For convenience lifted_output_column_idx: int = lifted_output_column_idxs[ 0] # For convenience node1_column_idx: int label_column_idx: int node2_column_idx: int node1_column_idx, label_column_idx, node2_column_idx = self.lookup_label_table_idxs( lkr) current_label_row: typing.Optional[typing.List[str]] = None more_labels: bool = True # Read the first label record. try: current_label_row = lkr.nextrow() except StopIteration: more_labels = False input_line_count: int = 0 # We carry last_value_to_lift and lifted_label_value over # iterations in case the input file has multiple records with # the same value to lift. last_value_to_lift: typing.Optional[str] = None lifted_label_value: str = "" if self.verbose: print("Processing the input records.", file=self.error_file, flush=True) row: typing.List[str] for row in ikr: input_line_count += 1 value_to_lift: str = row[lift_column_idx] if last_value_to_lift is None or value_to_lift != last_value_to_lift: last_value_to_lift = value_to_lift lifted_label_value = "" # Read label records until we come to the first record that # has a node1 value equal to or greater than the value we we want to lift. while more_labels and current_label_row is not None and current_label_row[ node1_column_idx] < value_to_lift: try: current_label_row = lkr.nextrow() except StopIteration: more_labels = False break # While the label records have node1 values equal to the value we are trying to lift, # look for label values from the label file. while more_labels and current_label_row is not None and current_label_row[ node1_column_idx] == value_to_lift: if current_label_row[ label_column_idx] == self.label_column_value: label_value: str = current_label_row[node2_column_idx] if len(label_value) > 0: if len(lifted_label_value) > 0: if self.suppress_duplicate_labels: lifted_label_value = KgtkValue.merge_values( lifted_label_value, label_value) else: lifted_label_value = KgtkFormat.LIST_SEPARATOR.join( (lifted_label_value, label_value)) else: lifted_label_value = label_value try: current_label_row = lkr.nextrow() except StopIteration: more_labels = False break output_row: typing.List[str] = row.copy() if new_columns > 0: output_row.append("") output_row[lifted_output_column_idx] = lifted_label_value ew.write(output_row) if more_labels: lkr.close() if self.verbose: print("Read %d input records." % (input_line_count), file=self.error_file, flush=True)
def run( input_file: KGTKFiles, output_file: KGTKFiles, columns: typing.Optional[typing.List[str]] = None, labels: typing.Optional[typing.List[str]] = None, id_column_name: typing.Optional[str] = None, errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. ) -> int: # import modules locally import os from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) # Show the final option structures for debugging and documentation. if show_options: print("--input-file=%s" % str(input_kgtk_file), file=error_file) print("--output-file=%s" % str(output_kgtk_file), file=error_file) if columns is not None: print("--columns=%s" % " ".join(columns), file=error_file) if labels is not None: print("--labels=%s" % " ".join(labels), file=error_file) if id_column_name is not None: print("--id-column=%s" % id_column_name, file=error_file) reader_options.show(out=error_file) value_options.show(out=error_file) print("=======", file=error_file, flush=True) if verbose: print("Starting normalize_nodes pid=%d" % (os.getpid()), file=error_file, flush=True) label_map: typing.MutableMapping[str, str] = dict() if labels is not None and len(labels) > 0: if columns is None: raise KGTKException( "--columns must be supplied when --labels is used.") if len(columns) != len(labels): raise KGTKException("%d columns were supplied, but %d labels." % (len(columns), len(labels))) idx: int label: str for idx, label in enumerate(labels): label_map[columns[idx]] = label try: if verbose: print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_kgtk_file, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) id_column_idx: int = kr.get_id_column_index(id_column_name) if id_column_idx < 0: raise KGTKException("Unknown ID column %s" % repr(id_column_name)) output_column_names: typing.List[str] = [ KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2 ] if verbose: print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True) kw: KgtkWriter = KgtkWriter.open(output_column_names, output_kgtk_file, mode=KgtkWriter.Mode.EDGE, verbose=verbose, very_verbose=very_verbose) input_line_count: int = 0 output_line_count: int = 0 row: typing.List[str] for row in kr: input_line_count += 1 node1_value: str = row[id_column_idx] column_idx: int column_name: str for column_idx, column_name in enumerate(kr.column_names): if column_idx == id_column_idx: continue if columns is not None and column_name not in columns: continue label_value: str = label_map.get(column_name, column_name) new_value: str = row[column_idx] if len(new_value) == 0: continue # ignore empty values. # The column value might contain a KGTK list. Since node2 isn't supposed # to contain lists, we'll split it. 
node2_value: str for node2_value in KgtkValue.split_list(new_value): if len(node2_value) == 0: continue # node2 shouldn't contain empty values output_row: typing.List[str] = [ node1_value, label_value, node2_value ] kw.write(output_row) output_line_count += 1 if verbose: print("Read %d node rows, wrote %d edge rows." % (input_line_count, output_line_count), file=error_file, flush=True) kw.close() return 0 except Exception as e: kgtk_exception_auto_handler(e) return 1
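# Sketch of the node-file-to-edge-file normalization performed above: every
# non-ID column of a node row becomes an edge whose node1 is the row's ID,
# whose label is the column name (or its --labels override), and whose node2
# is each "|"-separated value.  The sample row and column names are invented.
def _demo_normalize_row(id_value: str,
                        row: typing.Mapping[str, str],
                        label_map: typing.Mapping[str, str]) -> typing.List[typing.List[str]]:
    edges: typing.List[typing.List[str]] = []
    for column_name, cell in row.items():
        if len(cell) == 0:
            continue  # Ignore empty values.
        label_value = label_map.get(column_name, column_name)
        for node2_value in cell.split("|"):
            if len(node2_value) > 0:
                edges.append([id_value, label_value, node2_value])
    return edges

# _demo_normalize_row("Q42", {"color": "red|blue", "size": ""}, {}) ->
#     [['Q42', 'color', 'red'], ['Q42', 'color', 'blue']]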
def process(self): if len(self.column_name) == 0: raise ValueError("The name of the column to explode is empty.") selected_field_names: typing.List[str] = [] field_name: str if self.type_names is not None: if self.verbose: print("Validate the names of the data types to extract.", file=self.error_file, flush=True) type_name: str for type_name in self.type_names: if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS: raise ValueError("Unknown data type name '%s'." % type_name) # Merge this KGTK data type's fields into the list of selected fields: for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[ type_name]: if field_name not in selected_field_names: selected_field_names.append(field_name) if self.field_names is not None: # Forget the fields selected above, choose these instead: selected_field_names = [] if self.verbose: print("Validate the names of the fields to extract.", file=self.error_file, flush=True) for field_name in self.field_names: if field_name not in KgtkValueFields.FIELD_NAMES: raise ValueError("Unknown field name '%s'." % field_name) # Merge this field into the list of selected fields: if field_name not in selected_field_names: selected_field_names.append(field_name) if len(selected_field_names) == 0: raise ValueError("The list of fields to explode is empty.") # Open the input file. if self.verbose: if self.input_file_path is not None: print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True) else: print("Reading the input data from stdin", file=self.error_file, flush=True) kr: KgtkReader = KgtkReader.open( self.input_file_path, error_file=self.error_file, options=self.reader_options, value_options=self.value_options, verbose=self.verbose, very_verbose=self.very_verbose, ) if self.verbose: print("Check that the source column '%s' is present." % self.column_name, file=self.error_file, flush=True) if self.column_name not in kr.column_name_map: raise ValueError("Column name '%s' not found in the input file." % self.column_name) column_idx: int = kr.column_name_map[self.column_name] if self.verbose: print( "Build the map of exploded columns and list of new column names", file=self.error_file, flush=True) explosion: typing.MutableMapping[str, idx] = {} column_names: typing.List[str] = kr.column_names.copy() for field_name in selected_field_names: exploded_name: str = self.prefix + field_name if self.verbose: print("Field '%s' becomes '%s'" % (field_name, exploded_name), file=self.error_file, flush=True) if exploded_name in explosion: raise ValueError( "Field name '%s' is duplicated in the field list.") if exploded_name in kr.column_names: if self.overwrite_columns: existing_idx = kr.column_name_map[exploded_name] explosion[field_name] = existing_idx if self.verbose: print( "Field '%s' is overwriting existing column '%s' (idx=%d)" % (field_name, exploded_name, existing_idx), file=self.error_file, flush=True) else: raise ValueError( "Exploded column '%s' already exists and not allowed to overwrite" % exploded_name) else: column_names.append(exploded_name) exploded_idx: int = len(column_names) - 1 explosion[field_name] = exploded_idx if self.verbose: print("Field '%s' becomes new column '%s' (idx=%d)" % (field_name, exploded_name, exploded_idx), file=self.error_file, flush=True) new_column_count: int = len(column_names) - kr.column_count if self.verbose: print("%d columns + %d columns = %d columns" % (kr.column_count, new_column_count, len(column_names))) print("Explosion length: %d" % len(explosion)) # Open the output file. 
ew: KgtkWriter = KgtkWriter.open(column_names, self.output_file_path, mode=kr.mode, output_format=self.output_format, require_all_columns=False, prohibit_extra_columns=True, fill_missing_columns=True, gzip_in_parallel=False, verbose=self.verbose, very_verbose=self.very_verbose) if self.verbose: print("Expanding records from %s" % self.input_file_path, file=self.error_file, flush=True) input_line_count: int = 0 output_line_count: int = 0 row: typing.List[str] for row in kr: input_line_count += 1 # Parse the value for the colummn being exploded: item_to_explode: str = row[column_idx] value: KgtkValue = KgtkValue(item_to_explode, options=self.value_options, parse_fields=True) value.validate() if not value.is_valid(): if self.verbose: print("Not exploding invalid item '%s' in input line %d" % (item_to_explode, input_line_count), file=self.error_file, flush=True) ew.write(row) # This will be filled to the proper length output_line_count += 1 continue if self.expand_list and value.is_list(): if self.verbose: print("Expanding a list: '%s'" % item_to_explode, file=self.error_file, flush=True) subvalue: KgtkValue for subvalue in value.get_list_items(): if self.very_verbose: print("Exploding '%s'" % subvalue.value) ew.write( self.explode(subvalue, row, explosion, new_column_count)) output_line_count += 1 else: if self.very_verbose: print("Exploding '%s'" % value.value) ew.write(self.explode(value, row, explosion, new_column_count)) output_line_count += 1 if self.verbose: print("Read %d records, wrote %d records." % (input_line_count, output_line_count), file=self.error_file, flush=True) ew.close()
def run(input_file: KGTKFiles, output_file: KGTKFiles, output_format: typing.Optional[str], column_names_list: typing.List[typing.List[str]], into_column_names_list: typing.List[typing.List[str]], operation: str, values_list: typing.List[typing.List[str]], with_values_list: typing.List[typing.List[str]], limit: typing.Optional[int], format_string: typing.Optional[str], errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. )->int: # import modules locally import datetime as dt from pathlib import Path import re import sys from kgtk.exceptions import KGTKException from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.value.kgtkvalueoptions import KgtkValueOptions from kgtk.value.kgtkvalue import KgtkValue input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) # Flatten the input lists. column_names: typing.List[str] = flatten_arg_list(column_names_list) into_column_names: typing.List[str] = flatten_arg_list(into_column_names_list) values: typing.List[str] = flatten_arg_list(values_list) with_values: typing.List[str] = flatten_arg_list(with_values_list) # Show the final option structures for debugging and documentation. if show_options: print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True) print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True) if output_format is not None: print("--output-format=%s" % output_format, file=error_file, flush=True) if len(column_names) > 0: print("--columns %s" % " ".join(column_names), file=error_file, flush=True) if len(into_column_names) > 0: print("--into %s" % " ".join(into_column_names), file=error_file, flush=True) print("--operation=%s" % str(operation), file=error_file, flush=True) if len(values) > 0: print("--values %s" % " ".join(values), file=error_file, flush=True) if len(with_values) > 0: print("--with-values %s" % " ".join(with_values), file=error_file, flush=True) if limit is not None: print("--limit %d" % limit, file=error_file, flush=True) if format_string is not None: print("--format=%s" % format_string, file=error_file, flush=True) reader_options.show(out=error_file) value_options.show(out=error_file) print("=======", file=error_file, flush=True) try: if verbose: print("Opening the input file %s" % str(input_kgtk_file), file=error_file, flush=True) kr = KgtkReader.open(input_kgtk_file, options=reader_options, value_options = value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) remaining_names: typing.List[str] = kr.column_names.copy() selected_names: typing.List[str] = [ ] save_selected_names: typing.Optional[typing.List[str]] = None ellipses: str = "..." # All unmentioned columns ranger: str = ".." # All columns between two columns. 
idx: int saw_ranger: bool = False column_name: str for column_name in column_names: if column_name == ellipses: if save_selected_names is not None: raise KGTKException("Elipses may appear only once") if saw_ranger: raise KGTKException("Elipses may not appear directly after a range operator ('..').") save_selected_names = selected_names selected_names = [ ] continue if column_name == ranger: if len(selected_names) == 0: raise KGTKException("The column range operator ('..') may not appear without a preceeding column name.") saw_ranger = True continue if column_name not in kr.column_names: raise KGTKException("Unknown column name '%s'." % column_name) if column_name not in remaining_names: raise KGTKException("Column name '%s' was duplicated in the list." % column_name) if saw_ranger: saw_ranger = False prior_column_name: str = selected_names[-1] prior_column_idx: int = kr.column_name_map[prior_column_name] column_name_idx: int = kr.column_name_map[column_name] start_idx: int end_idx: int idx_inc: int if column_name_idx > prior_column_idx: start_idx = prior_column_idx + 1 end_idx = column_name_idx - 1 idx_inc = 1 else: start_idx = prior_column_idx - 1 end_idx = column_name_idx + 1 idx_inc = -1 idx = start_idx while idx <= end_idx: idx_column_name: str = kr.column_names[idx] if idx_column_name not in remaining_names: raise KGTKException("Column name '%s' (%s .. %s) was duplicated in the list." % (column_name, prior_column_name, column_name)) selected_names.append(idx_column_name) remaining_names.remove(idx_column_name) idx += idx_inc selected_names.append(column_name) remaining_names.remove(column_name) if saw_ranger: raise KGTKException("The column ranger operator ('..') may not end the list of column names.") if len(remaining_names) > 0 and save_selected_names is None: if verbose: print("Omitting the following columns: %s" % " ".join(remaining_names), file=error_file, flush=True) if save_selected_names is not None: if len(remaining_names) > 0: save_selected_names.extend(remaining_names) if len(selected_names) > 0: save_selected_names.extend(selected_names) selected_names = save_selected_names sources: typing.List[int] = [ ] name: str for name in selected_names: sources.append(kr.column_name_map[name]) new_column_count: int = 0 into_column_idxs: typing.List[int] = [ ] into_column_idx: int output_column_names: typing.List[str] = kr.column_names.copy() into_column_name: str for idx, into_column_name in enumerate(into_column_names): if into_column_name in kr.column_name_map: into_column_idx = kr.column_name_map[into_column_name] into_column_idxs.append(into_column_idx) if verbose: print("Putting result %d of the calculation into old column %d (%s)." % (idx + 1, into_column_idx, into_column_name), file=error_file, flush=True) else: new_column_count += 1 into_column_idx = len(output_column_names) into_column_idxs.append(into_column_idx) output_column_names.append(into_column_name) if verbose: print("Putting result %d of the calculation into new column %d (%s)." 
if verbose:
    print("Opening the output file %s" % str(output_kgtk_file), file=error_file, flush=True)
kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                 output_kgtk_file,
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 gzip_in_parallel=False,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 output_format=output_format,
                                 verbose=verbose,
                                 very_verbose=very_verbose,
                                 )

if limit is None:
    limit = 0
substitute_re: typing.Optional[typing.Pattern] = None

if operation == AND_OP:
    if len(sources) == 0:
        raise KGTKException("And needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("And needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == AVERAGE_OP:
    if len(sources) == 0:
        raise KGTKException("Average needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Average needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == CAPITALIZE_OP:
    if len(sources) == 0:
        raise KGTKException("Capitalize needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Capitalize needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == CASEFOLD_OP:
    if len(sources) == 0:
        raise KGTKException("Casefold needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Casefold needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == COPY_OP:
    if len(sources) == 0:
        raise KGTKException("Copy needs at least one source, got %d" % len(sources))
    if len(selected_names) != len(into_column_idxs):
        raise KGTKException("Copy needs the same number of input columns and into columns, got %d and %d" % (len(selected_names), len(into_column_idxs)))

elif operation == EQ_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Eq needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Eq needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == FROMISOFORMAT_OP:
    if len(sources) != 1:
        raise KGTKException("Fromisoformat needs one source, got %d" % len(sources))
    if len(values) != len(into_column_idxs):
        raise KGTKException("Fromisoformat needs the same number of values and into columns, got %d and %d" % (len(values), len(into_column_idxs)))

elif operation == GE_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Ge needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Ge needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == GT_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Gt needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Gt needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == IS_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Is needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Is needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == IS_IN_OP:
    if len(sources) != 1:
        raise KGTKException("Is in needs one source, got %d" % len(sources))
    if len(values) == 0:
        raise KGTKException("Is in needs at least one value, got %d" % len(values))
    if len(into_column_idxs) != 1:
        raise KGTKException("Is in needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == IS_NOT_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Is not needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Is not needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == JOIN_OP:
    if len(sources) == 0:
        raise KGTKException("Join needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Join needs 1 destination column, got %d" % len(into_column_idxs))
    if len(values) != 1:
        raise KGTKException("Join needs 1 value, got %d" % len(values))

elif operation == LE_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Le needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Le needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == LT_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Lt needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Lt needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == LOWER_OP:
    if len(sources) == 0:
        raise KGTKException("Lower needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Lower needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == MAX_OP:
    if len(sources) == 0:
        raise KGTKException("Max needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Max needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == MIN_OP:
    if len(sources) == 0:
        raise KGTKException("Min needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Min needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == NE_OP:
    if not ((len(sources) == 2 and len(values) == 0) or (len(sources) == 1 and len(values) == 1)):
        raise KGTKException("Ne needs two sources or one source and one value, got %d sources and %d values" % (len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise KGTKException("Ne needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == NOR_OP:
    if len(sources) == 0:
        raise KGTKException("Nor needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Nor needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == NOT_OP:
    if len(sources) == 0:
        raise KGTKException("Not needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != len(sources):
        raise KGTKException("Not needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == OR_OP:
    if len(sources) == 0:
        raise KGTKException("Or needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Or needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == PERCENTAGE_OP:
    if len(into_column_idxs) != 1:
        raise KGTKException("Percent needs 1 destination column, got %d" % len(into_column_idxs))
    if len(selected_names) != 2:
        raise KGTKException("Percent needs 2 input columns, got %d" % len(selected_names))

elif operation == REPLACE_OP:
    if len(into_column_idxs) != 1:
        raise KGTKException("Replace needs 1 destination column, got %d" % len(into_column_idxs))
    if len(selected_names) != 1:
        raise KGTKException("Replace needs 1 input column, got %d" % len(selected_names))
    if len(values) != 1:
        raise KGTKException("Replace needs one value, got %d" % len(values))
    if len(with_values) != 1:
        raise KGTKException("Replace needs one with-value, got %d" % len(with_values))

elif operation == SET_OP:
    if len(sources) != 0:
        raise KGTKException("Set needs no sources, got %d" % len(sources))
    if len(into_column_idxs) == 0:
        raise KGTKException("Set needs at least one destination column, got %d" % len(into_column_idxs))
    if len(values) == 0:
        raise KGTKException("Set needs at least one value, got %d" % len(values))
    if len(into_column_idxs) != len(values):
        raise KGTKException("Set needs the same number of destination columns and values, got %d and %d" % (len(into_column_idxs), len(values)))

elif operation == SUBSTITUTE_OP:
    if len(into_column_idxs) != 1:
        raise KGTKException("Substitute needs 1 destination column, got %d" % len(into_column_idxs))
    if len(selected_names) != 1:
        raise KGTKException("Substitute needs 1 input column, got %d" % len(selected_names))
    if len(values) != 1:
        raise KGTKException("Substitute needs one value, got %d" % len(values))
    if len(with_values) != 1:
        raise KGTKException("Substitute needs one with-value, got %d" % len(with_values))
    substitute_re = re.compile(values[0])

elif operation == SUM_OP:
    if len(sources) == 0:
        raise KGTKException("Sum needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Sum needs 1 destination column, got %d" % len(into_column_idxs))

elif operation == SWAPCASE_OP:
    if len(sources) == 0:
        raise KGTKException("Swapcase needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Swapcase needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == TITLE_OP:
    if len(sources) == 0:
        raise KGTKException("Title needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Title needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == UPPER_OP:
    if len(sources) == 0:
        raise KGTKException("Upper needs at least one source, got %d" % len(sources))
    if len(sources) != len(into_column_idxs):
        raise KGTKException("Upper needs the same number of input columns and into columns, got %d and %d" % (len(sources), len(into_column_idxs)))

elif operation == XOR_OP:
    if len(sources) == 0:
        raise KGTKException("Xor needs at least one source, got %d" % len(sources))
    if len(into_column_idxs) != 1:
        raise KGTKException("Xor needs 1 destination column, got %d" % len(into_column_idxs))
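# Illustrative sketch: the comparison operators above (eq, ge, gt, is, is-not, le, lt, ne)
# all share one argument rule -- either two source columns, or one source column plus one
# literal value, together with exactly one destination column.  The helper below restates
# that rule in isolation; it is a refactoring idea, not a function that exists in KGTK.

import typing

def check_comparison_args(op_name: str,
                          sources: typing.List[int],
                          values: typing.List[str],
                          into_column_idxs: typing.List[int]) -> None:
    """Raise ValueError unless we got (2 sources, 0 values) or (1 source, 1 value),
    and exactly one destination column."""
    if not ((len(sources) == 2 and len(values) == 0) or
            (len(sources) == 1 and len(values) == 1)):
        raise ValueError("%s needs two sources or one source and one value, got %d sources and %d values"
                         % (op_name, len(sources), len(values)))
    if len(into_column_idxs) != 1:
        raise ValueError("%s needs 1 destination column, got %d" % (op_name, len(into_column_idxs)))

check_comparison_args("Eq", [0, 1], [], [2])     # two columns: accepted
check_comparison_args("Eq", [0], ["10.0"], [2])  # one column and one literal: accepted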
fs: str = format_string if format_string is not None else "%5.2f"

item: str
item2: str
kv: KgtkValue
bresult: bool

into_column_idx = into_column_idxs[0]  # for convenience

input_data_lines: int = 0
row: typing.List[str]
for row in kr:
    input_data_lines += 1

    output_row: typing.List[str] = row.copy()
    for idx in range(new_column_count):
        output_row.append("")  # Easiest way to add a new column.

    if operation == AND_OP:
        bresult = True
        for idx in sources:
            kv = KgtkValue(row[idx])
            if kv.is_boolean():
                bresult = bresult and kv.is_true()
        output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

    elif operation == AVERAGE_OP:
        atotal: float = 0
        acount: int = 0
        for idx in sources:
            item = row[idx]
            if len(item) > 0:
                atotal += float(item)
                acount += 1
        output_row[into_column_idx] = (fs % (atotal / float(acount))) if acount > 0 else ""

    elif operation == CAPITALIZE_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].capitalize()

    elif operation == CASEFOLD_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].casefold()

    elif operation == COPY_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]]

    elif operation == EQ_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) == float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) == float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == FROMISOFORMAT_OP:
        dtval: str = row[sources[0]]
        if dtval.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
            kgtkdatestr: str = row[sources[0]][1:]  # Strip the leading ^
            isodatestr: str
            precisionstr: str
            if "/" in kgtkdatestr:
                isodatestr, precisionstr = kgtkdatestr.split("/")
            else:
                isodatestr = kgtkdatestr
                precisionstr = ""
            if isodatestr.endswith("Z"):
                isodatestr = isodatestr[:-1]

            into_idx: int
            value_name: str
            try:
                dtvar: dt.datetime = dt.datetime.fromisoformat(isodatestr)
                for idx in range(len(values)):
                    value_name = values[idx]
                    into_idx = into_column_idxs[idx]
                    if value_name == "year":
                        output_row[into_idx] = str(dtvar.year)
                    elif value_name == "month":
                        output_row[into_idx] = str(dtvar.month)
                    elif value_name == "day":
                        output_row[into_idx] = str(dtvar.day)
                    elif value_name == "hour":
                        output_row[into_idx] = str(dtvar.hour)
                    elif value_name == "minute":
                        output_row[into_idx] = str(dtvar.minute)
                    elif value_name == "second":
                        output_row[into_idx] = str(dtvar.second)
                    elif value_name == "microsecond":
                        output_row[into_idx] = str(dtvar.microsecond)
                    elif value_name == "error":
                        output_row[into_idx] = ""
                    else:
                        raise KGTKException("Unknown date component %s" % repr(value_name))
            except ValueError as e:
                print("Error parsing %s in [%s]: %s" % (repr(isodatestr), "|".join([repr(x) for x in row]), str(e)),
                      file=error_file, flush=True)
                for idx in range(len(values)):
                    value_name = values[idx]
                    into_idx = into_column_idxs[idx]
                    if value_name == "error":
                        output_row[into_idx] = str(e)
                    else:
                        output_row[into_idx] = ""
        else:
            # Not a date/time value, clear the result columns.
            for idx in range(len(values)):
                output_row[into_column_idxs[idx]] = ""

    elif operation == GE_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) >= float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) >= float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == GT_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) > float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) > float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == IS_OP:
        if len(sources) == 2:
            output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] == row[sources[1]])
        else:
            output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] == values[0])

    elif operation == IS_IN_OP:
        bresult = False
        item = row[sources[0]]
        for item2 in values:
            if item == item2:
                bresult = True
                break
        output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

    elif operation == IS_NOT_OP:
        if len(sources) == 2:
            output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] != row[sources[1]])
        else:
            output_row[into_column_idx] = KgtkValue.to_boolean(row[sources[0]] != values[0])

    elif operation == JOIN_OP:
        output_row[into_column_idx] = values[0].join((row[sources[idx]] for idx in range(len(sources))))

    elif operation == LE_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) <= float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) <= float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == LT_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) < float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) < float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == LOWER_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].lower()

    elif operation == MAX_OP:
        max_result: typing.Optional[float] = None
        for idx in sources:
            item = row[idx]
            if len(item) > 0:
                max_value: float = float(item)
                if max_result is None or max_value > max_result:
                    max_result = max_value
        output_row[into_column_idx] = (fs % max_result) if max_result is not None else ""

    elif operation == MIN_OP:
        min_result: typing.Optional[float] = None
        for idx in sources:
            item = row[idx]
            if len(item) > 0:
                min_value: float = float(item)
                if min_result is None or min_value < min_result:
                    min_result = min_value
        output_row[into_column_idx] = (fs % min_result) if min_result is not None else ""

    elif operation == NAND_OP:
        bresult = True
        for idx in sources:
            kv = KgtkValue(row[idx])
            if kv.is_boolean():
                bresult = bresult and kv.is_true()
        output_row[into_column_idx] = KgtkValue.to_boolean(not bresult)

    elif operation == NE_OP:
        if len(sources) == 2:
            if len(row[sources[0]]) > 0 and len(row[sources[1]]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) != float(row[sources[1]]))
            else:
                output_row[into_column_idx] = ""
        else:
            if len(row[sources[0]]) > 0 and len(values[0]) > 0:
                output_row[into_column_idx] = KgtkValue.to_boolean(float(row[sources[0]]) != float(values[0]))
            else:
                output_row[into_column_idx] = ""

    elif operation == NOR_OP:
        bresult = False
        for idx in sources:
            kv = KgtkValue(row[idx])
            if kv.is_boolean():
                bresult = bresult or kv.is_true()
        output_row[into_column_idx] = KgtkValue.to_boolean(not bresult)

    elif operation == NOT_OP:
        # Pair each source column with its destination column by position.
        for idx in range(len(sources)):
            kv = KgtkValue(row[sources[idx]])
            if kv.is_boolean():
                output_row[into_column_idxs[idx]] = KgtkValue.to_boolean(not kv.is_true())
            else:
                output_row[into_column_idxs[idx]] = ""

    elif operation == OR_OP:
        bresult = False
        for idx in sources:
            kv = KgtkValue(row[idx])
            if kv.is_boolean():
                bresult = bresult or kv.is_true()
        output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

    elif operation == PERCENTAGE_OP:
        output_row[into_column_idx] = fs % (float(row[sources[0]]) * 100 / float(row[sources[1]]))

    elif operation == REPLACE_OP:
        if limit == 0:
            output_row[into_column_idx] = row[sources[0]].replace(values[0], with_values[0])
        else:
            output_row[into_column_idx] = row[sources[0]].replace(values[0], with_values[0], limit)

    elif operation == SET_OP:
        for idx in range(len(values)):
            output_row[into_column_idxs[idx]] = values[idx]

    elif operation == SUBSTITUTE_OP and substitute_re is not None:
        output_row[into_column_idx] = substitute_re.sub(with_values[0], row[sources[0]], count=limit)

    elif operation == SUM_OP:
        total: float = 0
        for idx in sources:
            item = row[idx]
            if len(item) > 0:
                total += float(item)
        for item in values:
            if len(item) > 0:
                total += float(item)
        output_row[into_column_idx] = fs % total

    elif operation == SWAPCASE_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].swapcase()

    elif operation == TITLE_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].title()

    elif operation == UPPER_OP:
        for idx in range(len(sources)):
            output_row[into_column_idxs[idx]] = row[sources[idx]].upper()

    elif operation == XOR_OP:
        bresult = False
        for idx in sources:
            kv = KgtkValue(row[idx])
            if kv.is_boolean():
                bresult = bresult != kv.is_true()
        output_row[into_column_idx] = KgtkValue.to_boolean(bresult)

    kw.write(output_row)

# Flush the output file so far:
kw.flush()

if verbose:
    print("Read %d data lines from file %s" % (input_data_lines, input_kgtk_file), file=error_file, flush=True)

kw.close()

return 0

except SystemExit as e:
    raise KGTKException("Exit requested")
except Exception as e:
    raise KGTKException(str(e))
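# Illustrative sketch: the boolean operations in the row loop above fold KGTK boolean cells
# with and/or/xor while skipping cells that are not booleans.  The stand-ins below use plain
# string tests for KgtkValue.is_boolean()/is_true() and the literals "True"/"False" for
# KgtkValue.to_boolean(); the real KgtkValue class may differ in detail.

import typing

def fold_booleans(cells: typing.List[str], op: str) -> str:
    """Fold KGTK-style boolean cells; non-boolean cells are ignored,
    mirroring the is_boolean() guard in the row loop."""
    result: bool = True if op == "and" else False
    for cell in cells:
        if cell not in ("True", "False"):    # stand-in for kv.is_boolean()
            continue
        is_true: bool = (cell == "True")     # stand-in for kv.is_true()
        if op == "and":
            result = result and is_true
        elif op == "or":
            result = result or is_true
        elif op == "xor":
            result = result != is_true
    return "True" if result else "False"     # stand-in for KgtkValue.to_boolean()

assert fold_booleans(["True", "False", "True"], "and") == "False"
assert fold_booleans(["False", "", "True"], "or") == "True"
assert fold_booleans(["True", "True"], "xor") == "False"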
def run( input_file: KGTKFiles, output_file: KGTKFiles, new_edges_file: KGTKFiles, base_columns: typing.Optional[typing.List[str]] = None, columns_to_lower: typing.Optional[typing.List[str]] = None, label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE, lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR, lower: bool = False, normalize: bool = False, deduplicate_new_edges: bool = True, errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. ) -> int: # import modules locally from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) new_edges_kgtk_file: typing.Optional[ Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file, who="Label file") # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) # Show the final option structures for debugging and documentation. if show_options: print("--input-file=%s" % str(input_kgtk_file), file=error_file) print("--output-file=%s" % str(output_kgtk_file), file=error_file) if new_edges_kgtk_file is not None: print("--label-file=%s" % str(new_edges_kgtk_file), file=error_file) if base_columns is not None: print("--base-columns=%s" % " ".join(base_columns), file=error_file) if columns_to_lower is not None: print("--columns-to-lower=%s" % " ".join(columns_to_lower), file=error_file) print("--label-value=%s" % label_value, file=error_file) print("--lift-separator=%s" % lift_separator, file=error_file) print("--lower=%s" % lower, file=error_file) print("--normalize=%s" % normalize, file=error_file) print("--deduplicate-labels=%s" % deduplicate_new_edges, file=error_file) reader_options.show(out=error_file) value_options.show(out=error_file) print("=======", file=error_file, flush=True) if not lower and not normalize: raise KGTKException( "One or both of --lower and --normalize must be requested.") try: if verbose: print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_kgtk_file, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) # Map the index of a column being removed to the index of the base column that supplies its node1 value. 
lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict() node1_column_name: str = kr.get_node1_column_actual_name() label_column_name: str = kr.get_label_column_actual_name() node2_column_name: str = kr.get_node2_column_actual_name() id_column_name: str = kr.get_id_column_actual_name() key_column_names: typing.List[str] = list() key_column_idxs: typing.Set[int] = set() if node1_column_name != "": if verbose: print("Node1 column name: %s" % node1_column_name, file=error_file, flush=True) key_column_names.append(node1_column_name) key_column_idxs.add(kr.node1_column_idx) if label_column_name != "": if verbose: print("Label column name: %s" % label_column_name, file=error_file, flush=True) key_column_names.append(label_column_name) key_column_idxs.add(kr.label_column_idx) if node2_column_name != "": if verbose: print("Node2 column name: %s" % node2_column_name, file=error_file, flush=True) key_column_names.append(node2_column_name) key_column_idxs.add(kr.node2_column_idx) if id_column_name != "": if verbose: print("Id column name: %s" % id_column_name, file=error_file, flush=True) key_column_names.append(id_column_name) key_column_idxs.add(kr.id_column_idx) elif normalize: raise KGTKException( "--normalize was requested but the ID column was not found.") base_name: str new_label_value: str column_name: str idx: int # There are three option patterns. if columns_to_lower is not None and len( columns_to_lower) > 0 and base_columns is not None and len( base_columns) > 0: # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower) # column_names and base_columns are paired. New records use label_value. if len(columns_to_lower) != len(base_columns): raise KGTKException( "There are %d columns to remove but only %d base columns." % (len(columns_to_lower), len(base_columns))) if len(label_value) == 0: raise KGTKException("The --label-value must not be empty.") for idx, column_name in enumerate(columns_to_lower): base_name = base_columns[idx] if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." % repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." % repr(column_name)) if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is unknown" % (repr(column_name), repr(base_name))) if normalize and base_name == id_column_name: lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], column_name) else: if not lower: raise KGTKException( "--lower is not enabled for column %s, base name %s" % (repr(column_name), repr(base_name))) lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], label_value) elif columns_to_lower is not None and len(columns_to_lower) > 0 and ( base_columns is None or len(base_columns) == 0): # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0 # Each column name is split at the lift separator to determine the base name and label value. if len(lift_separator) == 0: raise KGTKException("The --lift-separator must not be empty.") for idx, column_name in enumerate(columns_to_lower): if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." % repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." 
% repr(column_name)) if lower and lift_separator in column_name: base_name, new_label_value = column_name.split( lift_separator, 1) if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is not known" % (repr(column_name), repr(base_name))) elif normalize: base_name = id_column_name new_label_value = column_name else: raise KGTKException( "Unable to parse column name %s, no separator (%s)." % (repr(column_name), repr(lift_separator))) lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], new_label_value) elif columns_to_lower is None or len(columns_to_lower) == 0: # Pattern 3: len(columns_to_lower) == 0. # Any column that matches a lift pattern against one of the # key columns (node1, label, node2, id, or their aliases) # will be lowered. if len(lift_separator) == 0: raise KGTKException("The --lift-separator must not be empty.") if base_columns is None or len(base_columns) == 0: # The base name list wasn't supplied. Use [node1, label, node2, id] base_columns = list(key_column_names) if verbose: print("Using the default base columns: %s" % " ".join(base_columns), file=error_file, flush=True) else: if verbose: print("Using these base columns: %s" % " ".join(base_columns), file=error_file, flush=True) for idx, column_name in enumerate(kr.column_names): # Skip the node1, label, node12, and id columns if idx in key_column_idxs: if verbose: print("column %s is a key column, skipping." % repr(column_name), file=error_file, flush=True) continue # Does this column match a lifting pattern? if lower and lift_separator in column_name: base_name, new_label_value = column_name.split( lift_separator, 1) if base_name not in base_columns: if verbose: print( "Column %s contains base name %s, which is not a base column." % (repr(column_name), repr(base_name)), file=error_file, flush=True) continue elif normalize: base_name = id_column_name new_label_value = column_name else: if verbose: print( "Column %s does not contain the separator %s and not normalizing, skipping." % (repr(column_name), repr(lift_separator)), file=error_file, flush=True) continue # This test should be redundant. if base_name in kr.column_names: lower_map[idx] = (kr.column_name_map[base_name], new_label_value) else: raise KGTKException( "Base name %s was unexpectedly not found." 
% repr(base_name)) if len(lower_map) == 0: raise KGTKException("There are no columns to lower or normalize.") if verbose: print("The following columns will be lowered or normalized", file=error_file, flush=True) for idx in sorted(lower_map.keys()): column_name = kr.column_names[idx] base_idx, new_label_value = lower_map[idx] base_name = kr.column_names[base_idx] print(" %s from %s (label %s)" % (column_name, base_name, repr(new_label_value)), file=error_file, flush=True) output_column_names: typing.List[str] = list() for idx, column_name in enumerate(kr.column_names): if idx not in lower_map: output_column_names.append(column_name) if verbose: print("The output columns are: %s" % " ".join(output_column_names), file=error_file, flush=True) if verbose: print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True) kw: KgtkWriter = KgtkWriter.open( output_column_names, output_kgtk_file, mode=KgtkWriter.Mode.EDGE, require_all_columns=False, # Simplifies writing the labels verbose=verbose, very_verbose=very_verbose) shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names) lkw: typing.Optional[KgtkWriter] = None if new_edges_kgtk_file is not None: if verbose: print("Opening the label output file: %s" % str(new_edges_kgtk_file), file=error_file, flush=True) label_column_names = [ node1_column_name, label_column_name, node2_column_name ] lkw = KgtkWriter.open(label_column_names, new_edges_kgtk_file, mode=KgtkWriter.Mode.EDGE, verbose=verbose, very_verbose=very_verbose) # Optionally deduplicate the labels # set(node1_value + KgtkFormat.SEPARATOR + node2_value) label_set: typing.Set[str] = set() label_key: str input_line_count: int = 0 output_line_count: int = 0 label_line_count: int = 0 row: typing.List[str] for row in kr: input_line_count += 1 kw.write(row, shuffle_list=shuffle_list) output_line_count += 1 column_idx: int for column_idx in lower_map.keys(): node1_idx: int node1_idx, new_label_value = lower_map[column_idx] node1_value: str node1_value = row[node1_idx] if len(node1_value) == 0: continue # TODO: raise an exception item: str = row[column_idx] if len(item) == 0: continue # Ignore empty node2 values. # Ths item might be a KGTK list. Let's split it, because # lists aren't allow in the node2 values we'll generate. node2_value: str for node2_value in KgtkValue.split_list(item): if len(node2_value) == 0: continue # Ignore empty node2 values. if deduplicate_new_edges: label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value if label_key in label_set: continue else: label_set.add(label_key) output_map: typing.Mapping[str, str] = { node1_column_name: node1_value, label_column_name: new_label_value, node2_column_name: node2_value, } if lkw is None: kw.writemap(output_map) label_line_count += 1 output_line_count += 1 else: lkw.writemap(output_map) label_line_count += 1 if verbose: print("Read %d rows, wrote %d rows with %d labels." % (input_line_count, output_line_count, label_line_count), file=error_file, flush=True) kw.close() if lkw is not None: lkw.close() return 0 except Exception as e: kgtk_exception_auto_handler(e) return 1
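# Illustrative sketch: the lowering loop above takes each extra column (for example
# "node1;name") and, for every pipe-separated value in that column, emits a new
# (node1, label, node2) edge while dropping the column from the core row.  The dict-based
# rows and the plain split("|") below are simplifications of KgtkReader/KgtkWriter rows
# and KgtkValue.split_list(); the helper name is illustrative only.

import typing

def lower_row(row: typing.Mapping[str, str],
              lowered_column: str,
              base_column: str,
              label_value: str) -> typing.Tuple[typing.Dict[str, str], typing.List[typing.Dict[str, str]]]:
    """Return (core row without the lowered column, list of new edges)."""
    core: typing.Dict[str, str] = {k: v for k, v in row.items() if k != lowered_column}
    new_edges: typing.List[typing.Dict[str, str]] = []
    node1_value: str = row[base_column]
    for node2_value in row[lowered_column].split("|"):  # simplified KgtkValue.split_list()
        if len(node2_value) == 0:
            continue  # ignore empty list items, as the real loop does
        new_edges.append({"node1": node1_value, "label": label_value, "node2": node2_value})
    return core, new_edges

row = {"node1": "Q1", "label": "friend", "node2": "Q2", "node1;name": '"Alice"|"Alicia"'}
core, edges = lower_row(row, "node1;name", "node1", "name")
# core now omits "node1;name"; edges holds (Q1, name, "Alice") and (Q1, name, "Alicia").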
def implode_language_qualified_string( self, input_line_count: int, row: typing.List[str], implosion: typing.Mapping[str, int], type_name: str, ) -> typing.Tuple[str, bool]: valid: bool = True text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME] text_val: str = row[text_idx] if len(text_val) == 0: valid = False if self.verbose: print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) elif len(text_val) == 1: valid = False if self.verbose: print("Input line %d: data type '%s': %s field is too short" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) else: if not text_val.startswith('"'): valid = False if self.verbose: print( "Input line %d: data type '%s': %s field does not start with a double quote" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) if not text_val.endswith('"'): valid = False if self.verbose: print( "Input line %d: data type '%s': %s field does not end with a double quote" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME] language_val: str = self.unwrap(row[language_idx]) if len(language_val) == 0: valid = False if self.verbose: print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.LANGUAGE_FIELD_NAME), file=self.error_file, flush=True) suf_idx: int = implosion[KgtkValueFields.LANGUAGE_SUFFIX_FIELD_NAME] suf: str = self.unwrap(row[suf_idx]) if suf_idx >= 0 else "" if len(suf) > 0 and not suf.startswith("-"): # As a siecial favor, we'll accept language suffixes that do not # start with a dash. We'll prepend the dash. suf = "-" + suf value: str = "" if valid: # This subterfuge uses Python's literal parser to parse the string. if not self.escape_pipes: # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|). # (this is documented behavior) so we will remove escaped pipes manually. text_val = text_val.replace('\\|', '|') value = KgtkFormat.stringify(ast.literal_eval(text_val), language=language_val, language_suffix=suf) if valid and self.validate: kv: KgtkValue = KgtkValue(value, options=self.value_options) valid = kv.is_language_qualified_string(validate=True) if not valid: if self.verbose: print( "Input line %d: data type '%s': imploded value '%s' is not a valid language qualified string." % (input_line_count, type_name, value), file=self.error_file, flush=True) return value, valid
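# Illustrative sketch: the implosion above rebuilds a KGTK language-qualified string
# ('text'@lang or 'text'@lang-suffix) from its exploded fields.  The helper below is a
# hypothetical stand-in for that path; KgtkFormat.stringify handles quoting and escaping
# more carefully than this simplified rebuild does.

import ast

def implode_lq_string_sketch(text_field: str, language: str, suffix: str = "") -> str:
    if len(suffix) > 0 and not suffix.startswith("-"):
        suffix = "-" + suffix                        # same favor the real code grants
    text: str = ast.literal_eval(text_field)         # '"Hello"' -> 'Hello'
    return "'" + text.replace("'", "\\'") + "'@" + language + suffix

assert implode_lq_string_sketch('"Hello"', "en", "GB") == "'Hello'@en-GB"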
def implode_string( self, input_line_count: int, row: typing.List[str], implosion: typing.Mapping[str, int], type_name: str, ) -> typing.Tuple[str, bool]: valid: bool = True if KgtkValueFields.LANGUAGE_FIELD_NAME in implosion: language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME] if language_idx >= 0: language_val: str = self.unwrap(row[language_idx]) if len(language_val) > 0: if self.general_strings: return self.implode_language_qualified_string( input_line_count, row, implosion, type_name) else: valid = False if self.verbose: print( "Input line %d: data type '%s': %s field is not empty" % (input_line_count, type_name, KgtkValueFields.LANGUAGE_FIELD_NAME), file=self.error_file, flush=True) text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME] text_val: str = row[text_idx] if len(text_val) == 0: valid = False if self.verbose: print("Input line %d: data type '%s': %s field is empty" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) elif len(text_val) == 1: valid = False if self.verbose: print("Input line %d: data type '%s': %s field is too short" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) else: if not text_val.startswith('"'): valid = False if self.verbose: print( "Input line %d: data type '%s': %s field does not start with a double quote" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) if not text_val.endswith('"'): valid = False if self.verbose: print( "Input line %d: data type '%s': %s field does not end with a double quote" % (input_line_count, type_name, KgtkValueFields.TEXT_FIELD_NAME), file=self.error_file, flush=True) value: str = "" if valid: # This subterfuge uses Python's literal parser to parse the string. if not self.escape_pipes: # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|). # (this is documented behavior) so we will remove escaped pipes manually. text_val = text_val.replace('\\|', '|') value = KgtkFormat.stringify(ast.literal_eval(text_val)) if valid and self.validate: kv: KgtkValue = KgtkValue(value, options=self.value_options) valid = kv.is_string(validate=True) if not valid: if self.verbose: print( "Input line %d: data type '%s': imploded value '%s' is not a valid string." % (input_line_count, type_name, value), file=self.error_file, flush=True) return value, valid
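# Illustrative check: the replace('\\|', '|') step above exists because Python's literal
# parser keeps the backslash of an unrecognized escape such as \| (recent CPython versions
# also warn about it).  This standalone demonstration is not KGTK code.

import ast

raw_field = '"a\\|b"'                            # the text field as it arrives from the input file
assert ast.literal_eval(raw_field) == "a\\|b"    # the backslash before '|' survives parsing

unescaped = raw_field.replace('\\|', '|')        # the manual fix applied when escape_pipes is off
assert ast.literal_eval(unescaped) == "a|b"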