def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file from input to output through a KgtkIdBuilder.

    The reader is opened first because the ID builder is constructed from
    it; the builder then supplies the (possibly revised) list of column
    names used to open the writer, and finally ``idb.process(kr, ew)``
    moves the data stream.

    Args:
        input_file: input file specification, resolved with
            ``KGTKArgumentParser.get_input_file``.
        output_file: output file specification, resolved with
            ``KGTKArgumentParser.get_output_file``.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; this
            function does not consult it (stderr is already the default).
        show_options: print the resolved options before processing.
        verbose: passed to the reader and writer.
        very_verbose: passed to the reader and writer.
        **kwargs: consumed by the ``from_dict`` constructors of the ID
            builder, reader, and value option structures.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any exception (or SystemExit) raised while
            processing; the original exception is attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Track the open handles so that a failure part-way through does not
    # leak them.
    kr: typing.Optional[KgtkReader] = None
    ew: typing.Optional[KgtkWriter] = None
    try:
        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Create the ID builder.
        idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

        # Open the output file.
        ew = KgtkWriter.open(idb.column_names,
                             output_kgtk_file,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             require_all_columns=True,
                             prohibit_extra_columns=True,
                             fill_missing_columns=False,
                             gzip_in_parallel=False,
                             verbose=verbose,
                             very_verbose=very_verbose)

        # Process the input file, building IDs.
        idb.process(kr, ew)

        # Clean up.  Errors raised by close() on the success path are still
        # reported as KGTKException by the handlers below.
        ew.close()
        ew = None
        kr.close()
        kr = None

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
    finally:
        # Anything still open here means we are unwinding from a failure.
        # Close on a best-effort basis so the original error is the one
        # the caller sees.
        for handle in (ew, kr):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    pass
def process(self):
    """Read the input file, key each row, and hand the rows to process_row.

    The key columns come from ``self.key_column_names`` when supplied;
    otherwise they default to the required columns of an edge file
    (node1, label, node2, plus id unless ``self.compact_id``) or of a node
    file (id).  With ``self.deduplicate`` every column that is neither a
    key column nor a keep-first column is added to the key.

    Rows are passed to ``self.process_row`` either in input order
    (``self.sorted_input``, optionally verifying that the keys are
    monotonic) or grouped in memory and visited in sorted key order.
    A final ``process_row(..., flush=True)`` call ends the stream.

    Raises:
        ValueError: ID building requested without options, an unknown key
            or keep-first column, a keep-first column that is also a key
            column, an input file that is neither edge nor node when no
            key columns were supplied, or a sort violation.
    """
    # Announce where the data comes from.
    if self.verbose:
        if self.input_file_path is None:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)
        else:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )
    self.id_column_idx = kr.id_column_idx

    # The output columns are the reader's, unless an ID builder revises them.
    idb: typing.Optional[KgtkIdBuilder] = None
    output_column_names: typing.List[str]
    if not self.build_id:
        output_column_names = kr.column_names
    elif self.idbuilder_options is None:
        raise ValueError(
            "ID build requested but ID builder options are missing")
    else:
        idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
        output_column_names = idb.column_names

    # Work out which column indexes make up the key.
    key_idx_list: typing.List[int] = []
    if len(self.key_column_names) > 0:
        # Explicit key columns: duplicates are dropped silently, unknown
        # names are an error.
        #
        # TODO: warn about duplicates?
        key_name: str
        for key_name in self.key_column_names:
            if key_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" %
                                 (repr(key_name)))
            named_idx: int = kr.column_name_map[key_name]
            if named_idx not in key_idx_list:
                key_idx_list.append(named_idx)
    elif kr.is_edge_file:
        # The KGTK edge file required columns.
        key_idx_list.extend(
            [kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx])
        if not self.compact_id and kr.id_column_idx >= 0:
            key_idx_list.append(kr.id_column_idx)
    elif kr.is_node_file:
        # The KGTK node file required column.
        key_idx_list.append(kr.id_column_idx)
    else:
        raise ValueError(
            "The input file is neither an edge nor a node file.  Key columns must be supplied."
        )

    if self.verbose:
        print("key indexes: %s" % " ".join(map(str, key_idx_list)),
              file=self.error_file, flush=True)

    # Resolve the keep-first columns; none of them may be a key column.
    self.keep_first_idx_list.clear()
    if len(self.keep_first_names) > 0:
        first_name: str
        for first_name in self.keep_first_names:
            if first_name not in kr.column_name_map:
                raise ValueError(
                    "Keep first column %s is not in the input file" %
                    (repr(first_name)))
            first_idx: int = kr.column_name_map[first_name]
            if first_idx in key_idx_list:
                raise ValueError(
                    "Keep first column %s may not be a key column" %
                    (repr(first_name)))
            self.keep_first_idx_list.append(first_idx)
        if self.verbose:
            print("keep first indexes: %s" %
                  " ".join(map(str, self.keep_first_idx_list)),
                  file=self.error_file, flush=True)

    if self.deduplicate:
        if (self.compact_id and kr.id_column_idx >= 0
                and kr.id_column_idx not in self.keep_first_idx_list):
            self.keep_first_idx_list.append(kr.id_column_idx)

        # Every remaining column (not keep-first, not already a key)
        # becomes part of the key.
        column_idx: int
        for column_idx in range(kr.column_count):
            if column_idx in self.keep_first_idx_list or column_idx in key_idx_list:
                continue
            key_idx_list.append(column_idx)

        if self.verbose:
            print("revised key indexes: %s" %
                  " ".join(map(str, key_idx_list)),
                  file=self.error_file, flush=True)

    if self.verbose:
        print("key indexes: %s" % " ".join(map(str, key_idx_list)),
              file=self.error_file, flush=True)

    def open_writer(file_path) -> KgtkWriter:
        # Both output files share the same columns and writer settings.
        return KgtkWriter.open(
            output_column_names,
            file_path,
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

    # Open the output file, then the optional list output file.
    ew: KgtkWriter = open_writer(self.output_file_path)
    lew: typing.Optional[KgtkWriter] = None
    if self.list_output_file_path is not None:
        lew = open_writer(self.list_output_file_path)

    input_line_count: int = 0
    row: typing.List[str] = []
    input_key: str
    prev_input_key: typing.Optional[str] = None
    going_up: typing.Optional[bool] = None  # None until the direction is known
    if self.sorted_input:
        if self.verbose:
            print("Reading the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if self.verify_sort:
                if prev_input_key is None:
                    prev_input_key = input_key
                elif prev_input_key != input_key:
                    # The first key change fixes the direction; any later
                    # change in the opposite direction is a violation.
                    ascending: bool = prev_input_key < input_key
                    if going_up is None:
                        going_up = ascending
                    elif going_up and not ascending:
                        raise ValueError(
                            "Line %d sort violation going up: prev='%s' curr='%s'"
                            % (input_line_count,
                               prev_input_key.replace(
                                   self.field_separator,
                                   KgtkFormat.LIST_SEPARATOR),
                               input_key.replace(
                                   self.field_separator,
                                   KgtkFormat.LIST_SEPARATOR)))
                    elif ascending and not going_up:
                        raise ValueError(
                            "Line %d sort violation going down: prev='%s' curr='%s'"
                            % (input_line_count,
                               prev_input_key.replace(
                                   self.field_separator,
                                   KgtkFormat.LIST_SEPARATOR),
                               input_key.replace(
                                   self.field_separator,
                                   KgtkFormat.LIST_SEPARATOR)))
                    prev_input_key = input_key

            self.process_row(input_key, row, input_line_count, idb, ew, lew)

    else:
        if self.verbose:
            print("Sorting the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        # Group the rows in memory by key value.
        input_map: typing.MutableMapping[
            str, typing.List[typing.List[str]]] = {}
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            input_map.setdefault(input_key, []).append(row)

        if self.verbose:
            print("Processing the sorted input data",
                  file=self.error_file, flush=True)
        # Note: every row is reported with the total line count here, not
        # with its own input line number.
        for input_key in sorted(input_map):
            for row in input_map[input_key]:
                self.process_row(input_key, row, input_line_count, idb, ew,
                                 lew)

    # Flush the final row, if any.  We pass the last row read for
    # feedback, such as an ID uniqueness violation.
    self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

    if self.verbose:
        print("Read %d records, excluded %d records, wrote %d records." %
              (input_line_count, self.excluded_row_count,
               self.output_line_count),
              file=self.error_file, flush=True)
        if lew is not None:
            print("Wrote %d list ouput records." %
                  (self.list_output_line_count),
                  file=self.error_file, flush=True)

    ew.close()
    if lew is not None:
        lew.close()
def process(self):
    """Convert the lines of every input file into KGTK edge rows.

    Each input line is split by ``self.parse`` into a row; its first three
    fields are converted with ``self.convert_and_validate`` and, when all
    three are acceptable, written with ``self.write_row``.  Lines that fail
    parsing or validation are counted and, when a reject file is
    configured, copied to it verbatim.  After the last input file,
    ``self.save_namespaces`` is called with the output writer.

    A path of "-" selects stdin for input and stdout for the reject file;
    those standard streams are never closed here.  The output writer and
    any file opened by this method are closed even when processing fails.
    """
    output_column_names: typing.List[str]
    if self.build_id and self.idbuilder_options is not None:
        self.idbuilder = KgtkIdBuilder.from_column_names(
            self.COLUMN_NAMES, self.idbuilder_options)
        output_column_names = self.idbuilder.column_names
    else:
        output_column_names = self.COLUMN_NAMES

    if self.verbose:
        print("Opening output file %s" % str(self.output_file_path),
              file=self.error_file, flush=True)
    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                     self.output_file_path,
                                     mode=KgtkWriter.Mode.EDGE,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    rw: typing.Optional[typing.TextIO] = None
    reject_to_stdout: bool = False
    try:
        if self.reject_file_path is not None:
            if self.verbose:
                print("Opening reject file %s" % str(self.reject_file_path),
                      file=self.error_file, flush=True)
            # Open the reject file.  Since the input data is not in KGTK
            # format, we use an ordinary file here.
            #
            # The str() comparison is remembered so that the close below
            # makes exactly the same decision, whether the path was given
            # as a str or as a Path.
            reject_to_stdout = str(self.reject_file_path) == "-"
            if reject_to_stdout:
                rw = sys.stdout
            else:
                rw = open(self.reject_file_path, "wt")

        total_input_line_count: int = 0
        reject_line_count: int = 0
        namespace_line_count: int = self.get_initial_namespaces()

        input_file_path: str
        for input_file_path in self.input_file_paths:
            input_line_count: int = 0

            if self.local_namespace_use_uuid or self.namespace_id_use_uuid or self.newnode_use_uuid:
                if self.override_uuid is not None:
                    self.local_namespace_uuid = self.override_uuid  # for debugging
                else:
                    # Generate a new local namespace UUID.
                    self.local_namespace_uuid = shortuuid.uuid()

            # Open the input file.
            if self.verbose:
                print("Opening the input file: %s" % input_file_path,
                      file=self.error_file, flush=True)
            from_stdin: bool = str(input_file_path) == "-"
            infile: typing.TextIO
            if from_stdin:
                infile = sys.stdin
            else:
                infile = open(input_file_path, 'rt')

            try:
                line: str
                for line in infile:
                    input_line_count += 1
                    total_input_line_count += 1

                    row: typing.List[str]
                    valid: bool
                    row, valid = self.parse(line, input_line_count)
                    if not valid:
                        # Rejections are counted even without a reject file.
                        if rw is not None:
                            rw.write(line)
                        reject_line_count += 1
                        continue

                    node1: str
                    ok_1: bool
                    node1, ok_1 = self.convert_and_validate(
                        row[0], input_line_count, ew)

                    label: str
                    ok_2: bool
                    label, ok_2 = self.convert_and_validate(
                        row[1], input_line_count, ew)

                    node2: str
                    ok_3: bool
                    node2, ok_3 = self.convert_and_validate(
                        row[2], input_line_count, ew)

                    if ok_1 and ok_2 and ok_3:
                        self.write_row(ew, node1, label, node2)
                    else:
                        if rw is not None:
                            rw.write(line)
                        reject_line_count += 1
            finally:
                # Never close stdin; close real files even on failure.
                if not from_stdin:
                    infile.close()

        self.save_namespaces(ew)

        if self.verbose:
            print("Processed %d known namespaces." % (namespace_line_count),
                  file=self.error_file, flush=True)
            print("Processed %d records." % (total_input_line_count),
                  file=self.error_file, flush=True)
            print("Rejected %d records." % (reject_line_count),
                  file=self.error_file, flush=True)
            print("Wrote %d records." % (self.output_line_count),
                  file=self.error_file, flush=True)
    finally:
        try:
            ew.close()
        finally:
            # Never close stdout; close a real reject file even on failure.
            if rw is not None and not reject_to_stdout:
                rw.close()
def process(self):
    """Combine exploded field columns back into a single value column.

    The fields to combine are those listed in
    ``KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS`` for each name in
    ``self.type_names`` (excluding the valid and list-length fields).  Each
    field is looked up in the input as the column ``self.prefix + field``
    unless it is named in ``self.without_fields``.  For every input row,
    ``self.implode`` produces the combined value, which is stored in the
    column ``self.column_name`` (appended, or overwritten when
    ``self.overwrite_column`` allows it).

    Rows whose value is invalid go to the reject file when one is
    configured; all other rows go to the output file (when configured),
    optionally after ID building and with prefixed columns removed.

    Raises:
        ValueError: empty column name, unknown data type name, empty field
            selection, data type field not selected, target column exists
            without overwrite permission, missing exploded columns, or ID
            building requested without options.
    """
    if len(self.column_name) == 0:
        raise ValueError("The name of the column to implode is empty.")

    selected_field_names: typing.List[str] = []
    field_name: str

    if self.type_names is not None:
        if self.verbose:
            print("Validate the names of the data types to extract.",
                  file=self.error_file, flush=True)
        type_name: str
        for type_name in self.type_names:
            if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                raise ValueError("Unknown data type name '%s'." % type_name)
            # Merge this KGTK data type's fields into the list of selected fields:
            for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[type_name]:
                if field_name == KgtkValueFields.VALID_FIELD_NAME:
                    continue  # We don't need the valid field.
                if field_name == KgtkValueFields.LIST_LEN_FIELD_NAME:
                    continue  # We don't need the list length field.
                if field_name not in selected_field_names:
                    selected_field_names.append(field_name)

    if len(selected_field_names) == 0:
        raise ValueError("The list of fields to implode is empty.")

    if KgtkValueFields.DATA_TYPE_FIELD_NAME not in selected_field_names:
        raise ValueError(
            "The data type field '%s' has not been selected." %
            KgtkValueFields.DATA_TYPE_FIELD_NAME)

    # Open the input file.
    if self.verbose:
        print("Opening the input file: %s" % self.input_file_path,
              file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    output_column_names = kr.column_names.copy()
    new_column: bool  # True ==> adding the imploded column, False ==> using an existing column
    column_idx: int  # The index of the imploded column (new or old).
    if self.column_name in kr.column_name_map:
        column_idx = kr.column_name_map[self.column_name]
        new_column = False
        if not self.overwrite_column:
            raise ValueError(
                "Imploded column '%s' (idx %d) already exists and overwrite not allowed."
                % (self.column_name, column_idx))
        if self.verbose:
            print("Overwriting existing imploded column '%s' (idx %d)." %
                  (self.column_name, column_idx),
                  file=self.error_file, flush=True)
    else:
        column_idx = len(output_column_names)
        new_column = True
        output_column_names.append(self.column_name)
        if self.verbose:
            print("Imploded column '%s' will be created (idx %d)." %
                  (self.column_name, column_idx),
                  file=self.error_file, flush=True)

    if self.verbose:
        print("Build the map of field names to exploded columns",
              file=self.error_file, flush=True)
    # Maps each selected field name to the index of its exploded column,
    # or to -1 for a field we were told to do without.
    implosion: typing.MutableMapping[str, int] = {}
    missing_columns: typing.List[str] = []
    for field_name in selected_field_names:
        if field_name in self.without_fields:
            if self.verbose:
                print("We can do without field '%s'." % field_name,
                      file=self.error_file, flush=True)
            implosion[field_name] = -1
            continue
        exploded_name: str = self.prefix + field_name
        if self.verbose:
            print("Field '%s' becomes '%s'" % (field_name, exploded_name),
                  file=self.error_file, flush=True)
        # NOTE(review): implosion is keyed by the unprefixed field name,
        # while this test looks up the prefixed column name -- confirm
        # which of the two was intended.
        if exploded_name in implosion:
            # The message previously lacked its format argument and was
            # emitted with a literal '%s'.
            raise ValueError(
                "Field name '%s' is duplicated in the field list." %
                exploded_name)
        if exploded_name in kr.column_names:
            exploded_idx = kr.column_name_map[exploded_name]
            implosion[field_name] = exploded_idx
            if self.verbose:
                print("Field '%s' is in column '%s' (idx=%d)" %
                      (field_name, exploded_name, exploded_idx),
                      file=self.error_file, flush=True)
        else:
            if self.verbose:
                print("Field '%s' exploded column '%s' not found." %
                      (field_name, exploded_name),
                      file=self.error_file, flush=True)
            missing_columns.append(exploded_name)
    if len(missing_columns) > 0:
        raise ValueError("Missing columns: %s" % " ".join(missing_columns))

    data_type_idx = implosion[KgtkValueFields.DATA_TYPE_FIELD_NAME]

    # If requested, create the ID column builder.
    # Assemble the list of output column names.
    idb: typing.Optional[KgtkIdBuilder] = None
    if self.build_id:
        if self.idbuilder_options is None:
            raise ValueError(
                "ID build requested but ID builder options are missing")
        idb = KgtkIdBuilder.from_column_names(output_column_names,
                                              self.idbuilder_options)
        id_output_column_names = idb.column_names.copy()
    else:
        id_output_column_names = output_column_names.copy()

    trimmed_output_column_names: typing.List[str]
    if self.remove_prefixed_columns and len(self.prefix) > 0:
        trimmed_output_column_names = []
        if self.verbose:
            print("Removing columns with names that start with '%s'." %
                  self.prefix,
                  file=self.error_file, flush=True)
        column_name: str
        for column_name in id_output_column_names:
            if column_name.startswith(self.prefix):
                if self.verbose:
                    print("Removing column '%s." % column_name,
                          file=self.error_file, flush=True)
            else:
                trimmed_output_column_names.append(column_name)
    else:
        trimmed_output_column_names = id_output_column_names

    shuffle_list: typing.List[int] = []  # Easier to init than deal with typing.Optional.
    ew: typing.Optional[KgtkWriter] = None
    if self.output_file_path is not None:
        if self.verbose:
            print("Opening output file %s" % str(self.output_file_path),
                  file=self.error_file, flush=True)
        # Open the output file.
        ew = KgtkWriter.open(trimmed_output_column_names,
                             self.output_file_path,
                             mode=kr.mode,
                             require_all_columns=False,
                             prohibit_extra_columns=True,
                             fill_missing_columns=False,
                             gzip_in_parallel=False,
                             verbose=self.verbose,
                             very_verbose=self.very_verbose)
        # Rows are built with the untrimmed column list; the shuffle list
        # is passed to every write so the writer can place the values in
        # its own (possibly trimmed) columns.
        shuffle_list = ew.build_shuffle_list(id_output_column_names)

    rw: typing.Optional[KgtkWriter] = None
    if self.reject_file_path is not None:
        if self.verbose:
            print("Opening reject file %s" % str(self.reject_file_path),
                  file=self.error_file, flush=True)
        # Open the reject file.
        rw = KgtkWriter.open(kr.column_names,
                             self.reject_file_path,
                             mode=kr.mode,
                             require_all_columns=False,
                             prohibit_extra_columns=True,
                             fill_missing_columns=False,
                             gzip_in_parallel=False,
                             verbose=self.verbose,
                             very_verbose=self.very_verbose)

    if self.verbose:
        print("Imploding records from %s" % self.input_file_path,
              file=self.error_file, flush=True)
    input_line_count: int = 0
    imploded_value_count: int = 0
    invalid_value_count: int = 0

    existing_column_idx: int = -1 if new_column else column_idx

    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        value: str
        valid: bool
        value, valid = self.implode(input_line_count, row, implosion,
                                    data_type_idx, existing_column_idx)
        if valid:
            imploded_value_count += 1
        else:
            invalid_value_count += 1

        if rw is not None and not valid:
            # Reject the row before implosion.
            rw.write(row)
        elif ew is not None:
            # Without a reject file, invalid rows are still written here.
            output_row: typing.List[str] = row.copy()
            if new_column:
                output_row.append(value)
            else:
                output_row[column_idx] = value
            if idb is not None:
                output_row = idb.build(output_row, input_line_count)
            ew.write(output_row, shuffle_list=shuffle_list)

    if self.verbose:
        print(
            "Processed %d records, imploded %d values, %d invalid values."
            % (input_line_count, imploded_value_count, invalid_value_count),
            file=self.error_file, flush=True)

    if ew is not None:
        ew.close()

    if rw is not None:
        rw.close()
def process(self):
    """Read the input file, key each row, and hand the rows to process_row.

    The key always starts with the required columns of the input file
    (node1, label, node2, plus id unless ``self.compact_id``, for an edge
    file; id for a node file) and is extended with any additional columns
    named in ``self.key_column_names``.

    Rows are passed to ``self.process_row`` either in input order
    (``self.sorted_input``, optionally verifying that the keys are
    monotonic) or grouped in memory and visited in sorted key order.
    A final ``process_row(..., flush=True)`` call ends the stream.

    Raises:
        ValueError: ID building requested without options, an unknown key
            column name, or a sort violation.
    """
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                     error_file=self.error_file,
                                     options=self.reader_options,
                                     value_options=self.value_options,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
                                     )

    # If requested, create the ID column builder.
    # Assemble the list of output column names.
    output_column_names: typing.List[str]
    idb: typing.Optional[KgtkIdBuilder] = None
    if self.build_id:
        if self.idbuilder_options is None:
            raise ValueError("ID build requested but ID builder options are missing")
        idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
        output_column_names = idb.column_names
    else:
        output_column_names = kr.column_names

    # Build the list of key column indexes:
    key_idx_list: typing.List[int] = []
    if kr.is_edge_file:
        # Add the KGTK edge file required columns.
        key_idx_list.append(kr.node1_column_idx)
        key_idx_list.append(kr.label_column_idx)
        key_idx_list.append(kr.node2_column_idx)
        if not self.compact_id and kr.id_column_idx >= 0:
            key_idx_list.append(kr.id_column_idx)

    elif kr.is_node_file:
        # Add the KGTK node file required column:
        key_idx_list.append(kr.id_column_idx)

    # Append additional columns to the list of key column indexes,
    # silently removing duplicates, but complaining about unknown names.
    #
    # TODO: warn about duplicates?
    column_name: str
    for column_name in self.key_column_names:
        if column_name not in kr.column_name_map:
            raise ValueError("Column %s is not in the input file" % (column_name))
        key_idx: int = kr.column_name_map[column_name]
        if key_idx not in key_idx_list:
            key_idx_list.append(key_idx)

    if self.verbose:
        # Diagnostics go to the error file like every other message here;
        # printing to stdout could interleave them with KGTK data when the
        # output file is stdout.
        print("key indexes: %s" % " ".join(str(idx) for idx in key_idx_list),
              file=self.error_file, flush=True)

    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                     self.output_file_path,
                                     mode=kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    input_line_count: int = 0
    row: typing.List[str] = []
    input_key: str
    prev_input_key: typing.Optional[str] = None
    going_up: typing.Optional[bool] = None  # None until the direction is known
    if self.sorted_input:
        if self.verbose:
            print("Reading the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if self.verify_sort:
                if prev_input_key is None:
                    prev_input_key = input_key
                else:
                    if going_up is None:
                        # The first change of key fixes the sort direction.
                        if prev_input_key < input_key:
                            going_up = True
                            prev_input_key = input_key
                        elif prev_input_key > input_key:
                            going_up = False
                            prev_input_key = input_key
                        else:
                            pass  # No change in input key
                    elif going_up:
                        if prev_input_key < input_key:
                            prev_input_key = input_key
                        elif prev_input_key > input_key:
                            raise ValueError("Line %d sort violation going up: prev='%s' curr='%s'" % (input_line_count, prev_input_key, input_key))
                        else:
                            pass  # No change in input_key
                    else:
                        if prev_input_key > input_key:
                            prev_input_key = input_key
                        elif prev_input_key < input_key:
                            raise ValueError("Line %d sort violation going down: prev='%s' curr='%s'" % (input_line_count, prev_input_key, input_key))
                        else:
                            pass  # No change in input_key

            self.process_row(input_key, row, input_line_count, idb, ew)

    else:
        if self.verbose:
            print("Sorting the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        # Map key values to lists of input and output data.
        input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}

        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if input_key in input_map:
                # Append the row to an existing list for that key.
                input_map[input_key].append(row)
            else:
                # Create a new list of rows for this key.
                input_map[input_key] = [row]

        if self.verbose:
            print("Processing the sorted input data",
                  file=self.error_file, flush=True)

        # Note: every row is reported with the total line count here, not
        # with its own input line number.
        for input_key in sorted(input_map.keys()):
            for row in input_map[input_key]:
                self.process_row(input_key, row, input_line_count, idb, ew)

    # Flush the final row, if any.  We pass the last row read for
    # feedback, such as an ID uniqueness violation.
    self.process_row("", row, input_line_count, idb, ew, flush=True)

    if self.verbose:
        print("Read %d records, wrote %d records." % (input_line_count, self.output_line_count),
              file=self.error_file, flush=True)

    ew.close()
def run( input_file: KGTKFiles, output_file: KGTKFiles, new_edges_file: KGTKFiles, base_columns: typing.Optional[typing.List[str]] = None, columns_to_lower: typing.Optional[typing.List[str]] = None, label_values: typing.Optional[typing.List[str]] = None, lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR, ignore_empty_node1: bool = False, ignore_empty_node2: bool = False, add_id: bool = False, lower: bool = False, normalize: bool = False, deduplicate_new_edges: bool = True, errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. ) -> int: # import modules locally from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions from kgtk.value.kgtkvalue import KgtkValue from kgtk.value.kgtkvalueoptions import KgtkValueOptions input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) new_edges_kgtk_file: typing.Optional[ Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file, who="Label file") # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict( kwargs) reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) # Show the final option structures for debugging and documentation. 
if show_options: print("--input-file=%s" % str(input_kgtk_file), file=error_file) print("--output-file=%s" % str(output_kgtk_file), file=error_file) if new_edges_kgtk_file is not None: print("--label-file=%s" % str(new_edges_kgtk_file), file=error_file) if base_columns is not None: print("--base-columns %s" % " ".join(base_columns), file=error_file) if columns_to_lower is not None: print("--columns-to-lower %s" % " ".join(columns_to_lower), file=error_file) if label_values is not None: print("--label-values %s" % " ".join(label_values), file=error_file) print("--lift-separator=%s" % lift_separator, file=error_file) print("--add-id=%s" % add_id, file=error_file) print("--lower=%s" % lower, file=error_file) print("--ignore-empty-node1=%s" % ignore_empty_node1, file=error_file) print("--ignore-empty-node2=%s" % ignore_empty_node2, file=error_file) print("--normalize=%s" % normalize, file=error_file) print("--deduplicate-labels=%s" % deduplicate_new_edges, file=error_file) idbuilder_options.show(out=error_file) reader_options.show(out=error_file) value_options.show(out=error_file) print("=======", file=error_file, flush=True) if not lower and not normalize: raise KGTKException( "One or both of --lower and --normalize must be requested.") try: if verbose: print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_kgtk_file, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) # Map the index of a column being removed to the index of the base column that supplies its node1 value. 
lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict() node1_column_name: str = kr.get_node1_column_actual_name() label_column_name: str = kr.get_label_column_actual_name() node2_column_name: str = kr.get_node2_column_actual_name() id_column_name: str = kr.get_id_column_actual_name() key_column_names: typing.List[str] = list() key_column_idxs: typing.Set[int] = set() if node1_column_name != "": if verbose: print("Node1 column name: %s" % node1_column_name, file=error_file, flush=True) key_column_names.append(node1_column_name) key_column_idxs.add(kr.node1_column_idx) if label_column_name != "": if verbose: print("Label column name: %s" % label_column_name, file=error_file, flush=True) key_column_names.append(label_column_name) key_column_idxs.add(kr.label_column_idx) if node2_column_name != "": if verbose: print("Node2 column name: %s" % node2_column_name, file=error_file, flush=True) key_column_names.append(node2_column_name) key_column_idxs.add(kr.node2_column_idx) if id_column_name != "": if verbose: print("Id column name: %s" % id_column_name, file=error_file, flush=True) key_column_names.append(id_column_name) key_column_idxs.add(kr.id_column_idx) elif normalize: raise KGTKException( "--normalize was requested but the ID column was not found.") base_name: str new_label_value: str column_name: str idx: int # There are three option patterns. if columns_to_lower is not None and len( columns_to_lower) > 0 and base_columns is not None and len( base_columns) > 0: # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower) # column_names and base_columns are paired. New records use label_values if specified. if len(columns_to_lower) != len(base_columns): raise KGTKException( "There are %d columns to lower but only %d base columns." 
% (len(columns_to_lower), len(base_columns))) if label_values is not None and len(label_values) > 0 and len( label_values) != len(columns_to_lower): raise KGTKException( "There are %d columns to lower but only %d label values." % (len(columns_to_lower), len(label_values))) for idx, column_name in enumerate(columns_to_lower): base_name = base_columns[idx] if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." % repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." % repr(column_name)) if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is unknown" % (repr(column_name), repr(base_name))) if normalize and base_name == id_column_name: lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], column_name) else: if not lower: raise KGTKException( "--lower is not enabled for column %s, base name %s" % (repr(column_name), repr(base_name))) if label_values is not None and len( label_values) > 0 and len(label_values[idx]) > 0: lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], label_values[idx]) else: lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], column_name) elif columns_to_lower is not None and len(columns_to_lower) > 0 and ( base_columns is None or len(base_columns) == 0): # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0 # Each column name is split at the lift separator to determine the base name and label value. if len(lift_separator) == 0: raise KGTKException("The --lift-separator must not be empty.") for idx, column_name in enumerate(columns_to_lower): if column_name not in kr.column_names: raise KGTKException( "Column %s is an unknown column, cannot remove it." % repr(column_name)) if column_name in key_column_names: raise KGTKException( "Column %s is a key column, cannot remove it." 
% repr(column_name)) if lower and lift_separator in column_name: base_name, new_label_value = column_name.split( lift_separator, 1) if base_name not in kr.column_names: raise KGTKException( "For column name %s, base name %s is not known" % (repr(column_name), repr(base_name))) elif normalize: base_name = id_column_name new_label_value = column_name else: raise KGTKException( "Unable to parse column name %s, no separator (%s)." % (repr(column_name), repr(lift_separator))) lower_map[kr.column_name_map[column_name]] = ( kr.column_name_map[base_name], new_label_value) elif columns_to_lower is None or len(columns_to_lower) == 0: # Pattern 3: len(columns_to_lower) == 0. # Any column that matches a lift pattern against one of the # key columns (node1, label, node2, id, or their aliases) # will be lowered. if len(lift_separator) == 0: raise KGTKException("The --lift-separator must not be empty.") if base_columns is None or len(base_columns) == 0: # The base name list wasn't supplied. Use [node1, label, node2, id] base_columns = list(key_column_names) if verbose: print("Using the default base columns: %s" % " ".join(base_columns), file=error_file, flush=True) else: if verbose: print("Using these base columns: %s" % " ".join(base_columns), file=error_file, flush=True) for idx, column_name in enumerate(kr.column_names): # Skip the node1, label, node12, and id columns if idx in key_column_idxs: if verbose: print("column %s is a key column, skipping." % repr(column_name), file=error_file, flush=True) continue # Does this column match a lifting pattern? if lower and lift_separator in column_name: base_name, new_label_value = column_name.split( lift_separator, 1) if base_name not in base_columns: if verbose: print( "Column %s contains base name %s, which is not a base column." 
% (repr(column_name), repr(base_name)), file=error_file, flush=True) continue elif normalize: base_name = id_column_name new_label_value = column_name else: if verbose: print( "Column %s does not contain the separator %s and not normalizing, skipping." % (repr(column_name), repr(lift_separator)), file=error_file, flush=True) continue # This test should be redundant. if base_name in kr.column_names: lower_map[idx] = (kr.column_name_map[base_name], new_label_value) else: raise KGTKException( "Base name %s was unexpectedly not found." % repr(base_name)) if len(lower_map) == 0: raise KGTKException("There are no columns to lower or normalize.") if verbose: print("The following columns will be lowered or normalized", file=error_file, flush=True) for idx in sorted(lower_map.keys()): column_name = kr.column_names[idx] base_idx, new_label_value = lower_map[idx] base_name = kr.column_names[base_idx] print(" %s from %s (label %s)" % (column_name, base_name, repr(new_label_value)), file=error_file, flush=True) output_column_names: typing.List[str] = list() for idx, column_name in enumerate(kr.column_names): if idx not in lower_map: output_column_names.append(column_name) # Create the ID builder. 
idb: typing.Optional[KgtkIdBuilder] = None if add_id: idb = KgtkIdBuilder.from_column_names(output_column_names, idbuilder_options) output_column_names = idb.column_names.copy() if verbose: print("The output columns are: %s" % " ".join(output_column_names), file=error_file, flush=True) if verbose: print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True) kw: KgtkWriter = KgtkWriter.open( output_column_names, output_kgtk_file, mode=KgtkWriter.Mode.EDGE, require_all_columns=False, # Simplifies writing the labels verbose=verbose, very_verbose=very_verbose) shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names) lkw: typing.Optional[KgtkWriter] = None if new_edges_kgtk_file is not None: if verbose: print("Opening the label output file: %s" % str(new_edges_kgtk_file), file=error_file, flush=True) label_column_names = [ node1_column_name, label_column_name, node2_column_name ] lkw = KgtkWriter.open(label_column_names, new_edges_kgtk_file, mode=KgtkWriter.Mode.EDGE, verbose=verbose, very_verbose=very_verbose) # Optionally deduplicate the labels # set(node1_value + KgtkFormat.SEPARATOR + node2_value) label_set: typing.Set[str] = set() label_key: str input_line_count: int = 0 output_line_count: int = 0 label_line_count: int = 0 row: typing.List[str] for row in kr: input_line_count += 1 output_row: typing.List[str] = kw.shuffle( row, shuffle_list=shuffle_list) kw.write(output_row) output_line_count += 1 id_seq_num: int = 0 column_idx: int for column_idx in lower_map.keys(): node1_idx: int node1_idx, new_label_value = lower_map[column_idx] node1_value: str node1_value = row[node1_idx] if len(node1_value) == 0: if ignore_empty_node1: continue # TODO: raise an exception else: raise KGTKException( "Empty node1 value when lowering %d to %d: %s in input line %d" % (column_idx, node1_idx, new_label_value, input_line_count)) item: str = row[column_idx] if len(item) == 0: if ignore_empty_node2: continue # Ignore empty node2 values. 
else: raise KGTKException( "Empty node2 value when lowering %d to %d: %s in input line %d" % (column_idx, node1_idx, new_label_value, input_line_count)) # Ths item might be a KGTK list. Let's split it, because # lists aren't allow in the node2 values we'll generate. node2_value: str for node2_value in KgtkValue.split_list(item): if len(node2_value) == 0: if ignore_empty_node2: continue # Ignore empty node2 values in a list. else: raise KGTKException( "Empty node2 value in a list when lowering %d to %d: %s in input line %d" % (column_idx, node1_idx, new_label_value, input_line_count)) if deduplicate_new_edges: label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value if label_key in label_set: continue else: label_set.add(label_key) lowered_input_row: typing.List[str] = [ "" for idx in range(kr.column_count) ] lowered_input_row[kr.node1_column_idx] = node1_value lowered_input_row[kr.label_column_idx] = new_label_value lowered_input_row[kr.node2_column_idx] = node2_value lowered_output_row: typing.List[str] = kw.shuffle( lowered_input_row, shuffle_list=shuffle_list) if idb is not None: id_seq_num += 0 lowered_output_row = idb.build(lowered_output_row, id_seq_num, already_added=True) if lkw is not None: lkw.write(lowered_output_row) label_line_count += 1 else: kw.write(lowered_output_row) label_line_count += 1 output_line_count += 1 if verbose: print("Read %d rows, wrote %d rows with %d labels." % (input_line_count, output_line_count, label_line_count), file=error_file, flush=True) kw.close() if lkw is not None: lkw.close() return 0 except Exception as e: kgtk_exception_auto_handler(e) return 1