def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                  each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
    """Build and write the sentence row for one qnode.

    A qnode is "interesting" only when at least one of its attribute lists
    is nonempty.  Uninteresting qnodes are skipped.

    Args:
        kw: the output writer.
        current_process_node_id: the qnode being processed.
        each_node_attributes: mapping of attribute name to collected values.

    Returns:
        True if a sentence row was written, False if the qnode was skipped.
    """
    # Idiom: any() over the values replaces the original manual
    # flag-and-break loop that scanned for a nonempty attribute list.
    if not each_node_attributes or not any(each_node_attributes.values()):
        return False

    concat_sentence: str
    explanation: str
    concat_sentence, explanation = self.attribute_to_sentence(
        each_node_attributes, current_process_node_id)

    if self.explain:
        kw.write([
            current_process_node_id,
            self.sentence_label,
            KgtkFormat.stringify(concat_sentence),
            KgtkFormat.stringify(explanation)
        ])
    else:
        kw.write([
            current_process_node_id,
            self.sentence_label,
            KgtkFormat.stringify(concat_sentence)
        ])
    return True
def write_row(self, ew: KgtkWriter, node1: str, label: str, node2: str):
    """Write one (node1, label, node2) edge to the writer.

    When an ID builder is configured, it augments the row (using the
    current output line count) before writing.  The output line count is
    advanced in either case.
    """
    row: typing.List[str] = [node1, label, node2]
    if self.idbuilder is not None:
        row = self.idbuilder.build(row, self.output_line_count)
    ew.write(row)
    self.output_line_count += 1
def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
):
    """Inverted single-value object filter: copy rows whose object column
    differs from the (single) filter value to `kw`; matching rows go to the
    optional reject writer `rw`.
    """
    if verbose:
        print("Applying a single object filter inverted", file=error_file, flush=True)

    # The filter set is known to hold exactly one value.
    obj_filter_value: str = list(obj_filter)[0]

    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1
        if row[obj_idx] == obj_filter_value:
            # Matching rows are rejected (the filter is inverted).
            if rw is not None:
                rw.write(row)
            reject_line_count += 1
        else:
            kw.write(row)
            output_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count))
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    """Accumulate consecutive rows sharing a key, emitting the compacted row
    when the key changes or when a flush is requested.

    Note: this code assumes row lengths do not vary.
    """
    # A record is in progress and must be emitted when the key changes
    # (self.current_key != input_key) or on an explicit flush.
    if self.current_key is not None and (flush or self.current_key != input_key):
        self.compact_row()
        if self.current_row is not None:
            pending = self.current_row if idb is None \
                else idb.build(self.current_row, line_number)
            ew.write(pending)
            self.output_line_count += 1
        self.current_key = None
        self.current_row = None

    if flush:
        # This was a flush request. We're done.
        return

    if self.current_key is None:
        # Start accumulating a new key.
        self.current_key = input_key
        self.expand_row(row)
    else:
        # Same key: merge this row into the one being built.
        self.merge_row(row)
def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
):
    """Single-value predicate filter: copy rows whose predicate column equals
    the (single) filter value to `kw`; other rows go to the optional reject
    writer `rw`.
    """
    if verbose:
        print("Applying a single predicate filter", file=error_file, flush=True)

    # The filter set is known to hold exactly one value.
    pred_filter_value: str = list(pred_filter)[0]

    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1
        if row[pred_idx] != pred_filter_value:
            # Non-matching rows are rejected.
            if rw is not None:
                rw.write(row)
            reject_line_count += 1
        else:
            kw.write(row)
            output_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count))
def process_cacheing_filter(self,
                            input_kr: KgtkReader,
                            filter_kr: KgtkReader,
                            input_key_columns: typing.List[int],
                            filter_key_columns: typing.List[int],
                            ew: KgtkWriter):
    """Filter the input stream against the cached key set of the filter file.

    Rows whose key appears in the filter key set are written; when
    self.invert is set, rows whose key does NOT appear are written instead.

    Args:
        input_kr: reader over the input rows.
        filter_kr: reader over the filter rows (consumed to build the key set).
        input_key_columns: column indexes forming the input key.
        filter_key_columns: column indexes forming the filter key.
        ew: the output writer.
    """
    if self.verbose:
        print("Processing by cacheing the filter file's key set..")
        print("Building the filter key set from %s" % self.filter_file_path,
              file=self.error_file, flush=True)

    key_set: typing.Set[str] = self.extract_key_set(
        filter_kr, "filter", filter_key_columns)
    if self.verbose or self.very_verbose:
        print("There are %d entries in the filter key set." % len(key_set),
              file=self.error_file, flush=True)
        if self.very_verbose:
            print("Keys: %s" % " ".join(key_set),
                  file=self.error_file, flush=True)

    if self.verbose:
        print("Filtering records from %s" % self.input_file_path,
              file=self.error_file, flush=True)

    input_line_count: int = 0
    output_line_count: int = 0

    # Resolves the original TODO: the duplicated invert/non-invert loops are
    # merged into one loop with a single membership comparison.
    keep_if_present: bool = not self.invert
    row: typing.List[str]
    for row in input_kr:
        input_line_count += 1
        input_key: str = self.build_key(row, input_key_columns)
        if (input_key in key_set) == keep_if_present:
            ew.write(row)
            output_line_count += 1

    if self.verbose:
        print("Read %d records, wrote %d records." %
              (input_line_count, output_line_count),
              file=self.error_file, flush=True)
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    """Accumulate consecutive rows sharing a key, emitting the compacted row
    when the key changes or when a flush is requested.  Verbose-debug
    variant: traces each decision when self.very_verbose is set.

    Fixes two wrong debug messages: the first print said "No current key" on
    the branch where a current key EXISTS, and the "New current_key" print
    showed the stale self.current_key (None) instead of the incoming key.
    """
    if self.very_verbose:
        print("Input key %s" % repr(input_key), file=self.error_file, flush=True)

    # Note: This code makes the assumption that row lengths do not vary!
    if self.current_key is not None:
        if self.very_verbose:
            # Fixed message: this branch runs when a current key exists.
            print("Have current key %s" % repr(self.current_key),
                  file=self.error_file, flush=True)
        # We have a record being built. Write it?
        if flush or self.current_key != input_key:
            if self.very_verbose:
                if flush:
                    print("flush", file=self.error_file, flush=True)
                else:
                    print("current_key %s != input_key %s" %
                          (repr(self.current_key), repr(input_key)),
                          file=self.error_file, flush=True)
            # self.current_key != input_key means that the key is changing.
            self.compact_row()
            if self.current_row is not None:
                if self.very_verbose:
                    print("writing %s" % repr(self.field_separator.join(self.current_row)),
                          file=self.error_file, flush=True)
                if idb is None:
                    ew.write(self.current_row)
                else:
                    ew.write(idb.build(self.current_row, line_number))
                self.output_line_count += 1
            self.current_key = None
            self.current_row = None

    if flush:
        # This was a flush request. We're done.
        return

    # Are we starting a new key?
    if self.current_key is None:
        # Save the new row.
        if self.very_verbose:
            # Fixed: report the key being adopted, not the stale (None) one.
            print("New current_key %s" % repr(input_key),
                  file=self.error_file, flush=True)
        self.current_key = input_key
        if self.very_verbose:
            print("Expand row %s" % self.field_separator.join(row),
                  file=self.error_file, flush=True)
        self.expand_row(row)
    else:
        # Merge into an existing row.
        if self.very_verbose:
            print("Merge row", file=self.error_file, flush=True)
        self.merge_row(row)
def write_output_row(
    self,
    ew: KgtkWriter,
    row: typing.List[str],
    new_columns: int,
    input_select_column_idx: int,
    label_select_column_idx: int,
    labels: typing.Mapping[str, str],
    lifted_column_idxs: typing.List[int],
    lifted_output_column_idxs: typing.List[int],
) -> bool:
    """Write one (possibly lifted) output row.

    Lifts each column in lifted_column_idxs through the `labels` map into
    the corresponding lifted_output_column_idxs slot, unless the row is a
    stored-label record or is deselected by the input-select column.

    Returns:
        True if the row was written, False if it was suppressed (a label
        record with self.remove_label_records set).
    """
    output_row: typing.List[str] = row.copy()
    if new_columns > 0:
        output_row.extend([""] * new_columns)
    output_select_column_idx: int = input_select_column_idx

    do_write: bool = True
    do_lift: bool = True
    if label_select_column_idx >= 0:
        # Fixed: removed a stray unconditional debug print of
        # label_select_column_idx that was left in this production path.
        if row[label_select_column_idx] == self.label_select_column_value:
            # Don't lift label columns, if we have stored labels in the input records.
            do_lift = False
            if self.remove_label_records:
                do_write = False

    if input_select_column_idx >= 0:
        if self.input_select_column_value is not None and \
           row[input_select_column_idx] != self.input_select_column_value:
            # Not selected for lifting into.
            do_lift = False

    if do_lift:
        # Lift the specified columns in this row.
        did_lift: bool = False
        lifted_column_idx: int
        for idx, lifted_column_idx in enumerate(lifted_column_idxs):
            label_key: str = row[lifted_column_idx]
            if label_key in labels:
                output_row[lifted_output_column_idxs[idx]] = labels[label_key]
                did_lift = True
        # What if we want to note if we lifted all columns?
        if did_lift and output_select_column_idx >= 0 and self.output_select_column_value is not None:
            output_row[output_select_column_idx] = self.output_select_column_value

    if do_write:
        ew.write(output_row)
    return do_write
def pass_group_through(self,
                       kw: KgtkWriter,
                       uninvolvedw: typing.Optional[KgtkWriter],
                       node1_group: typing.List[typing.List[str]],
                       new_id_column: bool):
    """Pass a group of rows through unchanged, except for possibly
    appending an empty ID column.  Unreification was not triggered for
    this group.  Each row is also copied to the optional "uninvolved"
    writer (without the ID column).

    TODO: Perhaps we'd like to build an ID value at the same time?
    """
    for row in node1_group:
        if uninvolvedw is not None:
            uninvolvedw.write(row)
        # Append the empty ID column to a fresh list so the caller's
        # rows are never mutated.
        out_row = row + [""] if new_id_column else row
        kw.write(out_row)
        self.output_line_count += 1
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    """Accumulate consecutive rows sharing a key, writing the compacted
    row when the key changes or when a flush is requested.

    This variant defers building row lists: the first row for a key is
    kept as-is in self.current_row, and self.expand_row() is called (with
    no arguments) only when a second row for the same key actually
    arrives.

    Args:
        input_key: the key built for this row; rows are assumed to
            arrive grouped by this key.
        row: the input row's column values.
        line_number: passed to the optional ID builder when writing.
        idb: optional ID builder that augments the row on output.
        ew: the output writer.
        flush: when True, emit any pending row and return without
            accumulating `row`.
    """
    # Note: This code makes the assumption that row lengths do not vary!
    if self.current_key is not None:
        # We have a record being built. Write it?
        if flush or self.current_key != input_key:
            # self.current_key != input_key means that the key is changing.
            self.compact_row()
            if self.current_row is not None:
                if idb is None:
                    ew.write(self.current_row)
                else:
                    ew.write(idb.build(self.current_row, line_number))
                # NOTE(review): unlike the sibling process_row variants, this
                # one does not increment self.output_line_count here —
                # confirm whether the count is maintained elsewhere or was
                # overlooked.
            self.current_key = None
            self.current_row = None

    if flush:
        # This was a flush request. We're done.
        return

    # Are we starting a new key?
    if self.current_key is None:
        # Save the new row as the current row. If the next row
        # doesn't have the same input key, we'll write this
        # row out with a minimum of handling.
        self.current_key = input_key
        self.current_row = row
        return

    # A second row arrived for the same key: switch to the row-list
    # representation if that hasn't happened yet, then merge the new row.
    if self.current_row_lists is None:
        self.expand_row()
    self.merge_row(row)
def write_output_row(self,
                     ew: KgtkWriter,
                     row: typing.List[str],
                     new_columns: int,
                     label_column_idx: int,
                     labels: typing.Mapping[str, str],
                     lifted_column_idxs: typing.List[int],
                     lifted_output_column_idxs: typing.List[int]):
    """Write one output row, filling the lifted output columns from the
    label map.

    Rows that are themselves label records (the label column equals
    self.label_column_value) are written with padding only — their
    columns are not lifted.
    """
    output_row: typing.List[str] = row.copy()
    if new_columns > 0:
        output_row.extend([""] * new_columns)

    is_label_record: bool = (label_column_idx >= 0
                             and row[label_column_idx] == self.label_column_value)
    if not is_label_record:
        # Lift the specified columns in this row.
        for src_idx, dst_idx in zip(lifted_column_idxs, lifted_output_column_idxs):
            lifted_value: str = row[src_idx]
            if lifted_value in labels:
                output_row[dst_idx] = labels[lifted_value]

    ew.write(output_row)
def process(self, kr: KgtkReader, kw: KgtkWriter):
    """Copy every row from the reader to the writer, passing each through
    self.build with its 1-based line number."""
    for line_number, row in enumerate(kr, start=1):
        kw.write(self.build(row, line_number))
def process_cacheing_input_preserving_order(self,
                                            input_kr: KgtkReader,
                                            filter_kr: KgtkReader,
                                            input_key_columns: typing.List[int],
                                            filter_key_columns: typing.List[int],
                                            ew: KgtkWriter):
    """Filter the input against the filter file while preserving the input
    file's record order in the output, at the cost of extra work building
    keys (each cached input row is keyed twice).
    """
    if self.verbose:
        print(
            "Processing by cacheing the input file while preserving record order."
        )

    # Step one: read the input file, cache it, and build the input key set.
    if self.verbose:
        print("Building the input key set from %s" % self.input_file_path,
              file=self.error_file, flush=True)
    input_key_set: typing.Set[str]
    input_cache: typing.List[typing.List[str]]
    input_key_set, input_cache = self.extract_key_set_and_cache(
        input_kr, "input", input_key_columns)
    input_line_count: int = len(input_cache)
    if self.verbose or self.very_verbose:
        print("There are %d rows in the input cache." % input_line_count,
              file=self.error_file, flush=True)
        print("There are %d entries in the input key set." % len(input_key_set),
              file=self.error_file, flush=True)
        if self.very_verbose:
            print("Keys: %s" % " ".join(input_key_set),
                  file=self.error_file, flush=True)

    # Step two: scan the filter file and derive the set of keys to output.
    if self.verbose:
        print("Applying the filter from %s" % self.filter_file_path,
              file=self.error_file, flush=True)
    filter_line_count: int = 0
    output_key_set: typing.Set[str]
    row: typing.List[str]
    if self.invert:
        # Start with every input key and knock out keys seen in the filter.
        output_key_set = input_key_set
        for row in filter_kr:
            filter_line_count += 1
            output_key_set.discard(self.build_key(row, filter_key_columns))
    else:
        # Keep only input keys that the filter file mentions.
        output_key_set = set()
        for row in filter_kr:
            filter_line_count += 1
            filter_key: str = self.build_key(row, filter_key_columns)
            if filter_key in input_key_set:
                output_key_set.add(filter_key)
    if self.verbose:
        print("Read %d rows from the filter file." % filter_line_count,
              file=self.error_file, flush=True)
        print("There are %d entries in the output key set." % len(output_key_set),
              file=self.error_file, flush=True)

    # Step three: replay the cached input rows, writing only those whose
    # key survived the filter.
    output_line_count: int = 0
    for row in input_cache:
        if self.build_key(row, input_key_columns) in output_key_set:
            ew.write(row)
            output_line_count += 1
    if self.verbose:
        print("Wrote %d rows to the output file." % output_line_count,
              file=self.error_file, flush=True)
def process_cacheing_input(self,
                           input_kr: KgtkReader,
                           filter_kr: KgtkReader,
                           input_key_columns: typing.List[int],
                           filter_key_columns: typing.List[int],
                           ew: KgtkWriter):
    """Filter by cacheing the input file, grouping its rows by key.

    The output is written in sorted key order (input order within each
    key) to simplify debugging.
    """
    if self.verbose:
        print("Processing by cacheing the input file.")

    input_line_count: int = 0
    filter_line_count: int = 0
    output_line_count: int = 0

    # Map key values to lists of input and output data.
    inputmap: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}
    outputmap: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}

    # Group the input rows by their key.
    if self.verbose:
        print("Reading the input data from %s" % self.input_file_path,
              file=self.error_file, flush=True)
    row: typing.List[str]
    for row in input_kr:
        input_line_count += 1
        inputmap.setdefault(self.build_key(row, input_key_columns), []).append(row)

    # Apply the filter to choose which key groups appear in the output.
    if self.verbose:
        print("Applying the filter from %s" % self.filter_file_path,
              file=self.error_file, flush=True)
    filter_key: str
    if self.invert:
        # Start from all input groups and remove those the filter names.
        outputmap = inputmap
        for row in filter_kr:
            filter_line_count += 1
            filter_key = self.build_key(row, filter_key_columns)
            if filter_key in outputmap:
                del outputmap[filter_key]
    else:
        # Keep only the groups the filter names.
        for row in filter_kr:
            filter_line_count += 1
            filter_key = self.build_key(row, filter_key_columns)
            if filter_key in inputmap:
                outputmap[filter_key] = inputmap[filter_key]

    if self.verbose:
        print("Writing the output data to %s" % self.output_file_path,
              file=self.error_file, flush=True)
    # To simplify debugging, write the output data in sorted order (keys,
    # then input order).
    for key in sorted(outputmap):
        for row in outputmap[key]:
            ew.write(row)
            output_line_count += 1

    if self.verbose:
        print(
            "Read %d input records, read %d filter records, wrote %d records."
            % (input_line_count, filter_line_count, output_line_count),
            file=self.error_file, flush=True)
def general_filter(kr: KgtkReader,
                   kw: KgtkWriter,
                   rw: typing.Optional[KgtkWriter],
                   subj_idx: int,
                   subj_filter: typing.Set[str],
                   pred_idx: int,
                   pred_filter: typing.Set[str],
                   obj_idx: int,
                   obj_filter: typing.Set[str]):
    """Filter rows on any combination of subject/predicate/object value sets.

    An empty filter set disables the test for that column.  Each applied
    test marks the row `keep` (value in the set) or `reject` (value not in
    the set); the module-level flags `or_pattern` and `invert` (and
    `verbose`/`error_file` for diagnostics) then decide whether the row
    goes to `kw` (output) or to the optional reject writer `rw`.
    """
    if verbose:
        print("Applying a general filter", file=error_file, flush=True)

    # An empty filter set means "don't test that column".
    apply_subj_filter: bool = len(subj_filter) > 0
    apply_pred_filter: bool = len(pred_filter) > 0
    apply_obj_filter: bool = len(obj_filter) > 0

    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0
    # Per-column diagnostics: how often each filter matched or missed.
    subj_filter_keep_count: int = 0
    pred_filter_keep_count: int = 0
    obj_filter_keep_count: int = 0
    subj_filter_reject_count: int = 0
    pred_filter_reject_count: int = 0
    obj_filter_reject_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        # `keep` is set if ANY applied filter matches; `reject` if ANY
        # applied filter misses.  Both can be True for the same row.
        keep: bool = False
        reject: bool = False
        if apply_subj_filter:
            if row[subj_idx] in subj_filter:
                keep = True
                subj_filter_keep_count += 1
            else:
                reject = True
                subj_filter_reject_count += 1

        if apply_pred_filter:
            if row[pred_idx] in pred_filter:
                keep = True
                pred_filter_keep_count += 1
            else:
                reject = True
                pred_filter_reject_count += 1

        if apply_obj_filter:
            if row[obj_idx] in obj_filter:
                keep = True
                obj_filter_keep_count += 1
            else:
                reject = True
                obj_filter_reject_count += 1

        # Precedence note: `^` binds tighter than `not`, so the OR-pattern
        # branch reads as `not (keep ^ invert)`: reject the row when no
        # filter matched (XORed with `invert`).  The AND-pattern branch
        # rejects when any filter missed (again XORed with `invert`).
        if (not keep ^ invert) if or_pattern else (reject ^ invert):
            if rw is not None:
                rw.write(row)
            reject_line_count += 1
        else:
            kw.write(row)
            output_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count))
        print("Keep counts: subject=%d, predicate=%d, object=%d." %
              (subj_filter_keep_count, pred_filter_keep_count,
               obj_filter_keep_count))
        print("Reject counts: subject=%d, predicate=%d, object=%d." %
              (subj_filter_reject_count, pred_filter_reject_count,
               obj_filter_reject_count))