def run(filename, english_only, sort):
    """Write tab-separated assertions as three-column edges on stdout.

    Each input row is indexed positionally: field 2 becomes ``node1``,
    field 1 becomes ``label`` and field 3 becomes ``node2``. A header line
    ``node1<TAB>label<TAB>node2`` is written first.

    Args:
        filename: Path of the tab-separated assertions file to read.
        english_only: When true, only rows whose node1 and node2 both start
            with ``/c/en/`` are emitted.
        sort: Accepted for interface compatibility; not used by this function.

    Any exception (including an ``IndexError`` from a row with too few
    fields) is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv

    def row_to_edge(row):
        return '\t'.join(row) + '\n'

    try:
        out_columns = ['node1', 'label', 'node2']

        # An explicit encoding keeps the result independent of the platform
        # locale (assumes the dump is UTF-8 -- confirm against the data
        # source); newline='' is what the csv module asks for so that it can
        # do its own line-ending handling.
        with open(filename, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(row_to_edge(out_columns))
            for row in reader:
                new_row = [row[2], row[1], row[3]]
                if not english_only or (new_row[0].startswith('/c/en/')
                                        and new_row[2].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(new_row))

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run():
    """Dump WordNet synset relations as tab-separated edges on stdout.

    Walks every synset exposed by ``nltk.corpus.wordnet`` and emits one row
    per hypernym (``/r/IsA``), member holonym and part holonym (both
    ``/r/PartOf``) and substance meronym (``/r/MadeOf``). The header is
    written before WordNet is read; all edges are built in memory and then
    written in that relation order.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    import nltk
    from nltk.corpus import wordnet as wn

    def header_to_edge(row):
        # Header names use ';' where the internal column names use '_'.
        return '\t'.join(r.replace('_', ';') for r in row) + '\n'

    def obtain_wordnet_lemmas(syn):
        # Lemma names use '_' between words; labels use spaces.
        return [lemma.replace('_', ' ') for lemma in syn.lemma_names()]

    def get_wn_data():
        """Return (labels, hypernyms, member holonyms, part holonyms,
        substance meronyms), each keyed by synset name. Relation dicts only
        hold synsets that have at least one related synset."""
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}

        # (destination dict, accessor) pairs, in the order they are queried.
        collectors = (
            (all_hyps, lambda s: s.hypernyms()),
            (all_members, lambda s: s.member_holonyms()),
            (all_parts, lambda s: s.part_holonyms()),
            (all_subs, lambda s: s.substance_meronyms()),
        )

        for syn in list(wn.all_synsets()):
            syn_name = syn.name()
            all_labels[syn_name] = '|'.join(obtain_wordnet_lemmas(syn))
            for destination, related in collectors:
                names = [other.name() for other in related(syn)]
                if names:
                    destination[syn_name] = names

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        """Build one output row per (node1, node2) pair found in ``data``."""
        asks_what_is = rel == '/r/IsA'
        all_rows = []
        for node1, targets in data.items():
            for node2 in targets:
                # The preferred label is the first of the '|'-joined lemmas.
                node1_preflabel = labels[node1].split('|')[0]
                node2_preflabel = labels[node2].split('|')[0]
                sentence = ' '.join(
                    [node1_preflabel, rel_label, node2_preflabel])
                if asks_what_is:
                    question = 'What is %s?' % node1_preflabel
                else:
                    question = '%s %s what?' % (node1_preflabel, rel_label)
                question = question.capitalize()
                all_rows.append([
                    node1, rel, node2, labels[node1], rel_label,
                    labels[node2], "", "WN", "1.0", "", sentence, question
                ])
        return all_rows

    try:
        out_columns = [
            'node1', 'label', 'node2', 'node1_label', 'label_label',
            'node2_label', 'label_dimension', 'source', 'weight', 'creator',
            'sentence', 'question'
        ]
        sys.stdout.write(header_to_edge(out_columns))

        all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()

        # Build every edge list before writing anything beyond the header.
        all_edges = []
        for data, rel, rel_label in (
                (all_hyps, '/r/IsA', 'is a'),
                (all_members, '/r/PartOf', 'is a part of'),
                (all_parts, '/r/PartOf', 'is a part of'),
                (all_subs, '/r/MadeOf', 'is made of'),
        ):
            all_edges.extend(create_edges(data, all_labels, rel, rel_label))

        for edge in all_edges:
            sys.stdout.write('\t'.join(edge) + '\n')

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of a KGTK file by subject, predicate and object values.

    ``pattern`` must contain exactly three sections separated by semicolons
    (``subjects;predicates;objects``). Each section is a comma-separated list
    of accepted values; an empty section disables that filter.

    Without ``or_pattern`` a row is rejected when any active filter fails to
    match; with ``or_pattern`` a row is rejected when no active filter
    matches. ``invert`` flips the decision. Accepted rows go to the output
    file; rejected rows go to the reject file when one was given and are
    counted either way.

    Args:
        input_file: Input file argument, resolved through KGTKArgumentParser.
        output_file: Output file argument for the accepted rows.
        reject_file: Optional output file argument for the rejected rows.
        pattern: The three-section filter pattern described above.
        subj_col: Optional name of the subject column.
        pred_col: Optional name of the predicate column.
        obj_col: Optional name of the object column.
        or_pattern: Combine the active filters with OR instead of AND.
        invert: Invert the keep/reject decision.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not consulted here; stderr is used whenever
            ``errors_to_stdout`` is false.
        show_options: Print the effective options before processing.
        verbose: Print progress messages and final counts.
        very_verbose: Passed through to the reader and the writers.
        **kwargs: Passed to ``KgtkReaderOptions.from_dict`` and
            ``KgtkValueOptions.from_dict``.

    Returns:
        0 on success, 1 after an exception has been handed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject path must come from reject_file; deriving it from
    # output_file would ignore --reject-file and point the reject writer at
    # the main output.
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one pattern section into its set of non-empty, stripped values."""
        filt: typing.Set[str] = set()
        pattern = pattern.strip()
        if len(pattern) == 0:
            return filt

        target: str
        for target in pattern.split(","):
            target = target.strip()
            if len(target) > 0:
                filt.add(target)

        return filt

    try:
        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])
        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        if verbose and not (apply_subj_filter or apply_pred_filter
                            or apply_obj_filter):
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and len(subj_filter) > 0:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and len(pred_filter) > 0:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and len(obj_filter) > 0:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # keep: at least one active filter matched.
            # reject: at least one active filter failed to match.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # OR mode rejects when nothing matched; AND mode rejects when
            # anything failed. `invert` flips either decision. The explicit
            # parentheses spell out how `not keep ^ invert` already parses.
            if (not (keep ^ invert)) if or_pattern else (reject ^ invert):
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            # Diagnostics go to error_file like every other message, so that
            # they cannot be mixed into data written to stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_file: KGTKFiles, english_only):
    """Convert tab-separated ConceptNet-style assertions to labelled edges on stdout.

    Input rows are indexed positionally: field 1 is the relation, field 2
    the subject, field 3 the object and field 4 a JSON metadata object. For
    each row the output holds the three identifiers, labels derived from
    them, an empty relation dimension, the source ``CN`` and, when the
    metadata has a ``surfaceText`` entry, that text with backslashes removed.

    Args:
        input_file: Input file argument, resolved through KGTKArgumentParser.
        english_only: When true, only rows whose subject and object both
            start with ``/c/en/`` are emitted.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from kgtk.kgtkformat import KgtkFormat

    def header_to_edge(row):
        # Header names use ';' where the internal column names use '_'.
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def make_node_label(node):
        # The label is the fourth '/'-separated piece of the node id
        # (e.g. the term in '/c/en/term'), with '_' turned into spaces.
        return KgtkFormat.stringify(node.strip().split('/')[3].replace(
            '_', ' '))

    def split_camel_case(name):
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        # Uses the last path component of the relation, split at capitals.
        return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))

    def row_to_edge(row, cols):
        edge = {}
        edge['node1'] = row[2]
        edge['relation'] = row[1]
        edge['node2'] = row[3]
        edge['node1_label'] = make_node_label(row[2])
        edge['node2_label'] = make_node_label(row[3])
        edge['relation_label'] = make_rel_label(row[1])
        edge['relation_dimension'] = ''

        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        if 'surfaceText' in metadata:
            edge['sentence'] = KgtkFormat.stringify(
                metadata['surfaceText'].replace('\\', ''))
        else:
            edge['sentence'] = ''

        edge_list = [edge[col] for col in cols]
        return '\t'.join(edge_list) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1_label', 'node2_label',
            'relation_label', 'relation_dimension', 'source', 'sentence'
        ]

        # An explicit encoding keeps the result independent of the platform
        # locale (assumes the dump is UTF-8 -- confirm against the data
        # source); newline='' is what the csv module asks for so that it can
        # do its own line-ending handling.
        with open(filename, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(header_to_edge(out_columns))
            for row in reader:
                if not english_only or (row[2].startswith('/c/en/')
                                        and row[3].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(row, out_columns))

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Remove selected columns from a KGTK file and re-express them as edges.

    For every removed column a (base column, label) pair is determined. Each
    input row is copied to the output without the removed columns, and every
    non-empty list item in a removed cell becomes a new edge
    ``(row[base column], label, item)``. New edges go to ``new_edges_file``
    when it is given, otherwise to the main output.

    How the pairs are chosen:
      1. ``columns_to_lower`` and ``base_columns`` both given: they are
         paired by position. The label is ``label_value``, or the column
         name itself when normalizing against the id column.
      2. Only ``columns_to_lower`` given: with ``lower``, a name containing
         ``lift_separator`` is split into base name and label; otherwise,
         with ``normalize``, the base is the id column and the label is the
         column name.
      3. ``columns_to_lower`` empty: every non-key column is examined with
         the rule of case 2, restricted to ``base_columns`` (default: the
         key columns that were found).

    Raises:
        KGTKException: when neither ``lower`` nor ``normalize`` is set (this
            check happens before the handled section).

    Returns:
        0 on success, 1 after an exception has been handed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Messages go to stderr unless stdout was explicitly requested.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def option_line(text: str) -> None:
        """Print one line of the --show-options report."""
        print(text, file=error_file)

    def report(text: str) -> None:
        """Print a progress message when --verbose is on."""
        if verbose:
            print(text, file=error_file, flush=True)

    if show_options:
        option_line("--input-file=%s" % str(input_kgtk_file))
        option_line("--output-file=%s" % str(output_kgtk_file))
        if new_edges_kgtk_file is not None:
            option_line("--label-file=%s" % str(new_edges_kgtk_file))
        if base_columns is not None:
            option_line("--base-columns=%s" % " ".join(base_columns))
        if columns_to_lower is not None:
            option_line("--columns-to-lower=%s" % " ".join(columns_to_lower))
        option_line("--label-value=%s" % label_value)
        option_line("--lift-separator=%s" % lift_separator)
        option_line("--lower=%s" % lower)
        option_line("--normalize=%s" % normalize)
        option_line("--deduplicate-labels=%s" % deduplicate_new_edges)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not (lower or normalize):
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        report("Opening the input file: %s" % str(input_kgtk_file))
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Index of a column being removed -> (index of the column supplying
        # node1, label of the generated edge).
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        # Collect the key columns that are actually present. The index is
        # fetched lazily so it is only read for columns that exist.
        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()
        for announcement, key_name, fetch_idx in (
                ("Node1 column name: %s", node1_column_name,
                 lambda: kr.node1_column_idx),
                ("Label column name: %s", label_column_name,
                 lambda: kr.label_column_idx),
                ("Node2 column name: %s", node2_column_name,
                 lambda: kr.node2_column_idx),
                ("Id column name: %s", id_column_name,
                 lambda: kr.id_column_idx),
        ):
            if key_name != "":
                report(announcement % key_name)
                key_column_names.append(key_name)
                key_column_idxs.add(fetch_idx())
        if id_column_name == "" and normalize:
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        def require_removable(name: str) -> None:
            """Reject columns that do not exist or that are key columns."""
            if name not in kr.column_names:
                raise KGTKException(
                    "Column %s is an unknown column, cannot remove it." %
                    repr(name))
            if name in key_column_names:
                raise KGTKException(
                    "Column %s is a key column, cannot remove it." %
                    repr(name))

        base_name: str
        new_label_value: str
        column_name: str
        idx: int

        have_targets: bool = columns_to_lower is not None and len(
            columns_to_lower) > 0
        have_bases: bool = base_columns is not None and len(base_columns) > 0

        if have_targets and have_bases:
            # Case 1: explicit columns paired by position with explicit bases.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_lower), len(base_columns)))

            if len(label_value) == 0:
                raise KGTKException("The --label-value must not be empty.")

            for column_name, base_name in zip(columns_to_lower, base_columns):
                require_removable(column_name)
                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))
                if normalize and base_name == id_column_name:
                    # Normalizing against the id: the column name is the label.
                    new_label_value = column_name
                elif lower:
                    new_label_value = label_value
                else:
                    raise KGTKException(
                        "--lower is not enabled for column %s, base name %s"
                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif have_targets:
            # Case 2: explicit columns, base and label derived from each name.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for column_name in columns_to_lower:
                require_removable(column_name)
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))
                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name
                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)."
                        % (repr(column_name), repr(lift_separator)))
                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        else:
            # Case 3: no explicit columns; examine every non-key column.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if have_bases:
                report("Using these base columns: %s" %
                       " ".join(base_columns))
            else:
                # No base list supplied: fall back to the key columns found.
                base_columns = list(key_column_names)
                report("Using the default base columns: %s" %
                       " ".join(base_columns))

            for idx, column_name in enumerate(kr.column_names):
                if idx in key_column_idxs:
                    report("column %s is a key column, skipping." %
                           repr(column_name))
                    continue

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in base_columns:
                        report(
                            "Column %s contains base name %s, which is not a base column."
                            % (repr(column_name), repr(base_name)))
                        continue
                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name
                else:
                    report(
                        "Column %s does not contain the separator %s and not normalizing, skipping."
                        % (repr(column_name), repr(lift_separator)))
                    continue

                # Defensive: the base name is expected to exist at this point.
                if base_name not in kr.column_names:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))
                lower_map[idx] = (kr.column_name_map[base_name],
                                  new_label_value)

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                base_idx, new_label_value = lower_map[idx]
                print(" %s from %s (label %s)" %
                      (kr.column_names[idx], kr.column_names[base_idx],
                       repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being removed.
        output_column_names: typing.List[str] = [
            column_name for idx, column_name in enumerate(kr.column_names)
            if idx not in lower_map
        ]
        report("The output columns are: %s" % " ".join(output_column_names))

        report("Opening the output file: %s" % str(output_kgtk_file))
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            report("Opening the label output file: %s" %
                   str(new_edges_kgtk_file))
            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Keys of the edges already emitted, used only when deduplicating:
        # node1, label and node2 joined by KgtkFormat.KEY_FIELD_SEPARATOR.
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            kw.write(row, shuffle_list=shuffle_list)
            output_line_count += 1

            column_idx: int
            node1_idx: int
            for column_idx, (node1_idx, new_label_value) in lower_map.items():
                node1_value: str = row[node1_idx]
                if not node1_value:
                    continue  # TODO: raise an exception

                item: str = row[column_idx]
                if not item:
                    continue  # Nothing to emit for an empty cell.

                # The cell may hold a KGTK list; each generated edge takes a
                # single value as node2, so the list is split first.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if not node2_value:
                        continue  # Ignore empty list items.

                    if deduplicate_new_edges:
                        label_key = KgtkFormat.KEY_FIELD_SEPARATOR.join(
                            (node1_value, new_label_value, node2_value))
                        if label_key in label_set:
                            continue
                        label_set.add(label_key)

                    output_map: typing.Mapping[str, str] = {
                        node1_column_name: node1_value,
                        label_column_name: new_label_value,
                        node2_column_name: node2_value,
                    }
                    label_line_count += 1
                    if lkw is None:
                        # No separate file: new edges share the main output.
                        kw.writemap(output_map)
                        output_line_count += 1
                    else:
                        lkw.writemap(output_map)

        report("Read %d rows, wrote %d rows with %d labels." %
               (input_line_count, output_line_count, label_line_count))

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles):
    """Convert a scene-graph JSON file into tab-separated edges on stdout.

    For every image, edges are produced between the synsets listed for its
    objects:
      * object synset -> attribute synset, as ``/r/CapableOf`` when the
        second dot-separated part of the attribute synset is ``v``, as
        ``/r/HasProperty`` otherwise; attributes whose part is ``n`` are
        skipped;
      * subject synset -> object synset for each relationship, as
        ``/r/LocatedNear`` labelled with the relationship predicate.
    Edges between identical synsets are skipped. Identical rows are emitted
    once per image, in first-seen order, after the whole image has been
    processed.

    Args:
        input_file: Scene-graph JSON file argument (a list of images with
            ``image_id``, ``objects`` and ``relationships``).
        attr_syn_file: JSON file argument mapping attribute text to a synset.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict

    out_columns = [
        'node1', 'label', 'node2', 'node1_label', 'label_label', 'node2_label',
        'label_dimension', 'source', 'weight', 'creator', 'sentence',
        'question'
    ]

    proximity_relation = '/r/LocatedNear'
    property_relation = '/r/HasProperty'
    property_relation_label = 'has property'
    capableof_relation = '/r/CapableOf'
    capableof_relation_label = 'capable of'

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        my_row = node1, rel, node2, '|'.join(node1_lbl), rel_lbl, '|'.join(
            node2_lbl), '', 'VG', '1.0', 'I' + image_id, '', ''
        return '\t'.join(my_row) + '\n'

    def header_to_edge(row):
        # Header names use ';' where the internal column names use '_'.
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def add_unique(rows, seen, edge_row):
        # `seen` mirrors `rows` so the duplicate test is a set lookup
        # instead of a scan of every row collected so far for the image.
        if edge_row not in seen:
            seen.add(edge_row)
            rows.append(edge_row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        # JSON interchange text is UTF-8; do not depend on the locale.
        with open(scene_graph_filename, 'r', encoding='utf-8') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r', encoding='utf-8') as f:
            attr_synsets = json.load(f)

        sys.stdout.write(header_to_edge(out_columns))

        for an_image in images_data:
            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            seen = set()
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(name)

                # ATTRIBUTES
                if 'attributes' in o:
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr not in attr_synsets:
                            continue
                        asyn = attr_synsets[attr]
                        apos = asyn.split('.')[1]
                        if apos == 'n':
                            continue
                        if apos == 'v':  # verb
                            relation = capableof_relation
                            relation_lbl = capableof_relation_label
                        else:  # adjective
                            relation = property_relation
                            relation_lbl = property_relation_label
                        for osyn in o_synset:
                            if osyn != asyn:
                                add_unique(
                                    rows, seen,
                                    create_edge(osyn, objid2names[obj_id],
                                                asyn, [attr], relation,
                                                relation_lbl, image_id))

            # RELATIONS
            for rel in an_image['relationships']:
                relation_label = rel['predicate'].lower().strip().strip('.')

                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]

                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            add_unique(
                                rows, seen,
                                create_edge(ssyn, sub_names, osyn, obj_names,
                                            proximity_relation,
                                            relation_label, image_id))

            # Written only once the whole image has been processed, so a
            # failure part-way through an image emits none of its rows.
            for a_row in rows:
                sys.stdout.write(a_row)

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(input_file: KGTKFiles, relation, source, output_file: KGTKFiles):
    """Turn a space-separated file of node pairs into KGTK edges.

    The first two fields of every input row become node1 and node2 of an
    edge with the fixed ``relation``. Node ids are prefixed with the
    lower-cased ``source`` and a colon; node labels are the raw value with
    its first three characters dropped.

    Args:
        input_file: Input file argument, resolved through KGTKArgumentParser.
        relation: Relation identifier used for every edge.
        source: Source name; stored stringified in the ``source`` column and
            used, lower-cased, as the node id prefix.
        output_file: Output file argument for the KGTK writer.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from string import Template
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node_label(node):
        return KgtkFormat.stringify(node[3:])

    def split_camel_case(name):
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        # Uses the last path component of the relation, split at capitals.
        return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))

    def row_to_edge(node1, rel, node2, source, cols):
        edge = {}
        prefix = source.lower()
        edge['node1'] = prefix + ':' + node1
        edge['relation'] = rel
        edge['node2'] = prefix + ':' + node2
        edge['node1;label'] = make_node_label(node1)
        edge['node2;label'] = make_node_label(node2)
        edge['relation;label'] = make_rel_label(rel)
        edge['relation;dimension'] = ''

        edge['source'] = KgtkFormat.stringify(source)
        edge['sentence'] = ''

        edge_list = [edge[col] for col in cols]
        return edge_list

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        # The writer is open from here on: close it even when the input
        # cannot be opened or a row is malformed.
        try:
            with open(filename, 'r') as f:
                reader = csv.reader(f, delimiter=' ', quotechar='"')
                for row in reader:
                    ew.write(
                        row_to_edge(row[0], relation, row[1], source,
                                    out_columns))
        finally:
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(input_file: KGTKFiles, relation, source):
    """Turn a space-separated file of node pairs into edges on stdout.

    A header line is written first. The first two fields of every input row
    then become node1 and node2 of an edge with the fixed ``relation``. Node
    ids are prefixed with the lower-cased ``source`` and a colon; node
    labels are the raw value with its first three characters dropped.

    Any exception is handed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from string import Template
    from kgtk.kgtkformat import KgtkFormat

    def header_to_edge(row):
        # Header names use ';' where the internal column names use '_'.
        return '\t'.join(r.replace('_', ';') for r in row) + '\n'

    def make_node_label(node):
        label_text = node[3:]
        return KgtkFormat.stringify(label_text)

    def split_camel_case(name):
        # First put a space before every run of capitals, then before every
        # capitalized word, and finally normalize the whitespace.
        spaced = re.sub('([A-Z]+)', r' \1', name)
        spaced = re.sub('([A-Z][a-z]+)', r' \1', spaced)
        return ' '.join(spaced.split()).lower()

    def make_rel_label(rel):
        last_component = rel.split('/')[-1]
        return KgtkFormat.stringify(split_camel_case(last_component))

    def row_to_edge(node1, rel, node2, source, cols):
        prefix = source.lower()
        edge = {
            'node1': prefix + ':' + node1,
            'relation': rel,
            'node2': prefix + ':' + node2,
            'node1_label': make_node_label(node1),
            'node2_label': make_node_label(node2),
            'relation_label': make_rel_label(rel),
            'relation_dimension': '',
            'source': KgtkFormat.stringify(source),
            'sentence': '',
        }
        return '\t'.join([edge[col] for col in cols]) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        in_columns = ['assertion', 'rel', 'subj', 'obj', 'metadata']  # not read below

        out_columns = [
            'node1', 'relation', 'node2', 'node1_label', 'node2_label',
            'relation_label', 'relation_dimension', 'source', 'sentence'
        ]

        with open(filename, 'r') as f:
            sys.stdout.write(header_to_edge(out_columns))
            for fields in csv.reader(f, delimiter=' ', quotechar='"'):
                line = row_to_edge(fields[0], relation, fields[1], source,
                                   out_columns)
                sys.stdout.write(line)

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],
        or_pattern: bool,
        invert: bool,
        show_version: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy the rows of a KGTK file that match a subject;predicate;object pattern.

    ``pattern`` has three sections separated by semicolons; each section is
    a comma-separated list of accepted values for the subject, predicate
    and object column respectively.  An empty section places no constraint
    on that column.  Matching rows go to the output file; the others go to
    the reject file when one was requested.

    Args:
        input_file, output_file, reject_file: KGTK file arguments.
        pattern: the "subj;pred;obj" filter specification.
        subj_col, pred_col, obj_col: optional column-name overrides.
        or_pattern: keep a row when ANY active section matches instead of
            requiring that no active section fails.
        invert: swap the keep/reject decision.
        show_version: print the version string to the error stream.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        show_options: print the effective options before running.
        verbose, very_verbose: diagnostic levels, also passed to the
            reader and writers.
        **kwargs: forwarded to the reader and value option builders.

    Returns:
        0 on success, 1 after an exception was handed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    UPDATE_VERSION: str = "2020-08-06T17:06:06.829542+00:00#Mu9vz3KEPh+beQeSwZ8qGMKrTJzHWZFfZFXY6UrYXJAnNpPSin+5NvkSfxKLMkyJtGyeavgGAz8+74bup7eYaQ=="
    if show_version or verbose:
        print("kgtk filter version: %s" % UPDATE_VERSION,
              file=error_file,
              flush=True)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file),
                  file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one comma-separated pattern section into a set of values."""
        filt: typing.Set[str] = set()
        for target in pattern.strip().split(","):
            target = target.strip()
            if len(target) > 0:
                filt.add(target)
        return filt

    def single_value_filter(
            kr: KgtkReader,
            kw: KgtkWriter,
            rw: typing.Optional[KgtkWriter],
            col_idx: int,
            col_filter: typing.Set[str],
            what: str,
            inverted: bool,
    ):
        """Fast path: compare one column against a single value.

        ``what`` is "predicate" or "object" and only affects the verbose
        message.  When ``inverted`` is true, rows that do NOT equal the
        value are kept.
        """
        if verbose:
            print("Applying a single %s filter%s" %
                  (what, " inverted" if inverted else ""),
                  file=error_file,
                  flush=True)

        wanted: str = next(iter(col_filter))
        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            if (row[col_idx] == wanted) != inverted:
                kw.write(row)
                output_line_count += 1
            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Statistics go to the error stream so they cannot mix with
            # KGTK data when the output file is stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)

    def general_filter(kr: KgtkReader, kw: KgtkWriter,
                       rw: typing.Optional[KgtkWriter], subj_idx: int,
                       subj_filter: typing.Set[str], pred_idx: int,
                       pred_filter: typing.Set[str], obj_idx: int,
                       obj_filter: typing.Set[str]):
        """Apply any combination of subject, predicate and object filters."""
        if verbose:
            print("Applying a general filter", file=error_file, flush=True)

        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # keep: at least one active section matched.
            # reject: at least one active section failed.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # "^" binds tighter than "not", so the original expression
            # "not keep ^ invert" already meant "not (keep ^ invert)".
            if or_pattern:
                rejected = not (keep ^ invert)
            else:
                rejected = reject ^ invert

            if rejected:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)

    try:
        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print(
                "Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                file=error_file,
                flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])

        if verbose and len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 0:
            print("Warning: the filter is empty.",
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and len(subj_filter) > 0:
            trouble = True
            print("Error: Cannot find the subject column '%s'." %
                  kr.get_node1_canonical_name(subj_col),
                  file=error_file,
                  flush=True)
        if pred_idx < 0 and len(pred_filter) > 0:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." %
                  kr.get_label_canonical_name(pred_col),
                  file=error_file,
                  flush=True)
        if obj_idx < 0 and len(obj_filter) > 0:
            trouble = True
            print("Error: Cannot find the object column '%s'." %
                  kr.get_node2_canonical_name(obj_col),
                  file=error_file,
                  flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        rw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file),
                      file=error_file,
                      flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        # A lone predicate or object value takes the single-comparison path;
        # every other combination goes through the general filter.
        if len(subj_filter) == 0 and len(pred_filter) == 1 and len(
                obj_filter) == 0:
            single_value_filter(kr, kw, rw, pred_idx, pred_filter,
                                "predicate", invert)
        elif len(subj_filter) == 0 and len(pred_filter) == 0 and len(
                obj_filter) == 1:
            single_value_filter(kr, kw, rw, obj_idx, obj_filter, "object",
                                invert)
        else:
            general_filter(kr, kw, rw, subj_idx, subj_filter, pred_idx,
                           pred_filter, obj_idx, obj_filter)

        kw.close()
        if rw is not None:
            rw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_file: KGTKFiles, english_only, output_file: KGTKFiles,
        weights_file: KGTKFiles):
    """Convert tab-separated ConceptNet assertions into KGTK edges.

    Input rows are read as (assertion, rel, subj, obj, metadata-JSON).
    When ``english_only`` is set, only rows whose subject and object both
    start with ``/c/en/`` are written.  When ``weights_file`` is given,
    each written row whose metadata has a ``weight`` also yields a
    (node1, 'weight', value) row in that second file.  Any exception is
    passed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node_label(node):
        # The label is the fourth '/'-separated piece of the node id,
        # with underscores shown as spaces.
        piece = node.strip().split('/')[3]
        return KgtkFormat.stringify(piece.replace('_', ' '))

    def split_camel_case(name):
        spaced_caps = re.sub('([A-Z]+)', r' \1', name)
        spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_caps)
        return ' '.join(spaced_words.split()).lower()

    def make_rel_label(rel):
        last_segment = rel.split('/')[-1]
        return KgtkFormat.stringify(split_camel_case(last_segment))

    def make_weight_edge(row):
        edge_id = '%s-%s-%s-0000' % (row[2], row[1], row[3])
        weight = json.loads(row[-1])['weight']
        return [edge_id, 'weight', str(weight)]

    def row_to_edge(row, cols):
        edge = {
            'node1': row[2],
            'relation': row[1],
            'node2': row[3],
            'node1;label': make_node_label(row[2]),
            'node2;label': make_node_label(row[3]),
            'relation;label': make_rel_label(row[1]),
            'relation;dimension': '',
        }
        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        edge['sentence'] = (
            KgtkFormat.stringify(metadata['surfaceText'].replace('\\', ''))
            if 'surfaceText' in metadata.keys() else '')
        return [edge[col] for col in cols]

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        in_columns = ['assertion', 'rel', 'subj', 'obj', 'metadata']
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        if weights_file:
            info_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                weights_file)
            ew_aux: KgtkWriter = KgtkWriter.open(
                out_columns[:3],
                info_kgtk_file,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                gzip_in_parallel=False,
            )

        with open(filename, 'r') as f:
            for row in csv.reader(f, delimiter='\t', quotechar='"'):
                is_english = (row[2].startswith('/c/en/')
                              and row[3].startswith('/c/en/')
                              ) if english_only else True
                if not is_english:
                    continue
                ew.write(row_to_edge(row, out_columns))
                if weights_file and 'weight' in json.loads(row[-1]).keys():
                    ew_aux.write(make_weight_edge(row))

        # Clean up
        ew.close()
        if weights_file:
            ew_aux.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        into_file: KGTKFiles,
        enable: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output and, optionally, to a second "tee" file.

    Args:
        input_file, output_file: KGTK file arguments for the main copy.
        into_file: KGTK file argument for the tee copy.
        enable: write the tee copy only when true.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        show_options: print the effective options before running.
        verbose, very_verbose: diagnostic levels, also passed to the
            reader and writers.
        **kwargs: forwarded to the reader and value option builders.

    Returns:
        0 on success, 1 after an exception was handed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    into_kgtk_file: Path = KGTKArgumentParser.get_output_file(
        into_file, who="The tee output file", default_stdout=False)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--to-file=%s" % str(into_kgtk_file), file=error_file)
        print("--enable=%s" % str(enable), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        tkw: typing.Optional[KgtkWriter] = None
        if enable:
            if verbose:
                # Report the tee path itself; the original message printed
                # the main output path here by mistake.
                print("Opening the tee output file: %s" %
                      str(into_kgtk_file),
                      file=error_file,
                      flush=True)
            tkw = KgtkWriter.open(kr.column_names,
                                  into_kgtk_file,
                                  mode=KgtkWriter.Mode[kr.mode.name],
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        input_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            kw.write(row)
            if tkw is not None:
                tkw.write(row)

        if verbose:
            print("Processed %d rows." % (input_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if tkw is not None:
            tkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(output_file: KGTKFiles):
    """Export WordNet relations as KGTK edges.

    Downloads the NLTK "wordnet" corpus, collects hypernym, member-holonym,
    part-holonym and substance-meronym links for every synset, and writes
    them as ``/r/IsA``, ``/r/PartOf``, ``/r/PartOf`` and ``/r/MadeOf``
    edges respectively.  Node ids are the synset names prefixed with
    ``wn:``; node labels are the '|'-joined lemma names of the synset.
    Any exception is passed to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    from kgtk.exceptions import kgtk_exception_auto_handler
    import nltk
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def obtain_wordnet_lemmas(syn):
        """Return the synset's lemma names, underscores as spaces, stringified."""
        return [
            KgtkFormat.stringify(lemma.replace('_', ' '))
            for lemma in syn.lemma_names()
        ]

    def obtain_hypernyms(syn):
        return [hypernym.name() for hypernym in syn.hypernyms()]

    def obtain_member_holonyms(syn):
        return [hol.name() for hol in syn.member_holonyms()]

    def obtain_part_holonyms(syn):
        return [hol.name() for hol in syn.part_holonyms()]

    def obtain_substance_meronyms(syn):
        return [mer.name() for mer in syn.substance_meronyms()]

    def get_wn_data():
        """Return (labels, hypernyms, member holonyms, part holonyms,
        substance meronyms), each a dict keyed by synset name.

        Synsets without any link of a given kind are left out of that
        kind's dict.
        """
        # The synsets are materialised before the loop, exactly as the
        # original did.  NOTE(review): presumably this keeps the relation
        # lookups below from disturbing the corpus iterator - confirm
        # before turning this into lazy iteration.
        syns = list(wn.all_synsets())

        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        for syn in syns:
            syn_name = syn.name()
            all_labels[syn_name] = '|'.join(obtain_wordnet_lemmas(syn))

            hypernyms = obtain_hypernyms(syn)
            if hypernyms:
                all_hyps[syn_name] = hypernyms

            member_holonyms = obtain_member_holonyms(syn)
            if member_holonyms:
                all_members[syn_name] = member_holonyms

            part_holonyms = obtain_part_holonyms(syn)
            if part_holonyms:
                all_parts[syn_name] = part_holonyms

            substance_meronyms = obtain_substance_meronyms(syn)
            if substance_meronyms:
                all_subs[syn_name] = substance_meronyms

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        """Build one output row per (node1, node2) link in ``data``."""
        source = KgtkFormat.stringify('WN')
        all_rows = []
        for node1, targets in data.items():
            for node2 in targets:
                all_rows.append([
                    'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                    labels[node2], rel_label, "", source, ''
                ])
        return all_rows

    try:
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()

        hyp_edges = create_edges(all_hyps, all_labels, '/r/IsA',
                                 KgtkFormat.stringify('is a'))
        member_edges = create_edges(all_members, all_labels, '/r/PartOf',
                                    KgtkFormat.stringify('is a part of'))
        part_edges = create_edges(all_parts, all_labels, '/r/PartOf',
                                  KgtkFormat.stringify('is a part of'))
        sub_edges = create_edges(all_subs, all_labels, '/r/MadeOf',
                                 KgtkFormat.stringify('is made of'))

        for edge in hyp_edges + member_edges + part_edges + sub_edges:
            ew.write(edge)

        # Clean up.
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(input_file: KGTKFiles, english_only):
    """Print ConceptNet assertions as tab-separated edges with questions.

    Input rows are read as (assertion, rel, subj, obj, metadata-JSON).  A
    header line is written first (underscores in the column names are
    shown as semicolons).  A row is written only when its relation label
    has a question template; when ``english_only`` is set, both subject
    and object must also start with ``/c/en/``.  Any exception is passed
    to ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from string import Template

    # Built once per run instead of once per row: relation label -> the
    # question template for that relation.
    question_templates = {
        'antonym':
        Template('What is the opposite from $node1?'),
        'at location':
        Template('At what location is $node1?'),
        'capable of':
        Template('What is $node1 capable of?'),
        'causes':
        Template('What is caused by $node1?'),
        'causes desire':
        Template('What desire is caused by $node1?'),
        'created by':
        Template('What can create $node1?'),
        'defined as':
        Template('How can $node1 be defined?'),
        'derived from':
        Template('From which word is $node1 derived?'),
        'desires':
        Template('What does $node1 desire?'),
        'distinct from':
        Template('What is $node1 distinct from?'),
        'etymologically derived from':
        Template('What is $node1 etymologically derived from?'),
        'symbol of':
        Template('What is $node1 symbol of?'),
        'synonym':
        Template('What is a synonym of $node1?'),
        'manner of':
        Template('$node1 is a manner of what?'),
        'located near':
        Template('What is $node1 located near to?'),
        'has context':
        Template('What is a context of $node1?'),
        'similar to':
        Template('What is $node1 similar to?'),
        'etymologically related to':
        Template("What is $node1 etymologically related to?"),
        'made of':
        Template('What is $node1 made of?'),
        'receives action':
        Template('What can be done to $node1?'),
        'obstructed by':
        Template('What is $node1 obstructed by?'),
        'motivated by goal':
        Template('What goal motivates $node1?'),
        'has property':
        Template('What is a property of $node1?'),
        'has prerequisite':
        Template('What is a prerequisite for $node1?'),
        'has first subevent':
        Template('What is the first subevent of $node1?'),
        'has last subevent':
        Template('What is the last subevent of $node1?'),
        'has subevent':
        Template('What is a subevent of $node1?'),
        'used for':
        Template('What is $node1 used for?'),
        'has a':
        Template('What belongs to $node1?'),
        'is a':
        Template('What is a $node1?'),
        'form of':
        Template('What is $node1 a form of?'),
        'related to':
        Template('What is $node1 related to?')
    }

    def header_to_edge(row):
        return '\t'.join([name.replace('_', ';') for name in row]) + '\n'

    def make_node_label(node):
        # Fourth '/'-separated piece of the node id, underscores as spaces.
        return node.strip().split('/')[3].replace('_', ' ')

    def split_camel_case(name):
        splitted = re.sub('([A-Z][a-z]+)', r' \1',
                          re.sub('([A-Z]+)', r' \1', name)).split()
        return ' '.join(splitted).lower()

    def make_rel_label(rel):
        return split_camel_case(rel.split('/')[-1])

    def get_template(label):
        """Return the question template for ``label``, or None."""
        return question_templates.get(label)

    def row_to_edge(row, cols):
        """Return the output line for ``row``, or '' when the relation has
        no question template."""
        edge = {}
        edge['node1'] = row[2]
        edge['label'] = row[1]
        edge['node2'] = row[3]
        edge['node1_label'] = make_node_label(row[2])
        edge['node2_label'] = make_node_label(row[3])
        edge['label_label'] = make_rel_label(row[1])
        edge['label_dimension'] = ''

        metadata = json.loads(row[4])
        edge['weight'] = str(metadata['weight'])
        edge['source'] = 'CN'
        edge['creator'] = metadata['dataset']
        edge['sentence'] = metadata.get('surfaceText', '') if isinstance(
            metadata, dict) else ''

        # The template check stays last so that malformed rows still fail
        # in the same places as before.
        template = get_template(edge['label_label'])
        if not template:
            return ''
        edge['question'] = template.substitute(node1=edge['node1_label'])

        return '\t'.join([edge[col] for col in cols]) + '\n'

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'label', 'node2', 'node1_label', 'label_label',
            'node2_label', 'label_dimension', 'source', 'weight', 'creator',
            'sentence', 'question'
        ]

        with open(filename, 'r') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            sys.stdout.write(header_to_edge(out_columns))
            for row in reader:
                if not english_only or (row[2].startswith('/c/en/')
                                        and row[3].startswith('/c/en/')):
                    sys.stdout.write(row_to_edge(row, out_columns))

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file while dropping (or retaining only) named columns.

    ``columns`` names the columns to remove, or the columns to keep when
    ``all_except`` is set.  The names may additionally be split on commas
    and/or spaces and stripped of surrounding spaces.  Unknown names are
    an error unless ``ignore_missing_columns`` is set.

    Returns 0 on success and 1 after an exception was handed to
    ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stderr unless stdout was requested.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        print("--split-on-commas=%s" % str(split_on_commas), file=error_file)
        print("--split-on-spaces=%s" % str(split_on_spaces), file=error_file)
        print("--strip-spaces=%s" % str(strip_spaces), file=error_file)
        print("--all-except=%s" % str(all_except), file=error_file)
        print("--ignore-missing-columns=%s" % str(ignore_missing_columns),
              file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if columns is None:
            columns = []  # This simplifies matters.

        if split_on_spaces:
            # Be lenient and accept space-separated names *inside* shell
            # quoting, e.g.
            #
            # kgtk remove_columns -c 'name name2 name3'
            #
            # Do not enable this option if spaces are legal inside your
            # column names.
            columns = " ".join(columns).split()

        remove_columns: typing.List[str] = []
        arg: str
        column_name: str
        for arg in columns:
            candidates = arg.split(",") if split_on_commas else [arg]
            for column_name in candidates:
                if strip_spaces:
                    column_name = column_name.strip()
                if len(column_name) > 0:
                    remove_columns.append(column_name)

        if verbose:
            announcement = ("Removing all columns except %d columns: %s"
                            if all_except else "Removing %d columns: %s")
            print(announcement %
                  (len(remove_columns), " ".join(remove_columns)),
                  file=error_file,
                  flush=True)
        if len(remove_columns) == 0:
            raise KGTKException("No columns to remove")

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        output_column_names: typing.List[str]
        trouble_column_names: typing.List[str] = []
        if all_except:
            # Retain mode: keep the listed columns in input order.
            if not ignore_missing_columns:
                for column_name in remove_columns:
                    if column_name in kr.column_names:
                        continue
                    print("Error: cannot retain unknown column '%s'." %
                          column_name,
                          file=error_file,
                          flush=True)
                    trouble_column_names.append(column_name)

            output_column_names = [
                column_name for column_name in kr.column_names
                if column_name in remove_columns
            ]
        else:
            # Remove mode: start from every column and drop the listed ones.
            output_column_names = kr.column_names.copy()
            for column_name in remove_columns:
                if column_name in output_column_names:
                    output_column_names.remove(column_name)
                    continue
                if ignore_missing_columns:
                    continue
                print("Error: cannot remove unknown column '%s'." %
                      column_name,
                      file=error_file,
                      flush=True)
                trouble_column_names.append(column_name)

        if len(trouble_column_names) > 0:
            raise KGTKException("Unknown columns %s" %
                                " ".join(trouble_column_names))

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        shuffle_list: typing.List[int] = kw.build_shuffle_list(
            kr.column_names)

        input_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            kw.write(row, shuffle_list=shuffle_list)

        if verbose:
            print("Processed %d rows." % (input_line_count),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles,
        output_file: KGTKFiles):
    """Convert Visual Genome scene graphs into KGTK edges.

    ``input_file`` is a JSON list of images, each with ``objects`` and
    ``relationships``; ``attr_syn_file`` is a JSON mapping from attribute
    text to a synset name.  For every image the function emits:

    * attribute edges from each object synset to the attribute's synset:
      ``/r/CapableOf`` when the attribute synset is a verb,
      ``mw:MayHaveProperty`` for any other non-noun part of speech (noun
      attributes are skipped);
    * ``/r/LocatedNear`` edges between the synsets of the subject and
      object of each relationship, labelled with the relationship's
      predicate.

    Edges are de-duplicated per image, and an edge whose two synsets are
    identical is never written.  Any exception is passed to
    ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        """Build one output row; the label lists are joined with '|'.

        ``image_id`` is accepted but is not part of the row.
        """
        return [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]

    def add_unique(rows, seen, edge_row):
        """Append ``edge_row`` unless an identical row was already added.

        ``seen`` holds the rows as tuples, which makes the membership test
        constant-time; ``rows`` keeps first-seen order.
        """
        key = tuple(edge_row)
        if key not in seen:
            seen.add(key)
            rows.append(edge_row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
        )

        proximity_relation = '/r/LocatedNear'
        property_relation = 'mw:MayHaveProperty'
        property_relation_label = KgtkFormat.stringify('may have property')
        capableof_relation = '/r/CapableOf'
        capableof_relation_label = KgtkFormat.stringify('capable of')

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        for an_image in images_data:
            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            seen = set()
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES
                if 'attributes' not in o.keys():
                    continue
                for attr in o['attributes']:
                    attr = attr.lower()
                    if attr not in attr_synsets:
                        continue
                    asyn = attr_synsets[attr]
                    # The second dot-separated piece of the synset name is
                    # used as the part of speech.
                    apos = asyn.split('.')[1]
                    if apos == 'n':
                        continue
                    if apos == 'v':  # verb
                        attr_rel = capableof_relation
                        attr_rel_label = capableof_relation_label
                    else:  # adjective
                        attr_rel = property_relation
                        attr_rel_label = property_relation_label
                    for osyn in o_synset:
                        if osyn != asyn:
                            add_unique(
                                rows, seen,
                                create_edge('wn:' + osyn,
                                            objid2names[obj_id],
                                            'wn:' + asyn,
                                            [KgtkFormat.stringify(attr)],
                                            attr_rel, attr_rel_label,
                                            image_id))

            # RELATIONS
            for rel in an_image['relationships']:
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))

                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]

                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            add_unique(
                                rows, seen,
                                create_edge('wn:' + ssyn, sub_names,
                                            'wn:' + osyn, obj_names,
                                            proximity_relation,
                                            relation_label, image_id))

            for a_row in rows:
                ew.write(a_row)

        # Clean up
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn the non-ID columns of an input file into node1/label/node2 edges.

    For each input row, the ID column supplies ``node1``.  Every other
    selected column with a non-empty value yields one edge per list item of
    that value (``KgtkValue.split_list``), with the column name -- or its
    replacement from ``labels`` -- as the edge label.

    Args:
        input_file: input file specification.
        output_file: output file specification.
        columns: if given, only these columns are converted; otherwise all
            non-ID columns are.
        labels: optional replacement labels, paired positionally with
            ``columns``.
        id_column_name: name passed to ``kr.get_id_column_index``.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: accepted but not read by this function.
        show_options: print the effective options before processing.
        verbose, very_verbose: feedback flags, also passed to reader/writer.
        **kwargs: forwarded to ``KgtkReaderOptions.from_dict`` and
            ``KgtkValueOptions.from_dict``.

    Returns:
        0 on success; 1 if an exception was passed to
        ``kgtk_exception_auto_handler`` and that handler returned.

    Raises:
        KGTKException: when ``labels`` is given without ``columns`` or the
            two lists differ in length (raised before any file is opened).
    """
    # import modules locally
    import os
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file, flush=True)

    # Map a column name to the label that replaces it in the output.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels."
                                % (len(columns), len(labels)))
        label_map.update(zip(columns, labels))

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_column_idx: int = kr.get_id_column_index(id_column_name)
        if id_column_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        output_column_names: typing.List[str] = [
            KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
        ]

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        try:
            # Which columns to convert, and under which label, does not
            # depend on the row: decide it once instead of per row.
            wanted: typing.Optional[typing.Set[str]] = \
                None if columns is None else set(columns)
            edge_columns: typing.List[typing.Tuple[int, str]] = [
                (column_idx, label_map.get(column_name, column_name))
                for column_idx, column_name in enumerate(kr.column_names)
                if column_idx != id_column_idx
                and (wanted is None or column_name in wanted)
            ]

            input_line_count: int = 0
            output_line_count: int = 0

            row: typing.List[str]
            for row in kr:
                input_line_count += 1
                node1_value: str = row[id_column_idx]

                column_idx: int
                label_value: str
                for column_idx, label_value in edge_columns:
                    new_value: str = row[column_idx]
                    if len(new_value) == 0:
                        continue  # ignore empty values.

                    # The column value might contain a KGTK list.  Since
                    # node2 isn't supposed to contain lists, we'll split it.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(new_value):
                        if len(node2_value) == 0:
                            continue  # node2 shouldn't contain empty values

                        kw.write([node1_value, label_value, node2_value])
                        output_line_count += 1

            if verbose:
                print("Read %d node rows, wrote %d edge rows."
                      % (input_line_count, output_line_count),
                      file=error_file, flush=True)
        finally:
            # Close the output even if reading or writing failed part-way.
            kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_remove: typing.Optional[typing.List[str]] = None,
        label_value: str = KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        lift_suffix: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX,
        deduplicate_labels: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Remove ("lower") columns from the input and re-emit them as edges.

    Each removed column is paired with a base column.  Every input row is
    written to the output without the removed columns; in addition, for each
    removed column whose value and base value are both non-empty, one edge
    ``(base value, label_value, item)`` is produced per list item of the
    removed value.  Those edges go to the label file when one is given,
    otherwise to the main output.

    The columns to remove are chosen by one of three patterns:

    1. ``columns_to_remove`` and ``base_columns`` both given: paired by
       position (the lists must have equal length).
    2. only ``columns_to_remove`` given: the base name is the column name
       with ``lift_suffix`` stripped.
    3. ``columns_to_remove`` not given: every non-key column named
       ``base + lift_suffix`` for a base in ``base_columns`` (defaulting to
       the key columns present in the input) is removed.

    With ``deduplicate_labels`` each (node1, node2) pair is emitted as a
    label edge at most once; when no label file is used, existing input rows
    whose label equals ``label_value`` take part in that de-duplication.

    Returns:
        0 on success; 1 if an exception was passed to
        ``kgtk_exception_auto_handler`` and that handler returned.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = \
        KGTKArgumentParser.get_optional_output_file(label_file, who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if label_kgtk_file is not None:
            print("--label-file=%s" % str(label_kgtk_file), file=error_file)
        if base_columns is not None:
            print("--base-columns=%s" % " ".join(base_columns), file=error_file)
        if columns_to_remove is not None:
            print("--columns-to-lower=%s" % " ".join(columns_to_remove),
                  file=error_file)
        print("--label-value=%s" % label_value, file=error_file)
        print("--lift-suffix=%s" % lift_suffix, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_labels, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to the index of the base
        # column that supplies its node1 value.
        lower_map: typing.MutableMapping[int, int] = dict()

        # These columns will never be removed:
        key_column_idxs: typing.Set[int] = set(
            (kr.node1_column_idx, kr.label_column_idx,
             kr.node2_column_idx, kr.id_column_idx))
        key_column_idxs.discard(-1)
        key_column_names: typing.Set[str] = set(
            (kr.column_names[idx] for idx in key_column_idxs))

        def check_removable(name: str) -> None:
            # Shared validation for explicitly named columns (patterns 1, 2).
            if name not in kr.column_names:
                raise KGTKException(
                    "Column %s is an unknown column, cannot remove it."
                    % repr(name))
            if name in key_column_names:
                raise KGTKException(
                    "Column %s is a key column, cannot remove it."
                    % repr(name))

        have_columns: bool = \
            columns_to_remove is not None and len(columns_to_remove) > 0
        have_bases: bool = base_columns is not None and len(base_columns) > 0

        base_name: str
        column_name: str
        idx: int
        # There are three option patterns.
        if have_columns and have_bases:
            # Pattern 1: len(columns_to_remove) > 0 and
            # len(base_columns) == len(columns_to_remove).
            # column_names and base_columns are paired.
            if len(columns_to_remove) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to remove but only %d base columns."
                    % (len(columns_to_remove), len(base_columns)))

            for column_name, base_name in zip(columns_to_remove, base_columns):
                check_removable(column_name)
                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown"
                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = \
                    kr.column_name_map[base_name]

        elif have_columns:
            # Pattern 2: len(columns_to_remove) > 0 and len(base_columns) == 0.
            # Each column name is stripped of the lift suffix to determine
            # the base name.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            for column_name in columns_to_remove:
                check_removable(column_name)
                if not column_name.endswith(lift_suffix):
                    raise KGTKException("Unable to parse column name %s."
                                        % repr(column_name))
                base_name = column_name[:-len(lift_suffix)]
                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is not known"
                        % (repr(column_name), repr(base_name)))
                lower_map[kr.column_name_map[column_name]] = \
                    kr.column_name_map[base_name]

        else:
            # Pattern 3: len(columns_to_remove) == 0.
            if len(lift_suffix) == 0:
                raise KGTKException("The --lift-suffix must not be empty.")

            if not have_bases:
                # The base name list wasn't supplied.
                # Use [node1, label, node2, id]
                base_columns = list(key_column_names)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    continue

                # Does this column match a lifting pattern?
                for base_name in base_columns:
                    if len(base_name) == 0:
                        continue
                    if column_name == base_name + lift_suffix:
                        lower_map[idx] = kr.column_name_map[base_name]

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower.")

        if verbose:
            print("The following columns will be lowered",
                  file=error_file, flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_name = kr.column_names[lower_map[idx]]
                print(" %s from %s" % (column_name, base_name),
                      file=error_file, flush=True)

        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)
        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file, flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)

        lkw: typing.Optional[KgtkWriter] = None
        try:
            shuffle_list: typing.List[int] = \
                kw.build_shuffle_list(kr.column_names)

            if label_kgtk_file is not None:
                if verbose:
                    print("Opening the label output file: %s"
                          % str(label_kgtk_file),
                          file=error_file, flush=True)

                label_column_names = [
                    KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
                ]
                lkw = KgtkWriter.open(label_column_names,
                                      label_kgtk_file,
                                      mode=KgtkWriter.Mode.EDGE,
                                      verbose=verbose,
                                      very_verbose=very_verbose)

            # Optionally deduplicate the labels
            # set(node1_value + KgtkFormat.SEPARATOR + node2_value)
            label_set: typing.Set[str] = set()
            label_key: str

            # If labels will be written to the output file and
            # deduplication is enabled:
            check_existing_labels: bool = \
                deduplicate_labels and \
                lkw is None and \
                kr.node1_column_idx >= 0 and \
                kr.label_column_idx >= 0 and \
                kr.node2_column_idx >= 0

            input_line_count: int = 0
            output_line_count: int = 0
            label_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                if check_existing_labels and \
                        row[kr.label_column_idx] == label_value:
                    label_key = row[kr.node1_column_idx] \
                        + KgtkFormat.COLUMN_SEPARATOR \
                        + row[kr.node2_column_idx]
                    if label_key in label_set:
                        continue
                    label_set.add(label_key)

                kw.write(row, shuffle_list=shuffle_list)
                output_line_count += 1

                column_idx: int
                base_idx: int
                for column_idx, base_idx in lower_map.items():
                    node1_value: str = row[base_idx]
                    if len(node1_value) == 0:
                        continue  # TODO: raise an exception

                    item: str = row[column_idx]
                    if len(item) == 0:
                        continue  # Ignore empty node2 values.

                    # This item might be a KGTK list.  Let's split it,
                    # because lists aren't allowed in the node2 values we'll
                    # generate.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(item):
                        if len(node2_value) == 0:
                            continue  # Ignore empty node2 values.

                        if deduplicate_labels:
                            label_key = node1_value \
                                + KgtkFormat.COLUMN_SEPARATOR + node2_value
                            if label_key in label_set:
                                continue
                            label_set.add(label_key)

                        output_map: typing.Mapping[str, str] = {
                            KgtkFormat.NODE1: node1_value,
                            KgtkFormat.LABEL: label_value,
                            KgtkFormat.NODE2: node2_value,
                        }
                        if lkw is None:
                            kw.writemap(output_map)
                            label_line_count += 1
                            output_line_count += 1
                        else:
                            lkw.writemap(output_map)
                            label_line_count += 1

            if verbose:
                print("Read %d rows, wrote %d rows with %d labels."
                      % (input_line_count, output_line_count,
                         label_line_count),
                      file=error_file, flush=True)
        finally:
            # Close both writers even when processing failed part-way; the
            # main output is closed first, then the label file.
            try:
                kw.close()
            finally:
                if lkw is not None:
                    lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1