Exemplo n.º 1
0
    def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                      each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
        """Write the sentence row for one node if it has any attribute value.

        Args:
            kw: Writer that receives the output row.
            current_process_node_id: Identifier of the node being processed.
            each_node_attributes: Attribute mapping collected for the node.

        Returns:
            False (and nothing is written) when the mapping is empty/None or
            all of its values are falsy; True once a row has been written.
        """
        # A node is only worth a sentence when at least one attribute
        # carries a truthy value.
        if not each_node_attributes or not any(
                each_node_attributes[key] for key in each_node_attributes):
            return False

        sentence, reasoning = self.attribute_to_sentence(
            each_node_attributes, current_process_node_id)

        output_row = [
            current_process_node_id,
            self.sentence_label,
            KgtkFormat.stringify(sentence),
        ]
        # The explanation column is only emitted in explain mode.
        if self.explain:
            output_row.append(KgtkFormat.stringify(reasoning))
        kw.write(output_row)
        return True
Exemplo n.º 2
0
 def produce_node_labels(event):
     """Return the KGTK label value for an event string.

     The text before the first tab is lower-cased, stripped of trailing
     periods and surrounding whitespace, and stringified.  When
     `remove_people_mentions` yields a different, non-empty variant, both
     labels are returned joined with '|'.
     """
     # Only the part before the first tab belongs to the label.
     text = event.split('\t')[0] if '\t' in event else event
     full_label = text.lower().rstrip('.').strip()

     short_label = remove_people_mentions(full_label)
     # Collapse the double spaces left behind by the removals.
     while '  ' in short_label:
         short_label = short_label.replace('  ', ' ')

     if short_label and short_label != full_label:
         return '|'.join((KgtkFormat.stringify(full_label),
                          KgtkFormat.stringify(short_label)))
     return KgtkFormat.stringify(full_label)
Exemplo n.º 3
0
    def edge2KGTK(edge: Tuple[str, str, str]) -> pd.Series:
        """
        Gets the edge as a triple of subject, predicate, object and converts the edge to the KGTK format
        Args:
            edge: Tuple[str, str, str]
                input edge, unpacked as (subject, predicate, object)
        Returns: pd.Series
            pandas Series with keys according to KGTK format at
            https://docs.google.com/document/d/1fbbqgyX0N2EdxLam6hatfke1R-nZWkoN6M1oB_f4aQo/edit#heading=h.a5nlqev5bmm4
        """
        s, p, o = edge

        def clean(e: str) -> str:
            # Keep the part after the last ':' and turn underscores into
            # spaces before splitting camelCase words.
            out = e.split(':')[-1].replace('_', ' ')
            # Raw strings are required here: the group reference is not a
            # valid escape sequence in a normal string literal.
            return KgtkFormat.stringify(
                re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", out).strip().lower())

        return pd.Series({
            'node1': s,
            'relation': p,
            'node2': o,
            'node1;label': clean(s),
            'node2;label': clean(o),
            'relation;label': clean(p),
            'relation;dimension': '',
            'source': KgtkFormat.stringify('FN'),
            'sentence': ''
        })
Exemplo n.º 4
0
 def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                 image_id):
     """Build one nine-column edge row with source 'VG'.

     `node1_lbl` and `node2_lbl` are iterables of label strings that get
     joined with '|'.  `image_id` is accepted but not used in the row.
     """
     subject_labels = '|'.join(node1_lbl)
     object_labels = '|'.join(node2_lbl)
     source = KgtkFormat.stringify('VG')
     # Columns: node1, relation, node2, node1 labels, node2 labels,
     # relation label, (empty), source, (empty).
     return [
         node1, rel, node2, subject_labels, object_labels, rel_lbl, '',
         source, ''
     ]
Exemplo n.º 5
0
 def create_edges(data, labels, rel, rel_label):
     """Build edge rows with source 'WN' for every (node1, node2) pair.

     Args:
         data: Mapping from a node name to an iterable of related node names.
         labels: Mapping from a node name to its label string.
         rel: Relation identifier placed in the second column.
         rel_label: Relation label placed in the sixth column.

     Returns:
         A list of nine-element rows; node names are prefixed with 'wn:'.
     """
     source = KgtkFormat.stringify('WN')
     all_rows = []
     for node1, targets in data.items():
         for node2 in targets:
             # Labels are looked up per pair so that a node1 without any
             # targets never needs an entry in `labels`.
             all_rows.append([
                 'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                 labels[node2], rel_label, "", source, ''
             ])
     return all_rows
Exemplo n.º 6
0
    def row_to_edge(row, cols):
        """Turn one input row into a tab-separated edge line.

        Reads the relation from row[1], the two nodes from row[2] and row[3],
        and JSON metadata from row[4].  The returned line contains the edge
        fields named in `cols`, in that order, terminated by a newline.
        """
        subject, relation, target = row[2], row[1], row[3]
        edge = {
            'node1': subject,
            'relation': relation,
            'node2': target,
            'node1_label': make_node_label(subject),
            'node2_label': make_node_label(target),
            'relation_label': make_rel_label(relation),
            'relation_dimension': '',
        }

        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        # The sentence is the surface text with all backslashes removed,
        # or empty when the metadata has no surface text.
        if 'surfaceText' in metadata.keys():
            sentence = KgtkFormat.stringify(
                metadata['surfaceText'].replace('\\', ''))
        else:
            sentence = ''
        edge['sentence'] = sentence

        return '\t'.join(edge[col] for col in cols) + '\n'
Exemplo n.º 7
0
def extract(input_file, output_file, source):
    """Rewrite a tab-separated edge file into the ten-column output layout.

    The first line of `input_file` is treated as a header and skipped.  For
    every following line the first three fields are used as node1, relation
    and node2 (with 'same' replaced by 'Same' in the relation), and the edge
    id is those three fields joined with '-'.

    Args:
        input_file: Path of the tab-separated input file.
        output_file: Path of the file to write.
        source: Source name; stored stringified in the 'source' column.
    """
    columns = ['id', 'node1', 'relation', 'node2', 'node1;label', 'node2;label', 'relation;label', 'relation;dimension', 'source', 'sentence']
    with open(output_file, 'w') as w:
        w.write(print_edge(columns))
        with open(input_file, 'r') as f:
            # Skip the header; an empty input simply yields a header-only
            # output instead of raising StopIteration.
            next(f, None)
            source_value = KgtkFormat.stringify(source)
            for line in f:
                # Drop the line terminator so it cannot leak into the id or
                # the third field when a line has only three columns.
                data = line.rstrip('\n').split('\t')
                data[1] = data[1].replace('same', 'Same')
                edge_id = '-'.join(data[:3])
                new_row = [edge_id, *data[:3], "", "", "", "", source_value, ""]
                w.write(print_edge(new_row))
0
 def produce_rel_label(rel):
     """Return the stringified human-readable label for a relation name.

     Raises KeyError when `rel` is not one of the known relation names.
     """
     labels_by_relation = dict(
         xAttr='person x has attribute',
         oAttr='others have attribute',
         xReact='person x feels',
         oReact='others feel',
         xIntent='person x wants',
         xWant='person x wants',
         oWant='others want',
         xNeed='person x needs',
         xEffect='effect on person x',
         oEffect='the effect on others',
     )
     label = labels_by_relation[rel]
     return KgtkFormat.stringify(label)
Exemplo n.º 9
0
    def row_to_edge(node1, rel, node2, source, cols):
        """Build an edge as a list of values ordered like `cols`.

        Both node ids are prefixed with the lower-cased source name and a
        colon; labels come from `make_node_label` / `make_rel_label`.
        """
        namespace = source.lower()
        edge = {
            'node1': namespace + ':' + node1,
            'relation': rel,
            'node2': namespace + ':' + node2,
            'node1;label': make_node_label(node1),
            'node2;label': make_node_label(node2),
            'relation;label': make_rel_label(rel),
            'relation;dimension': '',
            'source': KgtkFormat.stringify(source),
            'sentence': '',
        }
        # Emit only the requested columns, in the requested order.
        return [edge[col] for col in cols]
Exemplo n.º 10
0
def run(input_file: KGTKFiles, output_file: KGTKFiles):
    """Convert a CSV of events and their annotations into a KGTK edge file.

    The CSV is read with pandas using its first column (the event text) as
    the index.  The first nine remaining columns are decoded with
    json.loads and the last two columns are dropped.  One edge with source
    'AT' is written for every (event, column, value) combination whose
    value is not 'none'.

    Args:
        input_file: Input file specification, resolved through
            KGTKArgumentParser.get_input_file.
        output_file: Output file specification, resolved through
            KGTKArgumentParser.get_output_file.

    Raises:
        KGTKException: Wraps any exception raised during the conversion; the
            original exception is kept as its cause.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    import csv
    import re
    import json
    from pathlib import Path
    from string import Template
    import pandas as pd
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node(x):
        # Node ids are the label with underscores, in the 'at:' namespace.
        und_x = x.replace(' ', '_')
        pref_und_x = 'at:%s' % und_x
        return pref_und_x

    def remove_people_mentions(event):
        # Strip person placeholders and blanks from an event description.
        e = event.replace('personx', '').strip()
        e = e.replace('persony', '').strip()
        e = e.replace('person x', '').strip()
        e = e.replace('person y', '').strip()
        e = e.replace('the ___', '')
        e = e.replace('___', '')
        e = e.replace("'s", '')
        e = e.replace('to y', '')
        return e.strip()

    def produce_node_labels(event):
        # Returns one stringified label, or two joined with '|' when
        # removing people mentions yields a different non-empty variant.
        if '\t' in event:
            event = event.split('\t')[0]
        e1 = event.lower()
        e1 = e1.rstrip('.').strip()
        e2 = remove_people_mentions(e1)
        while '  ' in e2:
            e2 = e2.replace('  ', ' ')
        if e1 != e2 and e2:
            return '|'.join(
                [KgtkFormat.stringify(e1),
                 KgtkFormat.stringify(e2)])
        else:
            return KgtkFormat.stringify(e1)

    def produce_rel_label(rel):
        mapping = {
            'xAttr': 'person x has attribute',
            'oAttr': 'others have attribute',
            'xReact': 'person x feels',
            'oReact': 'others feel',
            'xIntent': 'person x wants',
            'xWant': 'person x wants',
            'oWant': 'others want',
            'xNeed': 'person x needs',
            'xEffect': 'effect on person x',
            'oEffect': 'the effect on others'
        }
        return KgtkFormat.stringify(mapping[rel])

    try:

        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        # From here on the writer is open: make sure it is closed even when
        # reading or converting the input fails.
        try:
            df = pd.read_csv(filename, index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))

            # Drop the last two columns.
            df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)
            df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)

            source = KgtkFormat.stringify('AT')
            sentence = ''

            for event, row in df.iterrows():
                event_label = produce_node_labels(event)

                # The node id is derived from the first (full) label only.
                first_event_label = KgtkFormat.unstringify(
                    event_label.split('|')[0])
                n1 = make_node(first_event_label)
                for c in df.columns:
                    for v in row[c]:
                        if v == 'none':
                            continue
                        value_label = produce_node_labels(v)
                        first_value_label = KgtkFormat.unstringify(
                            value_label.split('|')[0])
                        n2 = make_node(first_value_label)

                        rel_label = produce_rel_label(c)
                        relation = make_node(c)

                        ew.write([
                            n1, relation, n2, event_label, value_label,
                            rel_label, '', source, sentence
                        ])
        finally:
            # Clean up.
            ew.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
Exemplo n.º 11
0
 def make_node_label(node):
     """Return the stringified label taken from the fourth '/'-separated
     part of the stripped node id, with underscores turned into spaces."""
     segments = node.strip().split('/')
     label_text = segments[3].replace('_', ' ')
     return KgtkFormat.stringify(label_text)
Exemplo n.º 12
0
 def make_rel_label(rel):
     """Return the stringified label for a relation: the text after the last
     '/' passed through `split_camel_case`."""
     relation_name = rel.rsplit('/', 1)[-1]
     return KgtkFormat.stringify(split_camel_case(relation_name))
Exemplo n.º 13
0
    def implode_language_qualified_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        """Rebuild a language-qualified string from its exploded fields.

        Args:
            input_line_count: Input line number, used in diagnostics only.
            row: The current row of field values.
            implosion: Maps value field names to column indexes in `row`.
            type_name: Data type name, used in diagnostics only.

        Returns:
            A (value, valid) tuple.  `value` is the empty string when the
            text or language field failed the structural checks.
        """

        def complain(problem: str) -> None:
            # Diagnostics are only emitted in verbose mode.
            if self.verbose:
                print("Input line %d: data type '%s': %s" %
                      (input_line_count, type_name, problem),
                      file=self.error_file,
                      flush=True)

        ok: bool = True

        # The text field must be present and wrapped in double quotes.
        text_field = KgtkValueFields.TEXT_FIELD_NAME
        text_val: str = row[implosion[text_field]]
        if len(text_val) == 0:
            ok = False
            complain("%s field is empty" % text_field)
        elif len(text_val) == 1:
            ok = False
            complain("%s field is too short" % text_field)
        else:
            if not text_val.startswith('"'):
                ok = False
                complain("%s field does not start with a double quote" %
                         text_field)
            if not text_val.endswith('"'):
                ok = False
                complain("%s field does not end with a double quote" %
                         text_field)

        # The language field must not be empty.
        language_field = KgtkValueFields.LANGUAGE_FIELD_NAME
        language_val: str = self.unwrap(row[implosion[language_field]])
        if len(language_val) == 0:
            ok = False
            complain("%s field is empty" % language_field)

        # A negative index means there is no language suffix column.
        suf_idx: int = implosion[KgtkValueFields.LANGUAGE_SUFFIX_FIELD_NAME]
        suf: str = ""
        if suf_idx >= 0:
            suf = self.unwrap(row[suf_idx])
        if len(suf) > 0 and not suf.startswith("-"):
            # As a special favor, we'll accept language suffixes that do not
            # start with a dash.  We'll prepend the dash.
            suf = "-" + suf

        if not ok:
            return "", False

        # This subterfuge uses Python's literal parser to parse the string.
        if not self.escape_pipes:
            # ast.literal_eval(...) doesn't treat backslash pipe as an
            # escaped pipe (this is documented behavior), so we remove
            # escaped pipes manually.
            text_val = text_val.replace('\\|', '|')
        value: str = KgtkFormat.stringify(ast.literal_eval(text_val),
                                          language=language_val,
                                          language_suffix=suf)

        if self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            ok = kv.is_language_qualified_string(validate=True)
            if not ok:
                complain(
                    "imploded value '%s' is not a valid language qualified string."
                    % value)
        return value, ok
sources=['AT', 'RG', 'CN']
identity_rel='mw:SameAs'

# Maps a label to the set of lexical node ids that carry it.
lbl2ids=defaultdict(set)

with open(input_file, 'r') as f:
    header=next(f)
    for line in f:
        data=line.split('\t')
        # Column 8 holds the source; only the selected sources are used.
        if check_source(data[8], sources):
            node1=data[1]
            node2=data[3]
            if lexical_node(node1):
                node1_label=data[4]
                lbl2ids[node1_label].add(node1)
            if lexical_node(node2):
                node2_label=data[5]
                lbl2ids[node2_label].add(node2)
print(len(lbl2ids))

with open('tmp/lexical_mappings.tsv', 'w') as w:
    w.write(header)
    source=KgtkFormat.stringify('LEX')
    for ids in lbl2ids.values():
        if len(ids)<=1: continue

        # Sets have no stable order; sorting makes the chain of identity
        # edges reproducible from one run to the next.
        list_ids=sorted(ids)
        # Link each id to its successor so all ids sharing a label are chained.
        for first_id, second_id in zip(list_ids, list_ids[1:]):
            edge_id='%s-%s-%s-1' % (first_id, identity_rel, second_id)
            row=[edge_id, first_id, identity_rel, second_id, '', '', '', '', source, '']
            w.write('\t'.join(row) + '\n')
Exemplo n.º 15
0
 def clean(e: str) -> str:
     """Return a stringified, lower-cased, human-readable label for `e`.

     Keeps the part after the last ':', replaces underscores with spaces and
     inserts a space at every lower-case/upper-case boundary.
     """
     out = e.split(':')[-1].replace('_', ' ')
     # Raw strings are required: the group reference is not a valid escape
     # sequence in a normal string literal.
     return KgtkFormat.stringify(
         re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", out).strip().lower())
Exemplo n.º 16
0
    def implode_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        """Rebuild a plain string value from its exploded text field.

        When a non-empty language field is present, the work is delegated to
        `implode_language_qualified_string` if `self.general_strings` is set;
        otherwise the row is reported as invalid.

        Args:
            input_line_count: Input line number, used in diagnostics only.
            row: The current row of field values.
            implosion: Maps value field names to column indexes in `row`.
            type_name: Data type name, used in diagnostics only.

        Returns:
            A (value, valid) tuple.  `value` is the empty string when the
            structural checks failed.
        """

        def complain(problem: str) -> None:
            # Diagnostics are only emitted in verbose mode.
            if self.verbose:
                print("Input line %d: data type '%s': %s" %
                      (input_line_count, type_name, problem),
                      file=self.error_file,
                      flush=True)

        ok: bool = True

        # A missing language column is treated like a negative index.
        language_field = KgtkValueFields.LANGUAGE_FIELD_NAME
        language_idx: int = (implosion[language_field]
                             if language_field in implosion else -1)
        if language_idx >= 0 and len(self.unwrap(row[language_idx])) > 0:
            if self.general_strings:
                return self.implode_language_qualified_string(
                    input_line_count, row, implosion, type_name)
            ok = False
            complain("%s field is not empty" % language_field)

        # The text field must be present and wrapped in double quotes.
        text_field = KgtkValueFields.TEXT_FIELD_NAME
        text_val: str = row[implosion[text_field]]
        if len(text_val) == 0:
            ok = False
            complain("%s field is empty" % text_field)
        elif len(text_val) == 1:
            ok = False
            complain("%s field is too short" % text_field)
        else:
            if not text_val.startswith('"'):
                ok = False
                complain("%s field does not start with a double quote" %
                         text_field)
            if not text_val.endswith('"'):
                ok = False
                complain("%s field does not end with a double quote" %
                         text_field)

        if not ok:
            return "", False

        # This subterfuge uses Python's literal parser to parse the string.
        if not self.escape_pipes:
            # ast.literal_eval(...) doesn't treat backslash pipe as an
            # escaped pipe (this is documented behavior), so we remove
            # escaped pipes manually.
            text_val = text_val.replace('\\|', '|')
        value: str = KgtkFormat.stringify(ast.literal_eval(text_val))

        if self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            ok = kv.is_string(validate=True)
            if not ok:
                complain("imploded value '%s' is not a valid string." % value)
        return value, ok
Exemplo n.º 17
0
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles):
    """Convert a scene-graph JSON file into tab-separated edges on stdout.

    For every image, edges are produced from object attributes (verb
    attributes use the CapableOf relation, other non-noun attributes the
    MayHaveProperty relation) and from relationships between objects
    (LocatedNear).  Duplicate rows within one image are written only once,
    in order of first appearance.

    Args:
        input_file: Scene-graph JSON input, resolved through
            KGTKArgumentParser.get_input_file.
        attr_syn_file: JSON file mapping attribute names to synsets,
            resolved through KGTKArgumentParser.get_input_file.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat

    out_columns = [
        'node1', 'relation', 'node2', 'node1_label', 'node2_label',
        'relation_label', 'relation_dimension', 'source', 'sentence'
    ]

    proximity_relation = '/r/LocatedNear'
    property_relation = 'mw:MayHaveProperty'
    property_relation_label = KgtkFormat.stringify('may have property')
    capableof_relation = '/r/CapableOf'
    capableof_relation_label = KgtkFormat.stringify('capable of')

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        # Returns one tab-separated, newline-terminated edge line.
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]
        return '\t'.join(my_row) + '\n'

    def header_to_edge(row):
        # Column names use '_' internally but ';' in the emitted header.
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def create_uri(ns, rel):
        return '%s:%s' % (ns, rel)

    def add_unique(edge_row, rows, seen):
        # Keeps first-seen order while avoiding a linear scan per row.
        if edge_row not in seen:
            seen.add(edge_row)
            rows.append(edge_row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        sys.stdout.write(header_to_edge(out_columns))

        for an_image in images_data:

            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            seen = set()
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES
                if 'attributes' not in o:
                    continue
                for attr in o['attributes']:
                    attr = attr.lower()
                    if attr not in attr_synsets:
                        continue
                    asyn = attr_synsets[attr]
                    # The second dot-separated part of the synset name is
                    # its part of speech.
                    apos = asyn.split('.')[1]
                    if apos == 'n':
                        continue
                    if apos == 'v':  # verb
                        attr_relation = capableof_relation
                        attr_relation_label = capableof_relation_label
                    else:  # adjective
                        attr_relation = property_relation
                        attr_relation_label = property_relation_label
                    for osyn in o_synset:
                        if osyn != asyn:
                            add_unique(
                                create_edge('wn:' + osyn,
                                            objid2names[obj_id],
                                            'wn:' + asyn,
                                            [KgtkFormat.stringify(attr)],
                                            attr_relation,
                                            attr_relation_label, image_id),
                                rows, seen)

            # RELATIONS
            for rel in an_image['relationships']:
                #synsets=rel['synsets']
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            add_unique(
                                create_edge('wn:' + ssyn, sub_names,
                                            'wn:' + osyn, obj_names,
                                            proximity_relation,
                                            relation_label, image_id),
                                rows, seen)
            for a_row in rows:
                sys.stdout.write(a_row)

    except Exception as e:
        kgtk_exception_auto_handler(e)
Exemplo n.º 18
0
def run(output_file: KGTKFiles):
    """Export WordNet relations from NLTK as a KGTK edge file.

    Hypernyms are written with relation '/r/IsA', member and part holonyms
    with '/r/PartOf', and substance meronyms with '/r/MadeOf'.  Node labels
    are the synset lemma names joined with '|'.

    Args:
        output_file: Output file specification, resolved through
            KGTKArgumentParser.get_output_file.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    import nltk
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def obtain_wordnet_lemmas(syn):
        # Lemma names use underscores where the label should have spaces.
        return [
            KgtkFormat.stringify(lemma.replace('_', ' '))
            for lemma in syn.lemma_names()
        ]

    def obtain_hypernyms(syn):
        return [hypernym.name() for hypernym in syn.hypernyms()]

    def obtain_member_holonyms(syn):
        return [hol.name() for hol in syn.member_holonyms()]

    def obtain_part_holonyms(syn):
        return [hol.name() for hol in syn.part_holonyms()]

    def obtain_substance_meronyms(syn):
        return [mer.name() for mer in syn.substance_meronyms()]

    def get_wn_data():
        # Collects labels for every synset, plus each relation mapping
        # restricted to synsets that have at least one related synset.
        syns = list(wn.all_synsets())
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        collectors = (
            (all_hyps, obtain_hypernyms),
            (all_members, obtain_member_holonyms),
            (all_parts, obtain_part_holonyms),
            (all_subs, obtain_substance_meronyms),
        )
        for syn in syns:
            syn_name = syn.name()
            all_labels[syn_name] = '|'.join(obtain_wordnet_lemmas(syn))
            for target, obtain in collectors:
                related = obtain(syn)
                if related:
                    target[syn_name] = related

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        # One row per (node1, node2) pair, in the 'wn:' namespace.
        all_rows = []
        source = KgtkFormat.stringify('WN')
        for node1, targets in data.items():
            for node2 in targets:
                all_rows.append([
                    'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                    labels[node2], rel_label, "", source, ''
                ])
        return all_rows

    try:
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        # The writer is open from here on: close it even if collecting or
        # writing the edges fails.
        try:
            all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data(
            )
            hyp_edges = create_edges(all_hyps, all_labels, '/r/IsA',
                                     KgtkFormat.stringify('is a'))
            member_edges = create_edges(all_members, all_labels, '/r/PartOf',
                                        KgtkFormat.stringify('is a part of'))
            part_edges = create_edges(all_parts, all_labels, '/r/PartOf',
                                      KgtkFormat.stringify('is a part of'))
            sub_edges = create_edges(all_subs, all_labels, '/r/MadeOf',
                                     KgtkFormat.stringify('is made of'))
            all_edges = hyp_edges + member_edges + part_edges + sub_edges

            for edge in all_edges:
                ew.write(edge)
        finally:
            # Clean up.
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Exemplo n.º 19
0
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles,
        output_file: KGTKFiles):
    """Convert a scene-graph JSON file into a KGTK edge file.

    For every image, edges are produced from object attributes (verb
    attributes use the CapableOf relation, other non-noun attributes the
    MayHaveProperty relation) and from relationships between objects
    (LocatedNear).  Duplicate rows within one image are written only once,
    in order of first appearance.

    Args:
        input_file: Scene-graph JSON input, resolved through
            KGTKArgumentParser.get_input_file.
        attr_syn_file: JSON file mapping attribute names to synsets,
            resolved through KGTKArgumentParser.get_input_file.
        output_file: Output file specification, resolved through
            KGTKArgumentParser.get_output_file.
    """

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        # Returns one nine-column edge row as a list.
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]
        return my_row

    def add_unique(edge_row, rows, seen):
        # Rows are lists (unhashable), so a tuple copy serves as the set key.
        # Keeps first-seen order while avoiding a linear scan per row.
        key = tuple(edge_row)
        if key not in seen:
            seen.add(key)
            rows.append(edge_row)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        # The writer is open from here on: close it even if loading or
        # converting the input fails.
        try:
            proximity_relation = '/r/LocatedNear'
            property_relation = 'mw:MayHaveProperty'
            property_relation_label = KgtkFormat.stringify(
                'may have property')
            capableof_relation = '/r/CapableOf'
            capableof_relation_label = KgtkFormat.stringify('capable of')

            with open(scene_graph_filename, 'r') as f:
                images_data = json.load(f)

            with open(attr_synsets_filename, 'r') as f:
                attr_synsets = json.load(f)

            for an_image in images_data:

                image_id = str(an_image['image_id'])

                # OBJECTS
                objid2names = defaultdict(list)
                objid2syns = {}
                rows = []
                seen = set()
                for o in an_image['objects']:
                    obj_id = o['object_id']
                    o_synset = o['synsets']
                    objid2syns[obj_id] = o_synset
                    for name in o['names']:
                        name = name.strip().lower().rstrip('.')
                        if not name:
                            continue
                        objid2names[obj_id].append(
                            KgtkFormat.stringify(name))

                    # ATTRIBUTES
                    if 'attributes' not in o:
                        continue
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr not in attr_synsets:
                            continue
                        asyn = attr_synsets[attr]
                        # The second dot-separated part of the synset name
                        # is its part of speech.
                        apos = asyn.split('.')[1]
                        if apos == 'n':
                            continue
                        if apos == 'v':  # verb
                            attr_relation = capableof_relation
                            attr_relation_label = capableof_relation_label
                        else:  # adjective
                            attr_relation = property_relation
                            attr_relation_label = property_relation_label
                        for osyn in o_synset:
                            if osyn != asyn:
                                add_unique(
                                    create_edge(
                                        'wn:' + osyn, objid2names[obj_id],
                                        'wn:' + asyn,
                                        [KgtkFormat.stringify(attr)],
                                        attr_relation, attr_relation_label,
                                        image_id), rows, seen)

                # RELATIONS
                for rel in an_image['relationships']:
                    #synsets=rel['synsets']
                    relation_label = KgtkFormat.stringify(
                        rel['predicate'].lower().strip().strip('.'))
                    sub_id = rel['subject_id']
                    sub_names = objid2names[sub_id]
                    sub_syns = objid2syns[sub_id]
                    obj_id = rel['object_id']
                    obj_names = objid2names[obj_id]
                    obj_syns = objid2syns[obj_id]

                    for ssyn in sub_syns:
                        for osyn in obj_syns:
                            if osyn != ssyn:
                                add_unique(
                                    create_edge('wn:' + ssyn, sub_names,
                                                'wn:' + osyn, obj_names,
                                                proximity_relation,
                                                relation_label, image_id),
                                    rows, seen)
                for a_row in rows:
                    ew.write(a_row)
        finally:
            # Clean up
            ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Exemplo n.º 20
0
 def obtain_wordnet_lemmas(syn):
     """Return the synset's lemma names as stringified labels, with
     underscores replaced by spaces."""
     return [
         KgtkFormat.stringify(name.replace('_', ' '))
         for name in syn.lemma_names()
     ]
Exemplo n.º 21
0
 def make_node_label(node):
     """Return the stringified node label: the node id without its first
     three characters."""
     label_text = node[3:]
     return KgtkFormat.stringify(label_text)