예제 #1
0
 def process_entity(self, entity):
     """Translate an RDF/OWL entity URI into a symbol.

     Three well-known URIs are handled explicitly. Any other URI is split
     on '#' (when present) or on '/', and its last two segments are used
     as namespace and name. The 'ontology' and 'resource' namespaces are
     resolved further by ``process_ontology`` and
     ``self.process_resource``.

     Returns ``const.is_type_of`` for rdf:type, ``const.are_related`` for
     rdfs:seeAlso, and ``sym.build(name, namespace)`` for ordinary
     entities. Returns ``None`` for owl:Thing, for a URI that cannot be
     split into a non-empty namespace and name, or when the resolver
     yields no name.
     """
     if entity == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>':
         return const.is_type_of
     if entity == '<http://www.w3.org/2000/01/rdf-schema#seeAlso>':
         return const.are_related
     if entity == '<http://www.w3.org/2002/07/owl#Thing>':
         return None

     # A fragment separator takes precedence over path separators.
     parts = entity.split('#' if '#' in entity else '/')
     if len(parts) < 2:
         return None
     namespace, name = parts[-2], parts[-1]
     # A URI ending in '/' or '#' leaves an empty name; indexing name[-1]
     # on it would raise IndexError, so reject it like an empty namespace.
     if not namespace or not name:
         return None

     # Strip the angle brackets that delimit the URI.
     if namespace.startswith('<'):
         namespace = namespace[1:]
     if name.endswith('>'):
         name = name[:-1]

     if namespace == 'ontology':
         name, namespace = process_ontology(name)
     elif namespace == 'resource':
         name, namespace = self.process_resource(name)
     if name is None:
         return None
     return sym.build(name.lower(), namespace)
예제 #2
0
 def process_comments(self, post):
     """Recursively feed a comment tree to the text processor.

     When ``post`` has a 'body', it is passed to ``self.process_text``
     together with a 'reddit_user' symbol built from ``post['author']``
     and ``reset_context=False``. Every truthy entry under 'comments' is
     then processed the same way.
     """
     if 'body' in post:
         commenter = sym.build(post['author'], 'reddit_user')
         self.process_text(post['body'], commenter, reset_context=False)

     if 'comments' not in post:
         return
     # Falsy entries (e.g. None placeholders) are skipped.
     for reply in filter(None, post['comments']):
         self.process_comments(reply)
예제 #3
0
 def to_hyperedge(self, with_namespaces=True):
     """Return the symbol that represents this entity's token.

     With ``with_namespaces`` the symbol is built from the token word and
     ``self.namespace``; otherwise the word alone is converted with
     ``sym.str2symbol``. Connectors get a '+' prefix.
     """
     word = self.token.word
     if with_namespaces:
         symbol = sym.build(word, self.namespace)
     else:
         symbol = sym.str2symbol(word)
     return '+%s' % symbol if self.connector else symbol
    def process_post(self, post):
        """Process the text of a post attributed to a web entity.

        Builds a 'web_entity' symbol from ``post['web_entity']``, prints
        it, and hands the stripped ``post['text']`` to
        ``self.process_text``. Posts whose text is empty after stripping
        are ignored.
        """
        source = sym.build(post['web_entity'], 'web_entity')
        print('web_entity: %s' % source)

        body = post['text'].strip()
        if body:
            # Terminate text that ends on a letter or digit with a period.
            if body[-1].isalnum():
                body = body + '.'
            self.process_text(body, source)
예제 #5
0
    def process_post(self, post):
        """Process a post's title and, if enabled, its comments.

        Builds a 'reddit_user' symbol from ``post['author']``, prints it,
        and passes the stripped ``post['title']`` to ``self.process_text``
        with ``reset_context=True``. When ``self.comments`` is truthy the
        post is then handed to ``self.process_comments``.
        """
        author = sym.build(post['author'], 'reddit_user')
        print('author: %s' % author)

        # Auxiliary text generation is disabled, hence the empty aux_text
        # passed to process_text below.
        text = post['title'].strip()
        # An empty or whitespace-only title must not be indexed: text[-1]
        # would raise IndexError.
        if text and text[-1].isalnum():
            text += '.'
        self.process_text(text, author, reset_context=True, aux_text='')
        if self.comments:
            self.process_comments(post)
예제 #6
0
    def generate_labels(self, entity_id):
        """Add a has-label edge for a node entity and all node descendants.

        Leaves are ignored. For a node, children are handled first; then
        an edge ``[cons.has_label, hyperedge, label]`` is appended to
        ``self.output.edges``, where the label symbol is built from the
        entity's text in ``cons.label_namespace``.
        """
        entity = self.output.tree.get(entity_id)
        if not entity.is_node():
            return

        # Children are labelled before their parent.
        for child_id in entity.children_ids:
            self.generate_labels(child_id)

        edge = entity.to_hyperedge()
        label = sym.build(entity.as_text(), cons.label_namespace)
        self.output.edges.append([cons.has_label, edge, label])
예제 #7
0
    def process_entity(self, entity_id):
        """Generate namespaces for an entity subtree and link lemmas.

        The entity's namespace is generated first. Non-leaf entities then
        recurse into their children. For a leaf whose word differs from
        its lemma (case-insensitively), a
        ``(const.have_same_lemma, hyperedge, lemma_symbol)`` edge is
        appended to ``self.output.edges``.
        """
        entity = self.output.tree.get(entity_id)
        entity.generate_namespace()

        if not entity.is_leaf():
            for child_id in entity.children_ids:
                self.process_entity(child_id)
            return

        lemma = entity.token.lemma.lower()
        if entity.token.word.lower() == lemma:
            return
        lemma_symbol = sym.build(lemma, entity.namespace)
        self.output.edges.append(
            (const.have_same_lemma, entity.to_hyperedge(), lemma_symbol))
예제 #8
0
 def to_hyperedge(self, with_namespaces=True):
     """Convert this entity into a symbol or a tuple of hyperedges.

     A non-compound entity becomes a tuple holding the hyperedge of each
     child. A compound entity becomes a single symbol whose name is the
     words of its natural leaf sequence joined by '_'. With
     ``with_namespaces`` that symbol is built in ``self.namespace``,
     which is generated first if it is not set yet.
     """
     if not self.compound:
         return tuple(
             child.to_hyperedge(with_namespaces=with_namespaces)
             for child in self.children()
         )

     words = [leaf.token.word for leaf in self.natural_leaf_sequence()]
     if with_namespaces:
         if not self.namespace:
             self.generate_namespace()
         return sym.build('_'.join(words), self.namespace)
     return sym.str2symbol('_'.join(words))
예제 #9
0
    def generate_synonyms(self, entity_id):
        """Append synonym edges for an entity and all of its descendants.

        Children of a node are handled first. If ``entity.to_synonym()``
        is truthy, ``[cons.are_synonyms, edge, synonym]`` is appended to
        ``self.output.edges``. For a node whose first child is a
        connector, a second synonym edge links the hyperedge to a symbol
        built from the entity's text in a 'gb'-prefixed namespace derived
        from a hash of the edge string.
        """
        entity = self.output.tree.get(entity_id)
        if entity.is_node():
            # Descendants are processed before the entity itself.
            for child_id in entity.children_ids:
                self.generate_synonyms(child_id)

        edge = entity.to_hyperedge()
        synonym = entity.to_synonym()
        if synonym:
            self.output.edges.append([cons.are_synonyms, edge, synonym])

        if not entity.is_node():
            return
        if not entity.children()[0].is_connector():
            return

        text = entity.as_text()
        namespace = 'gb%s' % sym.hashed(ed.edge2str(edge))
        text_symbol = sym.build(text, namespace)
        self.output.edges.append([cons.are_synonyms, edge, text_symbol])
예제 #10
0
 def process_entity(self, entity):
     """Translate a DBpedia entity URI into a symbol.

     The wordnet_type property URI is handled explicitly. Any other URI
     is split on '#' (when present) or on '/', and its last two segments
     are used as namespace and name. The 'resource' and 'instances'
     namespaces are resolved further by ``self.process_resource`` and
     ``process_wordnet_instance``.

     Returns ``const.is_type_of`` for the wordnet_type property and
     ``sym.build(name, namespace)`` for ordinary entities. Returns
     ``None`` for a URI that cannot be split into a non-empty namespace
     and name, or when the resolver yields no name.
     """
     if entity == '<http://dbpedia.org/property/wordnet_type>':
         return const.is_type_of

     # A fragment separator takes precedence over path separators.
     parts = entity.split('#' if '#' in entity else '/')
     # Input without any separator has a single part; parts[-2] would
     # raise IndexError.
     if len(parts) < 2:
         return None
     namespace, name = parts[-2], parts[-1]
     # A URI ending in '/' or '#' leaves an empty name; indexing name[-1]
     # on it would raise IndexError, so reject it like an empty namespace.
     if not namespace or not name:
         return None

     # Strip the angle brackets that delimit the URI.
     if namespace.startswith('<'):
         namespace = namespace[1:]
     if name.endswith('>'):
         name = name[:-1]

     if namespace == 'resource':
         name, namespace = self.process_resource(name)
     elif namespace == 'instances':
         name, namespace = process_wordnet_instance(name)
     if name is None:
         return None
     return sym.build(name.lower(), namespace)
예제 #11
0
 def test_build(self):
     # Building 'graphbrain' in namespace '1' must yield 'graphbrain/1'.
     expected = 'graphbrain/1'
     symbol = sym.build('graphbrain', '1')
     self.assertEqual(symbol, expected)
예제 #12
0
 def test_build(self):
     # Building from the parts ['graphbrain', '1'] must yield
     # 'graphbrain/1'.
     expected = 'graphbrain/1'
     symbol = sym.build(['graphbrain', '1'])
     self.assertEqual(symbol, expected)
예제 #13
0
def lemma2symbol(lemma):
    """Build the symbol for a lemma.

    The namespace is 'wn.' followed by the name of the lemma's synset;
    the symbol name is the lower-cased lemma name.
    NOTE(review): ``lemma`` is presumably an NLTK WordNet lemma; confirm
    against the caller.
    """
    namespace = 'wn.%s' % lemma.synset().name()
    name = lemma.name().lower()
    return sym.build(name, namespace)
예제 #14
0
def to_symbol(wikidata_id, label):
    """Build a symbol named ``label`` in the 'wd<wikidata_id>' namespace."""
    return sym.build(label, 'wd%s' % wikidata_id)
예제 #15
0
def generate(hg):
    """Derive synonym sets from the beliefs in ``hg`` and write them back.

    Steps, each announced on stdout:

    1. feed every belief edge of ``hg`` to a ``Meronomy``;
    2. run the meronomy's post-assignment pass over the same beliefs;
    3. generate and normalize the meronomy graph, then its synonym sets;
    4. for each synonym set, add to ``hg`` one ``cons.are_synonyms`` edge
       per member edge and one ``cons.has_label`` edge for the synonym
       symbol.

    The label of a synonym set is obtained via ``hg.get_label`` from the
    member edge with the highest count in ``mer.edge_counts``.
    """
    print('starting parser...')
    parser = par.Parser()

    mer = Meronomy(hg, parser)

    print('reading edges...')
    edge_total = 0
    belief_total = 0

    vertex_total = hg.symbol_count() + hg.edge_count()
    with progressbar.ProgressBar(max_value=vertex_total) as bar:
        for seen, vertex in enumerate(hg.all(), 1):
            if sym.is_edge(vertex):
                edge_total += 1
                if hg.is_belief(vertex):
                    mer.add_edge(vertex)
                    belief_total += 1
            # Refresh the progress bar only every 1000 vertices.
            if seen % 1000 == 0:
                bar.update(seen)

    print('edges: %s; beliefs: %s' % (edge_total, belief_total))

    print('post assignments...')
    with progressbar.ProgressBar(max_value=vertex_total) as bar:
        for seen, vertex in enumerate(hg.all(), 1):
            if sym.is_edge(vertex) and hg.is_belief(vertex):
                mer.post_assignments(vertex)
            if seen % 1000 == 0:
                bar.update(seen)

    print('generating meronomy graph...')
    mer.generate()

    print('normalizing meronomy graph...')
    mer.normalize_graph()

    print('generating synonyms...')
    mer.generate_synonyms()

    print('writing synonyms...')
    written = 0
    with progressbar.ProgressBar(max_value=len(mer.synonym_sets)) as bar:
        for written, syn_id in enumerate(mer.synonym_sets, 1):
            # Collect every edge mapped from an atom of this synonym set.
            members = set()
            for atom in mer.synonym_sets[syn_id]:
                if atom in mer.edge_map:
                    members |= mer.edge_map[atom]

            # First edge with the strictly highest count; stays None when
            # the set has no member edges.
            top_count = -1
            top_edge = None
            for edge in members:
                count = mer.edge_counts[edge]
                if count > top_count:
                    top_count, top_edge = count, edge

            label = hg.get_label(top_edge)
            syn_symbol = sym.build(label, 'syn%s' % syn_id)
            for edge in members:
                hg.add((cons.are_synonyms, edge, syn_symbol))
            label_symbol = sym.build(label, cons.label_namespace)
            hg.add((cons.has_label, syn_symbol, label_symbol))

            if written % 1000 == 0:
                bar.update(written)
        bar.update(written)

    print('%s synonym sets created' % len(mer.synonym_sets))
    print('done.')