def process_entity(self, entity):
    """Map an RDF entity URI to a hypergraph symbol.

    Returns a relation constant for well-known predicates, None when the
    entity cannot (or should not) be converted, otherwise a symbol built
    from the URI's local name and namespace.
    """
    # well-known predicates map directly to relation constants
    if entity == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>':
        return const.is_type_of
    elif entity == '<http://www.w3.org/2000/01/rdf-schema#seeAlso>':
        return const.are_related
    elif entity == '<http://www.w3.org/2002/07/owl#Thing>':
        return None

    # split the URI into (..., namespace, name) on '#' or '/'
    if '#' in entity:
        parts = entity.split('#')
    else:
        parts = entity.split('/')
    if len(parts) < 2:
        return None
    namespace = parts[-2]
    name = parts[-1]
    # FIX: also reject an empty name (URI ending in '/' or '#'); the
    # original only checked the namespace and then crashed on name[-1].
    if len(namespace) == 0 or len(name) == 0:
        return None
    # strip the angle-bracket delimiters that wrap full URIs
    if namespace[0] == '<':
        namespace = namespace[1:]
    if name[-1] == '>':
        name = name[:-1]
    # special namespaces get dedicated post-processing
    if namespace == 'ontology':
        name, namespace = process_ontology(name)
    elif namespace == 'resource':
        name, namespace = self.process_resource(name)
    if name is None:
        return None
    name = name.lower()
    return sym.build(name, namespace)
def process_comments(self, post):
    """Feed a comment's body into text processing, then recurse into replies."""
    if 'body' in post:
        commenter = sym.build(post['author'], 'reddit_user')
        self.process_text(post['body'], commenter, reset_context=False)
    if 'comments' in post:
        for reply in post['comments']:
            if reply:
                self.process_comments(reply)
def to_hyperedge(self, with_namespaces=True):
    """Render this leaf as a symbol, '+'-prefixed when it is a connector."""
    if with_namespaces:
        symbol = sym.build(self.token.word, self.namespace)
    else:
        symbol = sym.str2symbol(self.token.word)
    if self.connector:
        return '+%s' % symbol
    return symbol
def process_post(self, post):
    """Process a web post's text, attributed to its web-entity symbol."""
    entity_symbol = sym.build(post['web_entity'], 'web_entity')
    print('web_entity: %s' % entity_symbol)
    body = post['text'].strip()
    # nothing to do for empty posts
    if not body:
        return
    # make sure the text ends in sentence-terminating punctuation
    if body[-1].isalnum():
        body = '%s.' % body
    self.process_text(body, entity_symbol)
def process_post(self, post):
    """Process a reddit post's title text, attributed to its author.

    Also recurses into the post's comments when comment processing is
    enabled on this instance.
    """
    author = sym.build(post['author'], 'reddit_user')
    print('author: %s' % author)
    text = post['title'].strip()
    # FIX: guard against empty titles (text[-1] raised IndexError);
    # mirrors the empty-text check used by the web-entity processor.
    if len(text) != 0:
        # make sure the text ends in sentence-terminating punctuation
        if text[-1].isalnum():
            text += '.'
        self.process_text(text, author, reset_context=True, aux_text='')
    # comments are processed regardless of whether the title was empty
    if self.comments:
        self.process_comments(post)
def generate_labels(self, entity_id):
    """Emit a has_label edge for the entity and, recursively, its children.

    Children are labeled first so that label edges exist bottom-up.
    """
    entity = self.output.tree.get(entity_id)
    if entity.is_node():
        # process children first (direct iteration instead of index loop)
        for child_id in entity.children_ids:
            self.generate_labels(child_id)
    edge = entity.to_hyperedge()
    text = entity.as_text()
    label = sym.build(text, cons.label_namespace)
    syn_edge = [cons.has_label, edge, label]
    self.output.edges.append(syn_edge)
def process_entity(self, entity_id):
    """Generate namespaces for an entity subtree and record lemma edges."""
    entity = self.output.tree.get(entity_id)
    entity.generate_namespace()
    if not entity.is_leaf():
        # interior node: recurse into every child
        for child_id in entity.children_ids:
            self.process_entity(child_id)
        return
    word = entity.token.word.lower()
    lemma = entity.token.lemma.lower()
    # only record a lemma edge when word and lemma actually differ
    if word != lemma:
        lemma_symbol = sym.build(lemma, entity.namespace)
        self.output.edges.append(
            (const.have_same_lemma, entity.to_hyperedge(), lemma_symbol))
def to_hyperedge(self, with_namespaces=True):
    """Render this node as a symbol (compound) or a tuple of child edges."""
    if not self.compound:
        # non-compound: recurse into children and return a tuple edge
        return tuple(child.to_hyperedge(with_namespaces=with_namespaces)
                     for child in self.children())
    joined = '_'.join(
        leaf.token.word for leaf in self.natural_leaf_sequence())
    if not with_namespaces:
        return sym.str2symbol(joined)
    # lazily generate the namespace the first time it is needed
    if not self.namespace:
        self.generate_namespace()
    return sym.build(joined, self.namespace)
def generate_synonyms(self, entity_id):
    """Emit are_synonyms edges for an entity and, recursively, its children.

    Children are processed first so that synonym edges exist bottom-up.
    Connector-led nodes additionally receive a synonym symbol in a
    hashed namespace derived from the edge itself.
    """
    entity = self.output.tree.get(entity_id)
    if entity.is_node():
        # process children first (direct iteration instead of index loop)
        for child_id in entity.children_ids:
            self.generate_synonyms(child_id)
    edge = entity.to_hyperedge()
    synonym = entity.to_synonym()
    if synonym:
        self.output.edges.append([cons.are_synonyms, edge, synonym])
    if entity.is_node() and entity.children()[0].is_connector():
        text = entity.as_text()
        # namespace is derived from a hash of the edge's string form
        ns = 'gb%s' % sym.hashed(ed.edge2str(edge))
        symbol = sym.build(text, ns)
        syn_edge = [cons.are_synonyms, edge, symbol]
        self.output.edges.append(syn_edge)
def process_entity(self, entity):
    """Map a WordNet/DBpedia entity URI to a hypergraph symbol.

    Returns const.is_type_of for the wordnet_type predicate, None when
    the URI cannot be parsed, otherwise a symbol built from the URI's
    local name and namespace.
    """
    if entity == '<http://dbpedia.org/property/wordnet_type>':
        return const.is_type_of

    # split the URI into (..., namespace, name) on '#' or '/'
    if '#' in entity:
        parts = entity.split('#')
    else:
        parts = entity.split('/')
    # FIX: the parallel processor guards len(parts) < 2; without it,
    # parts[-2] raises IndexError on URIs with no separator.
    if len(parts) < 2:
        return None
    namespace = parts[-2]
    name = parts[-1]
    # FIX: also reject an empty name, otherwise name[-1] raises.
    if len(namespace) == 0 or len(name) == 0:
        return None
    # strip the angle-bracket delimiters that wrap full URIs
    if namespace[0] == '<':
        namespace = namespace[1:]
    if name[-1] == '>':
        name = name[:-1]
    # special namespaces get dedicated post-processing
    if namespace == 'resource':
        name, namespace = self.process_resource(name)
    elif namespace == 'instances':
        name, namespace = process_wordnet_instance(name)
    if name is None:
        return None
    name = name.lower()
    return sym.build(name, namespace)
def test_build(self):
    """sym.build joins a name and a namespace with '/'."""
    result = sym.build('graphbrain', '1')
    self.assertEqual(result, 'graphbrain/1')
def test_build(self):
    """sym.build joins a list of parts with '/'."""
    parts = ['graphbrain', '1']
    self.assertEqual(sym.build(parts), 'graphbrain/1')
def lemma2symbol(lemma):
    """Build a symbol for a WordNet lemma, namespaced by its synset name."""
    namespace = 'wn.%s' % lemma.synset().name()
    return sym.build(lemma.name().lower(), namespace)
def to_symbol(wikidata_id, label):
    """Build a symbol for a Wikidata entity, namespaced by its id."""
    return sym.build(label, 'wd%s' % wikidata_id)
def generate(hg):
    """Build a meronomy over the hypergraph's belief edges and write the
    resulting synonym sets back as are_synonyms / has_label edges.

    Side effects: prints progress, shows progress bars, and adds edges
    to `hg`. Returns None.
    """
    print('starting parser...')
    parser = par.Parser()
    mer = Meronomy(hg, parser)
    print('reading edges...')
    total_edges = 0
    total_beliefs = 0
    # vertex count only sizes the progress bars; iteration covers all vertices
    total_verts = hg.symbol_count() + hg.edge_count()
    i = 0
    # pass 1: feed every belief edge into the meronomy
    with progressbar.ProgressBar(max_value=total_verts) as bar:
        for vertex in hg.all():
            if sym.is_edge(vertex):
                edge = vertex
                total_edges += 1
                if hg.is_belief(edge):
                    mer.add_edge(edge)
                    total_beliefs += 1
            i += 1
            if (i % 1000) == 0:
                bar.update(i)
    print('edges: %s; beliefs: %s' % (total_edges, total_beliefs))
    print('post assignments...')
    i = 0
    # pass 2: run post-assignments on the same belief edges
    with progressbar.ProgressBar(max_value=total_verts) as bar:
        for vertex in hg.all():
            if sym.is_edge(vertex):
                edge = vertex
                if hg.is_belief(edge):
                    mer.post_assignments(edge)
            i += 1
            if (i % 1000) == 0:
                bar.update(i)
    print('generating meronomy graph...')
    mer.generate()
    print('normalizing meronomy graph...')
    mer.normalize_graph()
    print('generating synonyms...')
    mer.generate_synonyms()
    print('writing synonyms...')
    i = 0
    with progressbar.ProgressBar(max_value=len(mer.synonym_sets)) as bar:
        for syn_id in mer.synonym_sets:
            # collect every edge touched by any atom in this synonym set
            edges = set()
            for atom in mer.synonym_sets[syn_id]:
                if atom in mer.edge_map:
                    edges |= mer.edge_map[atom]
            # use the most frequent edge's label as the set's label
            best_count = -1
            best_label_edge = None
            for edge in edges:
                if mer.edge_counts[edge] > best_count:
                    best_count = mer.edge_counts[edge]
                    best_label_edge = edge
            label = hg.get_label(best_label_edge)
            syn_symbol = sym.build(label, 'syn%s' % syn_id)
            # link every member edge to the synonym-set symbol
            for edge in edges:
                syn_edge = (cons.are_synonyms, edge, syn_symbol)
                hg.add(syn_edge)
            # attach the chosen label to the synonym-set symbol
            label_symbol = sym.build(label, cons.label_namespace)
            label_edge = (cons.has_label, syn_symbol, label_symbol)
            hg.add(label_edge)
            i += 1
            if i % 1000 == 0:
                bar.update(i)
        # final update so the bar reflects the true count
        bar.update(i)
    print('%s synonym sets created' % len(mer.synonym_sets))
    print('done.')