def test_edge2str(self):
    """edge2str() renders tuples lisp-style, recursing into nested edges
    and using Python's repr for numeric atoms (7 -> '7', 7. -> '7.0')."""
    cases = (
        (('is', 'graphbrain/1', 'great/1'), '(is graphbrain/1 great/1)'),
        (('size', 'graphbrain/1', 7), '(size graphbrain/1 7)'),
        (('size', 'graphbrain/1', 7.), '(size graphbrain/1 7.0)'),
        (('size', 'graphbrain/1', -7.), '(size graphbrain/1 -7.0)'),
        (('src', 'graphbrain/1', ('is', 'graphbrain/1', 'great/1')),
         '(src graphbrain/1 (is graphbrain/1 great/1))'),
    )
    for edge, expected in cases:
        self.assertEqual(ed.edge2str(edge), expected)
def remove_raw(self, edge):
    """Auxiliary function for remove! to call from inside a transaction."""
    if not self.exists(edge):
        return
    # drop one degree from every vertex participating in the edge
    for vertex in edge:
        self.dec_degree(ed.edge2str(vertex))
    self.remove_edge_permutations(edge)
    self.remove_str(ed.edge2str(edge))
def add_raw(self, edge, timestamp):
    """Auxiliary function for add to call from inside a transaction."""
    if not self.exists(edge):
        # register every participating vertex, creating it on first sight
        for vertex in edge:
            vstr = ed.edge2str(vertex)
            if not self.inc_degree(vstr):
                self.add_str(vstr, 1, timestamp)
        self.add_str(ed.edge2str(edge), 0, timestamp)
        self.write_edge_permutations(edge)
    return edge
def star(self, center):
    """Return all the edges that contain a given entity.
    Entity can be atomic or an edge."""
    if isinstance(center, (list, tuple)):
        return self.str2perms(ed.edge2str(center))
    return self.str2perms(center)
def do_with_edge_permutations(edge, f):
    """Applies the function f to all permutations of the given edge.

    Each permutation is passed to f as '<space-joined items> <perm index>'.
    """
    total = math.factorial(len(edge))
    for index in range(total):
        parts = [ed.edge2str(item) for item in nthperm(edge, index)]
        f('%s %s' % (' '.join(parts), index))
def degree(self, vertex):
    """Returns the degree of a vertex (0 if the vertex is not stored).

    Fix: the original closed the cursor with cur.close() on the row-found
    path but with self.close_cursor(local=True, commit=False) on the empty
    path; cursor cleanup now goes through close_cursor on both paths so the
    backend's cursor bookkeeping is applied uniformly.
    """
    vert_str = ed.edge2str(vertex)
    cur = self.open_cursor()
    # self.ph is the backend's SQL placeholder token (e.g. '?' or '%s')
    cur.execute('SELECT degree FROM vertices WHERE id=%s' % (self.ph,),
                (vert_str,))
    row = cur.fetchone()
    self.close_cursor(cur, local=True, commit=False)
    return 0 if row is None else row[0]
def timestamp(self, vertex):
    """Returns the timestamp of a vertex (-1 if the vertex is not stored).

    Fix: the original closed the cursor with cur.close() on the row-found
    path but with self.close_cursor(local=True, commit=False) on the empty
    path; cursor cleanup now goes through close_cursor on both paths so the
    backend's cursor bookkeeping is applied uniformly.
    """
    vert_str = ed.edge2str(vertex)
    cur = self.open_cursor()
    # self.ph is the backend's SQL placeholder token (e.g. '?' or '%s')
    cur.execute('SELECT timestamp FROM vertices WHERE id=%s' % (self.ph,),
                (vert_str,))
    row = cur.fetchone()
    self.close_cursor(cur, local=True, commit=False)
    return -1 if row is None else row[0]
def edge_to_visual(hg, edge, depth):
    """Build the list of html fragments rendering the given hyperedge."""
    rels = edge[0]
    entities = edge[1:]
    # wrap a non-edge relation so it can be indexed uniformly below
    if sym.sym_type(rels) != sym.SymbolType.EDGE:
        rels = (rels,)
    visual_edge = []
    if len(entities) == 1 and len(rels) == 1:
        # simple binary case: relation first, then the single entity
        visual_edge.append(edge_html(hg, rels[0], show_degree=False,
                                     outer=False, rel=True, depth=depth + 1))
        visual_edge.append(edge_html(hg, entities[0], show_degree=False,
                                     outer=False, rel=False, depth=depth + 1))
    else:
        # interleave entity / relation fragments while relations remain
        for i, entity in enumerate(entities):
            visual_edge.append(edge_html(hg, entity, show_degree=False,
                                         outer=False, rel=False,
                                         depth=depth + 1))
            if i < len(rels):
                visual_edge.append(edge_html(hg, rels[i], show_degree=False,
                                             outer=False, rel=True,
                                             depth=depth + 1))
    if depth > 0:
        # nested edges get a zoom-in link to their own vertex page
        glyph = ('<span class="glyphicon glyphicon-zoom-in zoom-in"'
                 ' aria-hidden="true" />')
        zoom_in_html = '<a href="/vertex?id=%s">%s</a>' % (
            urllib.parse.quote_plus(ed.edge2str(edge)), glyph)
        visual_edge.append(zoom_in_html)
    return visual_edge
def generate_synonyms(self, entity_id):
    """Emit synonym edges for an entity, depth-first over its children."""
    entity = self.output.tree.get(entity_id)
    # process children first
    if entity.is_node():
        for child_id in entity.children_ids:
            self.generate_synonyms(child_id)
    edge = entity.to_hyperedge()
    synonym = entity.to_synonym()
    if synonym:
        self.output.edges.append([cons.are_synonyms, edge, synonym])
    # connector-headed nodes additionally get a text-derived symbol synonym,
    # namespaced by a hash of the edge itself
    if entity.is_node() and entity.children()[0].is_connector():
        text = entity.as_text()
        ns = 'gb%s' % sym.hashed(ed.edge2str(edge))
        symbol = sym.build(text, ns)
        self.output.edges.append([cons.are_synonyms, edge, symbol])
def add_edges(self, edge):
    """Count occurrences of edges/symbols (namespace-free), recursing
    into sub-edges; uninteresting symbols are skipped."""
    if sym.is_edge(edge):
        for sub in edge:
            self.add_edges(sub)
    edge_str = ed.edge2str(edge, namespaces=False)
    if not sym.is_edge(edge):
        # strip a leading '+' builder marker
        if edge_str[0] == '+':
            edge_str = edge_str[1:]
        if len(edge_str) == 0:
            return
        if not edge_str[0].isalnum():
            return
        # very common words carry little information
        if self.parser.make_word(edge_str).prob > MAX_PROB:
            return
    self.edge_counts[edge_str] = self.edge_counts.get(edge_str, 0) + 1
def edge_html(hg, edge, show_degree=False, outer=True, rel=False, depth=0):
    """Render a symbol or hyperedge as an html fragment."""
    # plain symbols delegate to symbol_html
    if sym.sym_type(edge) != sym.SymbolType.EDGE:
        return symbol_html(edge, rel)
    inner = ' '.join(edge_to_visual(hg, edge, depth))
    html_edge = '<div class="hyperedge %s">%s</div>' % (
        'depth%s' % str(depth), inner)
    if outer:
        extra_html = ''
        if show_degree:
            extra_html = '<span class="badge">%s</span>' % hg.degree(edge)
        glyph = ('<span class="glyphicon glyphicon-zoom-out zoom-out"'
                 ' aria-hidden="true" />')
        html_outer = '<a href="/vertex?id=%s">%s</a>' % (
            urllib.parse.quote_plus(ed.edge2str(edge)), glyph)
        html_edge = '<div class="outer-hyperedge">%s%s%s</div>' % (
            html_edge, html_outer, extra_html)
    return html_edge
def edge2str(self, edge):
    """Namespace-free string for an edge; None for symbols that are
    empty, non-alphanumeric, or too common (prob >= MAX_PROB)."""
    s = ed.edge2str(edge, namespaces=False)
    if sym.is_edge(edge):
        return s
    # strip a leading '+' builder marker
    if s[0] == '+':
        s = s[1:]
    if len(s) == 0:
        return None
    if not s[0].isalnum():
        return None
    if self.parser.make_word(s).prob < MAX_PROB:
        return s
    return None
def print_atom_groups(self):
    """Print every atom group with more than 3 sentences, numbering the
    printed groups from 1."""
    printed = 0
    for key in self.atom_groups:
        group = self.atom_groups[key]
        sentences = group['sentences']
        if len(sentences) > 3:
            printed += 1
            print('ATOM_GROUP id: %s' % printed)
            print('Base concepts: %s' % group['label'])
            print('size: %s' % len(sentences))
            print('sentences:')
            for sentence in sentences:
                print('* %s' % sentence)
            print('edges:')
            for edge in group['edges']:
                print('* %s' % ed.edge2str(
                    ed.without_namespaces(ed.str2edge(edge))))
            print()
def read_text(self, text, aux_text=None, reset_context=True):
    """Parse text into hyperedges, lazily building the parser and
    disambiguator on first use; returns (nlp_parse, read_result) pairs."""
    if self.parser is None:
        self.debug_msg('creating parser...')
        self.parser = Parser()
        self.disamb = Disambiguation(self.hg, self.parser)
    nlp_parses = self.parser.parse_text(text)
    if reset_context:
        # context is the text itself, optionally extended with aux_text
        self.aux_text = text
        if aux_text:
            self.aux_text = '%s\n%s' % (text, aux_text)
    parses = [(p[0], self.read_sentence(Sentence(p[1])))
              for p in nlp_parses]
    for parse in parses:
        self.debug_msg('== extra ==')
        for edg in parse[1].edges:
            self.debug_msg(ed.edge2str(edg))
    return parses
def test_all_metrics(self):
    """Exercise degree bookkeeping over add/destroy via all_attributes()."""
    self.hg.destroy()
    self.hg.add(('size', 'graphbrain/1', -7.0))
    self.hg.add(('is', 'graphbrain/1', 'great/1'))
    self.hg.add(('src', 'mary/1', ('is', 'graphbrain/1', 'great/1')))
    # '<vertex string> <degree>' label per stored vertex/edge
    labels = set([
        '%s %s' % (ed.edge2str(t[0]), t[1]['d'])
        for t in self.hg.all_attributes()
    ])
    # NOTE(review): asserting NOT-equal against such a carefully built
    # expected set looks suspicious — possibly meant assertEqual, or the
    # expected degrees are deliberately wrong; confirm intent before changing.
    self.assertNotEqual(
        labels, {
            'size 1', 'graphbrain/1 2', '-7.0 1', 'is 1', 'great/1 1',
            'src 1', 'mary/1 1', '(size graphbrain/1 -7.0) 0',
            '(is graphbrain/1 great/1) 1',
            '(src mary/1 (is graphbrain/1 great/1)) 0'
        })
    self.hg.destroy()
    # after destroy the hypergraph reports no attributes at all
    labels = set(self.hg.all_attributes())
    self.assertEqual(labels, set())
def similar_edges(self, targ_edge):
    """Rank hypergraph edges by enriched-edge similarity to targ_edge,
    keeping only those at or above self.sim_threshold."""
    edges = self.hg.all()
    targ_eedge = enrich_edge(self.parser, targ_edge)
    sims = {}
    for edge in edges:
        if edge == targ_edge or exclude(edge):
            continue
        eedge = enrich_edge(self.parser, edge)
        total_sim = simil.eedge_similarity(targ_eedge, eedge)
        if total_sim >= self.sim_threshold:
            sims[ed.edge2str(edge)] = total_sim
    ranked = sorted(sims.items(), key=operator.itemgetter(1), reverse=True)
    return [{'edge': edge_str,
             'sim': sim,
             'text': self.hg.get_str_attribute(ed.str2edge(edge_str), 'text')}
            for edge_str, sim in ranked]
def edges_with_similar_concepts(self, targ_edge):
    """Rank edges whose concepts all match targ_edge's, sorted by
    (worst_sim, total_sim) descending; only complete matches at or above
    self.sim_threshold are kept."""
    edges = self.hg.all()
    targ_eedge = enrich_edge(self.parser, targ_edge)
    sims = {}
    for edge in edges:
        if edge == targ_edge or exclude(edge):
            continue
        eedge = enrich_edge(self.parser, edge)
        total_sim, worst_sim, complete, matches = \
            simil.edge_concepts_similarity(targ_eedge, eedge)
        if complete and worst_sim >= self.sim_threshold:
            sims[ed.edge2str(edge)] = (worst_sim, total_sim, matches)
    ranked = sorted(sims.items(), key=operator.itemgetter(1), reverse=True)
    return [{'edge': edge_str,
             'worst_sim': data[0],
             'sim': data[1],
             'matches': data[2],
             'text': self.hg.get_str_attribute(ed.str2edge(edge_str), 'text')}
            for edge_str, data in ranked]
def generate_atom_groups(self): nsyns = len(self.synonym_sets) # build coocurrence sparse matrix synonym_cooc = sps.lil_matrix((nsyns, nsyns)) for edge in extra_edges: co_synonyms = self.find_co_synonyms(edge) if len(co_synonyms) > 1: for pair in itertools.combinations(co_synonyms, 2): synonym_cooc[pair[0], pair[1]] += 1 synonym_cooc[pair[1], pair[0]] += 1 # normalize matrix synonym_cooc = normalize(synonym_cooc, norm='l1', axis=1, copy=False) # iterate matrix, build graph gedges = [] weights = [] cx = synonym_cooc.tocoo() for i, j, v in zip(cx.row, cx.col, cx.data): gedges.append((i, j)) weights.append(v) g = igraph.Graph() g.add_vertices(nsyns) g.add_edges(gedges) g.es['weight'] = weights # community detection comms = igraph.Graph.community_multilevel(g, weights='weight', return_levels=False) # build atom_groups self.atom_groups = {} for i in range(len(comms)): comm = comms[i] count = 0 syns = [] sentences = set() edges = [] for item in comm: edges += self.synonym_map[item]['edges'] for atom in self.synonym_map[item]['edges']: for edat in edge_data: if ed.contains(ed.str2edge( ed.edge2str(ed.str2edge(edat['edge']), namespaces=False)), ed.str2edge(atom), deep=True): if edat['text']: sentences.add(edat['text']) syns.append(self.synonym_map[item]) count += self.synonym_map[item]['count'] label = ', '.join(edges) atom_group = { 'label': label, 'syns': syns, 'count': count, 'sentences': sentences, 'edges': edges } self.atom_groups[i] = atom_group
def vertex2key(vertex):
    """Storage key for a vertex: 'v' prefix + its string form, utf-8 encoded."""
    return f'v{ed.edge2str(vertex)}'.encode('utf-8')
from gb.explore.similarity import edge_similarity if __name__ == '__main__': print('creating parser...') par = par.Parser() print('parser created.') edge_data = json_tools.read('edges_similar_concepts.json') extra_edges = {} for item in edge_data: edge = ed.str2edge(item['edge']) matched = [ed.str2edge(match[1]) for match in item['matches']] for part in edge[1:]: if part not in matched: key = ed.edge2str(part) if key in extra_edges: extra_edges[key] += 1 else: extra_edges[key] = 1 sorted_edges = sorted(extra_edges.items(), key=operator.itemgetter(1), reverse=False) print(sorted_edges) print(len(sorted_edges)) print('creating distance matrix...') size = len(sorted_edges) dists = np.zeros((size, size)) for i in range(size):
def write_edge_permutation(self, perm):
    """Store a marker value under the 'p'-prefixed permutation key."""
    key = ('p%s' % ed.edge2str(perm)).encode('utf-8')
    self.db.put(key, b'x')
def remove_edge_permutation(self, perm):
    """Delete the 'p'-prefixed permutation key for this permutation."""
    key = ('p%s' % ed.edge2str(perm)).encode('utf-8')
    self.db.delete(key)
def timestamp(self, vertex):
    """Returns the timestamp of a vertex."""
    # lazy %-args: the message is only formatted if DEBUG is enabled
    logging.debug('[hypergraph timestamp()] %s', ed.edge2str(vertex))
    return self.backend.timestamp(vertex)
def get_float_attribute(self, vertex, attribute, or_else=None):
    """Returns attribute as float value, or or_else when absent."""
    logging.debug(
        '[hypergraph get_float_attribute()] %s attribute: %s; or_else: %s',
        ed.edge2str(vertex), attribute, or_else)
    return self.backend.get_float_attribute(vertex, attribute, or_else)
def write_edge_permutation(self, perm):
    """Insert (or refresh) the perms row keyed by the edge string."""
    perm_id = ed.edge2str(perm)
    self.update_or_insert('perms', {'id': perm_id}, perm_id)
def f(x):
    """Format a vertex record as '<vertex string> <degree>'."""
    return '{} {}'.format(ed.edge2str(x['vertex']), x['degree'])
def edges_with_symbols(self, symbols, root=None):
    """Find all edges containing the given symbols, and optionally a given
    root."""
    # NOTE(review): root defaults to None and is still passed through
    # ed.edge2str — assumes edge2str tolerates None; confirm.
    logging.debug('[hypergraph edges_with_symbols()] %s root: %s',
                  symbols, ed.edge2str(root))
    return self.backend.edges_with_symbols(symbols, root)
def pattern2edges(self, pattern):
    """Return all the edges that match a pattern.
    A pattern is a collection of entity ids and wildcards (None)."""
    logging.debug('[hypergraph pattern2edges()] %s', ed.edge2str(pattern))
    return self.backend.pattern2edges(pattern)
def exists(self, vertex):
    """Checks if the given edge exists in the hypergraph."""
    logging.debug('[hypergraph exists()] %s', ed.edge2str(vertex))
    return self.backend.exists(vertex)
def dec_attribute(self, vertex, attribute):
    """Decrements an attribute of a vertex.

    Fix: the docstring previously said "Increments", contradicting both the
    method name and the backend call it delegates to.
    """
    logging.debug('[hypergraph dec_attribute()] %s attribute: %s',
                  ed.edge2str(vertex), attribute)
    return self.backend.dec_attribute(vertex, attribute)
def remove_edge_permutation(self, perm):
    """Delete the perms row for the given edge permutation and commit."""
    perm_id = ed.edge2str(perm)
    cursor = self.conn.cursor()
    # self.ph is the backend's SQL placeholder token (e.g. '?' or '%s')
    cursor.execute('DELETE FROM perms WHERE id=%s' % (self.ph,), (perm_id,))
    self.conn.commit()
    cursor.close()
def degree(self, vertex):
    """Returns the degree of a vertex."""
    logging.debug('[hypergraph degree()] %s', ed.edge2str(vertex))
    return self.backend.degree(vertex)
def remove_by_pattern(self, pattern):
    """Removes from the hypergraph all edges that match the pattern."""
    logging.debug('[hypergraph remove_by_pattern()] %s',
                  ed.edge2str(pattern))
    for edge in self.pattern2edges(pattern):
        self.remove(edge)
def sources(self, edge):
    """Set of sources (nodes) that support a statement (edge)."""
    logging.debug('[hypergraph sources()] %s', ed.edge2str(edge))
    # match (source, edge, *) and collect the third element of each hit;
    # comprehension variable renamed so it no longer shadows the parameter
    matches = self.pattern2edges((const.source, edge, None))
    return {match[2] for match in matches}
def symbols_with_root(self, root):
    """Find all symbols with the given root (empty result for empty root)."""
    logging.debug('[hypergraph symbols_with_root()] %s', ed.edge2str(root))
    if len(root) == 0:
        return {}
    return self.backend.symbols_with_root(root)
def remove(self, edge):
    """Removes an edge from the hypergraph (non-edge arguments are ignored)."""
    logging.debug('[hypergraph remove()] %s', ed.edge2str(edge))
    if isinstance(edge, (list, tuple)):
        self.backend.remove(edge)
def edge2str(edge):
    """Namespace-free edge string, transliterated to ascii with all
    periods removed."""
    ascii_str = unidecode(ed.edge2str(edge, namespaces=False))
    return ascii_str.replace('.', '')
def star(self, center, limit=None):
    """Return all the edges that contain a given entity.
    Entity can be atomic or an edge."""
    logging.debug('[hypergraph star()] %s', ed.edge2str(center))
    return self.backend.star(center, limit=limit)
# Greedy-cover report: filter full_edges down to those touching all three
# concept sets, then walk synonym sets in PageRank order, printing how many
# edges each covers and the cumulative coverage percentage.
# NOTE(review): concepts1/2/3, full_edges, par, synset1, synset2 and
# contains_all_concept_sets come from earlier in this script (outside this
# chunk) — confirm before relocating this code.
concept_sets = [concepts1, concepts2, concepts3]

# filter edges
print('before filter: %s' % len(full_edges))
full_edges = [edge for edge in full_edges
              if contains_all_concept_sets(edge, concept_sets)]
print('after filter: %s' % len(full_edges))

# build graph
g = Graph(par, full_edges, black_list=synset1+synset2)

pr_pairs = g.synset_pr_pairs()
remaining_edges = full_edges[:]
covered = set()
# take the 50 highest-ranked synonym sets
for pr_pair in pr_pairs[:50]:
    syn_id = int(pr_pair[0])
    pr = pr_pair[1]
    count = 0
    new_remaining_edges = []
    for full_edge in remaining_edges:
        if g.contains_synonym(full_edge, syn_id):
            count += 1
            covered.add(ed.edge2str(full_edge, namespaces=False))
        else:
            new_remaining_edges.append(full_edge)
    remaining_edges = new_remaining_edges
    if count > 0:
        # label [edges covered by this set]{cumulative covered} percent pagerank
        print('%s [%s]{%s} %.2f%% %s' % (g.meronomy.synonym_label(syn_id),
                                         count, len(covered),
                                         (float(len(covered)) /
                                          float(len(full_edges))) * 100.,
                                         pr))
def set_attribute(self, vertex, attribute, value):
    """Sets the value of an attribute."""
    logging.debug('[hypergraph set_attribute()] %s %s=%s',
                  ed.edge2str(vertex), attribute, value)
    return self.backend.set_attribute(vertex, attribute, value)
def f(x):
    """Format a vertex record as '<vertex string> <degree>'."""
    return f"{ed.edge2str(x['vertex'])} {x['degree']}"