def add_edge(self, edge_ns):
    """Register a namespaced edge and, recursively, each of its elements.

    The edge is recorded under its string form (``self.edge2str``) in
    ``self.edge_map``, ``self.vertices`` and ``self.atoms`` (which stores
    the depth of the namespace-free edge). When ``edge_ns`` is itself an
    edge, every element with a non-empty string form is added recursively;
    for each element that was accepted, a link from this edge to the
    element is added and the element's entry in ``self.edge_counts`` is
    incremented.

    Args:
        edge_ns: An atom or an edge, with namespaces.

    Returns:
        False if ``edge_ns`` is an atom whose word probability exceeds
        ``MAX_PROB`` (nothing is recorded in that case), True otherwise.
    """
    is_edge = sym.is_edge(edge_ns)
    edge = ed.without_namespaces(edge_ns)

    # discard common words
    if not is_edge and self.parser.make_word(edge).prob > MAX_PROB:
        return False

    orig = self.edge2str(edge_ns)

    # add to edge_map: several namespaced variants may share one string form
    self.edge_map.setdefault(orig, set()).add(edge_ns)

    self.vertices.add(orig)
    self.atoms[orig] = ed.depth(edge)

    if is_edge:
        for e_ns in edge_ns:
            targ = self.edge2str(e_ns)
            # The recursive call runs first so that rejected elements
            # (it returns False) get neither a link nor a count.
            if targ and self.add_edge(e_ns):
                # NOTE(review): these two updates used to be guarded by
                # is_concept(...) checks that are now disabled; both are
                # applied unconditionally.
                self.add_link(orig, targ)
                self.edge_counts[e_ns] += 1
    return True
def test_without_namespaces(self):
    """ed.without_namespaces handles atoms, flat edges and nested edges."""
    cases = (
        # a single atom
        ('graphbrain/1', 'graphbrain'),
        # a flat edge
        (('is', 'graphbrain/1', 'great/1'),
         ('is', 'graphbrain', 'great')),
        # an edge containing a nested edge
        (('is', 'graphbrain/1', ('super', 'great/1')),
         ('is', 'graphbrain', ('super', 'great'))),
    )
    for namespaced, expected in cases:
        self.assertEqual(ed.without_namespaces(namespaced), expected)
def write_edge_data(edge_data, file_path):
    """Write a plain-text dump of ``edge_data`` to ``file_path``.

    For every entry three lines are written, in this order: the entry's
    ``'sim'`` value, its ``'text'`` value, and its ``'edge'`` string
    re-rendered with namespaces removed.

    Args:
        edge_data: Iterable of dicts with ``'sim'``, ``'text'`` and
            ``'edge'`` keys; ``'edge'`` must be parseable by
            ``ed.str2edge``.
        file_path: Destination path; an existing file is overwritten.
    """
    # The context manager guarantees the file is closed (and flushed) even
    # if an entry is malformed and a write raises part-way through.
    with open(file_path, 'w') as f:
        for e in edge_data:
            f.write('%s\n' % str(e['sim']))
            f.write('%s\n' % e['text'])
            plain_edge = ed.without_namespaces(ed.str2edge(e['edge']))
            f.write('%s\n' % ed.edge2str(plain_edge))
def get_concepts(self, edge):
    """Return the set of main synonyms found in ``edge``.

    For an edge, the result holds the main synonym of the edge itself plus
    the concepts of every element after the first, gathered recursively.
    For an atom, the result is empty when the atom's word probability
    exceeds ``MAX_PROB`` or when the atom starts with one of the
    characters `` ` ``, ``_`` or ``'``; otherwise it holds the atom's main
    synonym.
    """
    if not sym.is_edge(edge):
        # The word lookup uses the ASCII-folded, namespace-free atom.
        plain_atom = unidecode(ed.without_namespaces(edge))
        word = self.parser.make_word(plain_atom)
        if word.prob > MAX_PROB or edge[0] in {'`', '_', "'"}:
            return set()
        return {syn.main_synonym(self.hg, edge)}

    found = {syn.main_synonym(self.hg, edge)}
    # The first element is skipped (presumably the connector -- confirm).
    for child in edge[1:]:
        found |= self.get_concepts(child)
    return found
def edges_with_term(self):
    """Return data for every hypergraph edge that contains ``self.term``.

    All edges from ``self.hg.all()`` are scanned; those rejected by
    ``exclude`` are skipped, and the rest are kept when their
    namespace-free form contains ``self.term``. Each kept edge is printed
    as it is found.

    Returns:
        A list of dicts with keys ``'edge'`` (the edge as a string) and
        ``'text'`` (the edge's ``'text'`` string attribute).
    """
    matching = []
    for candidate in self.hg.all():
        if exclude(candidate):
            continue
        if not ed.contains(ed.without_namespaces(candidate), self.term):
            continue
        print(candidate)
        matching.append(candidate)

    # Attributes are fetched only after the scan above has finished.
    return [
        {'edge': ed.edge2str(hit),
         'text': self.hg.get_str_attribute(hit, 'text')}
        for hit in matching
    ]
def print_atom_groups(self):
    """Print a report of every atom group that has more than 3 sentences.

    Each reported group gets a running id (starting at 1 and counting only
    reported groups), its label, its size, its sentences, and its edges
    rendered without namespaces, followed by a blank line.
    """
    shown = 0
    for group in self.atom_groups.values():
        sentences = group['sentences']
        size = len(sentences)
        if size <= 3:
            continue

        shown += 1
        print('ATOM_GROUP id: %s' % shown)
        print('Base concepts: %s' % group['label'])
        print('size: %s' % size)

        print('sentences:')
        for sentence in sentences:
            print('* %s' % sentence)

        print('edges:')
        for edge_str in group['edges']:
            plain_edge = ed.without_namespaces(ed.str2edge(edge_str))
            print('* %s' % ed.edge2str(plain_edge))
        print()
def generate_atom_group_clusters(self, edges):
    """Cluster atom groups by co-occurrence and store the result.

    Steps:
      1. Count, over all ``edges`` (namespaces removed), how often each
         pair returned by ``self.find_co_atom_groups`` occurs, in a
         symmetric sparse matrix.
      2. L1-normalize each row of the matrix.
      3. Build a weighted igraph graph with one vertex per atom group and
         one edge per stored matrix entry.
      4. Run multilevel community detection on that graph.
      5. Store ``{community index: {'label': ...}}`` in
         ``self.atom_group_clusters``, where the label joins the
         ``[label]{count}`` of every member atom group with `` + ``.

    Args:
        edges: Iterable of edges (with namespaces).
    """
    group_count = len(self.atom_groups)

    # 1. co-occurrence counts (updated in both directions)
    cooc = sps.lil_matrix((group_count, group_count))
    for raw_edge in edges:
        plain_edge = ed.without_namespaces(raw_edge)
        for pair in self.find_co_atom_groups(plain_edge):
            first, second = pair[0], pair[1]
            cooc[first, second] += 1
            cooc[second, first] += 1

    # 2. row-wise L1 normalization
    cooc = normalize(cooc, norm='l1', axis=1, copy=False)

    # 3. one weighted graph edge per stored matrix entry
    coo = cooc.tocoo()
    graph_edges = list(zip(coo.row, coo.col))
    edge_weights = list(coo.data)

    graph = igraph.Graph()
    graph.add_vertices(group_count)
    graph.add_edges(graph_edges)
    graph.es['weight'] = edge_weights

    # 4. community detection
    communities = graph.community_multilevel(weights='weight',
                                             return_levels=False)

    # 5. one labelled cluster per community
    self.atom_group_clusters = {}
    for index, members in enumerate(communities):
        parts = ['[%s]{%s}' % (self.atom_groups[member]['label'],
                               self.atom_groups[member]['count'])
                 for member in members]
        self.atom_group_clusters[index] = {'label': ' + '.join(parts)}
# NOTE: an earlier, now-disabled variant built extra_edges only from the
# parts (e[1:]) of each edge that were not among its parsed it['matches']
# entries, and collected the complete edges separately in full_edges.
edge_data = json_tools.read('all.json')

# build full edges list: every stored edge, parsed and without namespaces
extra_edges = [
    ed.without_namespaces(ed.str2edge(entry['edge'])) for entry in edge_data
]
# both names refer to the same list object
full_edges = extra_edges

ag = AtomGroups(par)

print('set edges')
ag.set_edges(extra_edges)

print('generate_atoms')
ag.generate_atoms()

print('generate synonyms')
ag.generate_synonyms()

print('generate atom groups')
ag.generate_atom_groups()
ag.print_atom_groups()

print('generate atom group clusters')
ag.generate_atom_group_clusters(full_edges)
ag.print_atom_group_clusters()