예제 #1
0
    def add_edge(self, edge_ns):
        """Register an edge and, recursively, every element it contains.

        The string form of ``edge_ns`` is recorded in ``self.edge_map``
        (mapping it to the set of namespaced edges that produced it), added
        to ``self.vertices``, and associated in ``self.atoms`` with the depth
        of the namespace-free edge.  When ``edge_ns`` is itself an edge, each
        of its elements with a non-empty string form is added recursively;
        every element that is accepted gets a link from this edge and has
        its entry in ``self.edge_counts`` incremented.

        Args:
            edge_ns: edge or single symbol, including namespaces.

        Returns:
            False when ``edge_ns`` is not an edge and the probability of its
            word exceeds ``MAX_PROB`` (nothing is recorded in that case),
            True otherwise.
        """
        is_edge = sym.is_edge(edge_ns)
        edge = ed.without_namespaces(edge_ns)

        # discard common words
        if not is_edge and self.parser.make_word(edge).prob > MAX_PROB:
            return False

        orig = self.edge2str(edge_ns)

        # several namespaced edges may share the same string form
        self.edge_map.setdefault(orig, set()).add(edge_ns)
        self.vertices.add(orig)
        self.atoms[orig] = ed.depth(edge)

        if not is_edge:
            return True

        for e_ns in edge_ns:
            targ = self.edge2str(e_ns)
            # elements with an empty string form are skipped entirely;
            # rejected elements get neither a link nor a count
            if targ and self.add_edge(e_ns):
                self.add_link(orig, targ)
                self.edge_counts[e_ns] += 1
        return True
예제 #2
0
 def test_without_namespaces(self):
     """Namespaces are stripped from a symbol, a flat edge and a nested edge."""
     cases = (
         ('graphbrain/1',
          'graphbrain'),
         (('is', 'graphbrain/1', 'great/1'),
          ('is', 'graphbrain', 'great')),
         (('is', 'graphbrain/1', ('super', 'great/1')),
          ('is', 'graphbrain', ('super', 'great'))),
     )
     for given, expected in cases:
         self.assertEqual(ed.without_namespaces(given), expected)
예제 #3
0
def write_edge_data(edge_data, file_path):
    """Write a three-line plain-text record for every item of ``edge_data``.

    For each item the following lines are written, in this order: the
    string form of ``item['sim']``, ``item['text']``, and the edge parsed
    from ``item['edge']`` with its namespaces removed and converted back to
    a string.

    Args:
        edge_data: iterable of mappings providing the keys ``'sim'``,
            ``'text'`` and ``'edge'``.
        file_path: path of the output file; it is opened in ``'w'`` mode, so
            any existing content is replaced.
    """
    # the context manager closes the file even when an item is malformed
    # and one of the lookups or conversions below raises
    with open(file_path, 'w') as f:
        for e in edge_data:
            f.write('%s\n' % str(e['sim']))
            f.write('%s\n' % e['text'])
            bare_edge = ed.without_namespaces(ed.str2edge(e['edge']))
            f.write('%s\n' % ed.edge2str(bare_edge))
예제 #4
0
 def get_concepts(self, edge):
     """Return the set of main synonyms found in ``edge``.

     For an edge, the result holds the main synonym of the edge itself plus
     the concepts of every element after the first one, collected
     recursively.  For a single symbol, the result is empty when the
     probability of its word exceeds ``MAX_PROB`` or when the symbol starts
     with a backtick, an underscore or an apostrophe; otherwise it holds
     the symbol's main synonym.
     """
     if not sym.is_edge(edge):
         bare = ed.without_namespaces(edge)
         word = self.parser.make_word(unidecode(bare))
         # same order as before: probability first, then the prefix check
         if word.prob > MAX_PROB or edge[0] in {'`', '_', "'"}:
             return set()
         return {syn.main_synonym(self.hg, edge)}

     found = {syn.main_synonym(self.hg, edge)}
     # slicing yields nothing for edges with a single element
     for item in edge[1:]:
         found |= self.get_concepts(item)
     return found
예제 #5
0
    def edges_with_term(self):
        """Return edge/text records for all stored edges containing the term.

        Every edge from ``self.hg.all()`` that is not rejected by
        ``exclude`` and whose namespace-free form contains ``self.term`` is
        printed and kept.  The result is a list of dicts with the keys
        ``'edge'`` (string form of the edge) and ``'text'`` (the edge's
        ``'text'`` string attribute).
        """
        # first pass: select (and print) the matching edges
        matching = []
        for candidate in self.hg.all():
            if exclude(candidate):
                continue
            if not ed.contains(ed.without_namespaces(candidate), self.term):
                continue
            print(candidate)
            matching.append(candidate)

        # second pass: look up the text only after the scan has finished
        return [{'edge': ed.edge2str(hit),
                 'text': self.hg.get_str_attribute(hit, 'text')}
                for hit in matching]
예제 #6
0
 def print_atom_groups(self):
     """Print a report for every atom group with more than three sentences.

     Reported groups are numbered consecutively from 1 in iteration order.
     For each one the label, the number of sentences, the sentences
     themselves and the namespace-free string form of each edge are
     printed, followed by an empty line.
     """
     shown = 0
     for group in self.atom_groups.values():
         sentence_count = len(group['sentences'])
         if sentence_count <= 3:
             continue

         shown += 1
         print('ATOM_GROUP id: %s' % shown)
         print('Base concepts: %s' % group['label'])
         print('size: %s' % sentence_count)

         print('sentences:')
         for sentence in group['sentences']:
             print('* %s' % sentence)

         print('edges:')
         for edge_str in group['edges']:
             bare = ed.without_namespaces(ed.str2edge(edge_str))
             print('* %s' % ed.edge2str(bare))
         print()
예제 #7
0
    def generate_atom_group_clusters(self, edges):
        """Cluster atom groups by how often they co-occur in ``edges``.

        A square co-occurrence matrix with one row per atom group is filled
        from the pairs that ``self.find_co_atom_groups`` reports for each
        namespace-free edge, then normalized with ``norm='l1'`` along
        axis 1.  Every stored entry of the matrix becomes a weighted edge of
        an igraph graph, multilevel community detection is run on it, and
        the outcome is stored in ``self.atom_group_clusters`` as
        ``{community index: {'label': ...}}``, where the label joins the
        ``[label]{count}`` of each member atom group with ``' + '``.

        Args:
            edges: iterable of edges (namespaces are stripped here).
        """
        # build atom_group co-occurrence sparse matrix
        group_count = len(self.atom_groups)
        cooc = sps.lil_matrix((group_count, group_count))
        for edge_ns in edges:
            bare = ed.without_namespaces(edge_ns)
            for pair in self.find_co_atom_groups(bare):
                first, second = pair[0], pair[1]
                # counted in both directions
                cooc[first, second] += 1
                cooc[second, first] += 1

        # normalize matrix
        cooc = normalize(cooc, norm='l1', axis=1, copy=False)

        # one graph edge per stored matrix entry, weighted by its value
        coo = cooc.tocoo()
        graph_edges = list(zip(coo.row, coo.col))
        edge_weights = list(coo.data)

        graph = igraph.Graph()
        graph.add_vertices(group_count)
        graph.add_edges(graph_edges)
        graph.es['weight'] = edge_weights

        # community detection
        communities = graph.community_multilevel(weights='weight',
                                                 return_levels=False)

        # build atom_group_clusters
        self.atom_group_clusters = {}
        for index, members in enumerate(communities):
            parts = ['[%s]{%s}' % (self.atom_groups[member]['label'],
                                   self.atom_groups[member]['count'])
                     for member in members]
            self.atom_group_clusters[index] = {'label': ' + '.join(parts)}
예제 #8
0
    # build extra edges list
    # extra_edges = []
    # full_edges = []
    # for it in edge_data:
    #     e = ed.str2edge(it['edge'])
    #     full_edges.append(e)
    #     matched = [ed.str2edge(match[1]) for match in it['matches']]
    #     for part in e[1:]:
    #         if part not in matched:
    #             extra_edges.append(part)

    # load the edge records from 'all.json'; each record is expected to
    # provide an 'edge' string
    edge_data = json_tools.read('all.json')
    # build full edges list: parse every edge string and strip namespaces
    extra_edges = []
    for it in edge_data:
        extra_edges.append(ed.without_namespaces(ed.str2edge(it['edge'])))
    # NOTE: full_edges is the same list object as extra_edges, not a copy
    full_edges = extra_edges

    # run the atom group pipeline step by step, printing progress markers;
    # `par` is defined outside this fragment
    ag = AtomGroups(par)
    print('set edges')
    ag.set_edges(extra_edges)
    print('generate_atoms')
    ag.generate_atoms()
    print('generate synonyms')
    ag.generate_synonyms()
    print('generate atom groups')
    ag.generate_atom_groups()
    ag.print_atom_groups()
    # clustering is run on the same edges that were passed to set_edges
    print('generate atom group clusters')
    ag.generate_atom_group_clusters(full_edges)
    ag.print_atom_group_clusters()