Example #1
    def build_from_data(self, twn, docs, max_depth=5, min_doc_num=20):
        """Cluster docs by repeatedly splitting word communities up to
        max_depth levels, mapping unclassified documents back into the
        active communities, then merging similar communities; returns
        the per-document cluster labels."""
        starttime = time.time()
        cowordnet = self.build_global_cowordnet(docs)
        com_dect = comdect.LabelCommunityDetection(min_nodes=30)
        group = CommunityGroup(cowordnet, docs, com_dect)
        real_labels = None
        if self.logfile:
            real_labels = self.load_doc_labels(docs)
            self.resoutput = open(self.logfile, 'w')
            self.resoutput.write(
                'NCluster \tAdjusted_Rand \tAdjusted_NMI \tF-Measure \tV-Measure \tprecision \trecall\n'
            )
        depth = 0
        rootcom = Community(twn, group, len(docs))
        group.add_community(rootcom)

        while depth <= max_depth:
            acoms = group.active_coms()
            if not acoms: break
            print 'dividing communities'
            for c in acoms:
                children = c.make_children()
                if children:
                    group.remove_community(c)
                    for ch in children:
                        group.add_community(ch)

            acoms = group.active_coms()
            if not acoms: break

            uncdocs = group.unclassified_docs()
            print 'Mapping unclassified documents into communities'
            Community.map_docs_to_coms(uncdocs, acoms)
            group.remove_null_community(min_doc_num)
            depth += 1
            if self.logfile:
                predicted = group.doc_labels()
                rs = cmp_cluster_result(predicted, len(group), real_labels)
                self.resoutput.write(rs)
            print 'rebuilding wordnet'
            for c in acoms:
                c.rebuild_wordnet()

        self.merge_communities(group, 0.5)
        if self.logfile:
            predicted = group.doc_labels()
            rs = cmp_cluster_result(predicted, len(group), real_labels)
            self.resoutput.write(rs)
            self.resoutput.write('\r\n')
            self.resoutput.write(self.output_keywords(group).encode('utf8'))

            self.resoutput.close()
            #os.system('emacs ../doc_clustering_evalution.txt')
        self._run_time = time.time() - starttime
        print 'Elapsed time: %.2fs' % self._run_time
        return group.doc_labels()
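
A hypothetical call sequence for orientation. The enclosing class is not shown in these examples, so ClusterBuilder, its logfile argument, and the shape of docs below are assumptions, not the original API:

    # Assumed names: ClusterBuilder and 'logfile' are not confirmed by the
    # examples above; docs is one token list per document.
    builder = ClusterBuilder(logfile='cluster_eval.txt')
    docs = [['graph', 'community', 'detection'],
            ['word', 'network', 'cluster']]
    twn = builder.load_title_wordnet()    # title co-occurrence graph (Example #2)
    labels = builder.build_from_data(twn, docs, max_depth=5, min_doc_num=20)
    print labels                          # one cluster label per document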
Example #2
    def load_title_wordnet(self, min_coocur=2, min_weight=1e-3):
        """Build the title word co-occurrence network as an igraph graph,
        using the pair weight as the edge weight."""
        titleiter = Community.wordpair_weight(self.iter_title_words(),
                                              min_coocur, min_weight)
        elist = []
        for w1, w2, co, weight in titleiter:
            elist.append({'source': w1, 'target': w2, 'weight': weight})
        return igraph.Graph.DictList(vertices=None, edges=elist)
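
As a standalone illustration of the igraph constructor used here: Graph.DictList creates one edge per dict, takes vertex names from the 'source'/'target' keys, and stores any remaining keys ('weight' here) as edge attributes. A minimal sketch:

    import igraph

    edges = [{'source': 'apple', 'target': 'fruit', 'weight': 0.5},
             {'source': 'fruit', 'target': 'pear', 'weight': 0.2}]
    g = igraph.Graph.DictList(vertices=None, edges=edges)
    print g.vs['name']     # ['apple', 'fruit', 'pear']
    print g.es['weight']   # [0.5, 0.2]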
Example #3
    def merge_communities(self, group, merge_freshold=0.5):
        """Union communities whose pairwise similarity exceeds
        merge_freshold, then merge each resulting set."""
        n = len(group)
        dset = DisjoinSet(n)
        coms = list(iter(group))
        for i in range(0, n - 1):
            for j in range(i + 1, n):
                sim = Community.similarity(coms[i], coms[j])
                #print 'similarity: %.5f' % sim
                if sim > merge_freshold:
                    dset.union(i, j)

        clusters = dset.sets(min_size=2)
        for c in clusters:
            group.merge_communities([coms[i] for i in c])
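
DisjoinSet is not defined in these examples. A minimal union-find sketch that matches the interface used above (union and sets(min_size=...)) could look like this; the real class may differ:

    class DisjoinSet(object):
        """Minimal union-find sketch; an assumption, not the original class."""
        def __init__(self, n):
            self.parent = range(n)    # Python 2: range() returns a list
        def find(self, x):
            while self.parent[x] != x:
                self.parent[x] = self.parent[self.parent[x]]  # path halving
                x = self.parent[x]
            return x
        def union(self, a, b):
            ra, rb = self.find(a), self.find(b)
            if ra != rb:
                self.parent[rb] = ra
        def sets(self, min_size=1):
            # Group indices by root and keep groups of at least min_size.
            groups = {}
            for i in range(len(self.parent)):
                groups.setdefault(self.find(i), []).append(i)
            return [s for s in groups.values() if len(s) >= min_size]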
Example #4
    def build_global_cowordnet(self, docs, min_coocur=2):
        """Build the global word co-occurrence network over all docs as an
        igraph graph, with co-occurrence counts as edge weights."""
        dociter = Community.wordpair_weight(docs, min_coocur, 0)

        # import itertools
        # co_dict = dict()
        # for words in docs:
        #     for wp in itertools.combinations(words,2):
        #         if wp[1]>wp[0]: wp = (wp[1],wp[0])
        #         try:
        #             co_dict[wp] += 1
        #         except:
        #             co_dict[wp] = 1

        def dict2list(docs):
            for w1, w2, co, weight in docs:
                yield {'source': w1, 'target': w2, 'weight': co}

        return igraph.Graph.DictList(vertices=None, edges=dict2list(dociter))
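
Community.wordpair_weight is also not shown. Judging from the commented-out counting code above, a plausible sketch is a generator over word pairs that co-occur in enough documents; the weight normalization here is an assumption:

    import itertools

    def wordpair_weight(docs, min_coocur, min_weight):
        # Count how many documents each (canonically ordered) pair co-occurs in.
        co_dict, ndocs = {}, 0
        for words in docs:
            ndocs += 1
            for w1, w2 in itertools.combinations(set(words), 2):
                if w2 < w1:
                    w1, w2 = w2, w1
                co_dict[(w1, w2)] = co_dict.get((w1, w2), 0) + 1
        for (w1, w2), co in co_dict.iteritems():
            weight = float(co) / ndocs        # assumed normalization
            if co >= min_coocur and weight >= min_weight:
                yield w1, w2, co, weight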