Example #1
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.
        '''
        # Create a clusterer document list to get the index of a doc (horrible hack, I know)
        clusterer_document_list = list(self.document_dict.keys())
        corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        table = construct_orange_table(all_terms_vector)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            for doc_id, doc_content in cluster.document_dict.iteritems():
                index = clusterer_document_list.index(doc_id)
                
                # Re-vectorize this document against every document in the
                # collection (a non-zero index forces construct_term_doc_matrix
                # to build the vector over the whole collection)
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                instances.insert(index, inst)

        # The table now holds one row per document, with cluster ids as metas
        table.extend(instances)
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [c.id for c in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name=meta_col_name)
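
The helpers construct_orange_table and add_metas_to_table are defined elsewhere in the project. Inferred purely from the call sites above, here is a minimal sketch of what they might look like against the Orange 2.x (Python 2) API; the feature types and the add_meta registration are assumptions, not the project's actual code:

import Orange

def construct_orange_table(term_vector):
    # One continuous feature per corpus term, no class variable
    features = [Orange.feature.Continuous(term) for term in term_vector]
    domain = Orange.data.Domain(features, False)
    return Orange.data.Table(domain)

def add_metas_to_table(table, meta_col_name="cluster_id"):
    # Register a string-valued meta attribute on the table's domain
    meta_id = Orange.feature.Descriptor.new_meta_id()
    table.domain.add_meta(meta_id, Orange.feature.String(meta_col_name))
    return table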
Example #2
    def cluster(self, document):
        '''
        Performs online clustering for a new document: takes a document
        object from the db and assigns it to the closest cluster.
        '''
        doc_index = self.add_document(document)
        doc_id = str(document.id)
        doc_content = document.content

        self.construct_term_doc_matrix(index=doc_index, document=doc_content)
        
        print 'N', len(self.clusters)
        print 'clustering', doc_index
        if doc_index > 0:  # the first document is handled in the else branch below
            e = self.td_matrix
            newc = OnlineCluster(a=e, cluster_id=self.cluster_id_counter, doc_id=doc_id, doc_content=doc_content, term_vector=self.attributes)

            # If the new term vector is larger, resize every existing cluster
            # center; if it is smaller, pad the new cluster's center instead
            if len(self.clusters) > 0:
                if len(newc.term_vector) > len(self.clusters[0].term_vector):
                    self.resize()
                else:
                    newc.resize(self.clusters[0].term_vector)
                    e = newc.center
                
            if len(self.clusters) > 0:
                # Compare the new document to each existing cluster
                c = [(i, kernel_dist(x.center, e)) for i, x in enumerate(self.clusters)]
                closest_cluster = min(c, key=operator.itemgetter(1))
                if closest_cluster[1] < 1.0:
                    closest = self.clusters[closest_cluster[0]]
                    closest.add(e, doc_id, doc_content)
                    # invalidate the dist-cache for this cluster
                    self.updatedist(closest)
                else:
                    # make a new cluster for this point
                    self.clusters.append(newc)
                    self.updatedist(newc)
    
            if len(self.clusters) >= self.N and len(self.clusters) > 1:
                # Over capacity: merge the two closest clusters (the heap root
                # always holds the smallest pairwise distance)
                m = heapq.heappop(self.dist)
                m.x.merge(m.y)
                self.clusters.remove(m.y)
                self.removedist(m.y)
                self.updatedist(m.x)
                self.cluster_id_counter += 1
        else:
            newc = OnlineCluster(a=self.td_matrix, cluster_id=self.cluster_id_counter, doc_id=doc_id, doc_content=doc_content, term_vector=self.attributes)
            self.clusters.append(newc)
            self.updatedist(newc)
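
kernel_dist is defined elsewhere in the project, and the < 1.0 threshold above only makes sense for a bounded, kernel-induced metric. A plausible sketch, assuming a Gaussian (RBF) kernel: because K(x, x) == 1, the squared induced distance K(x, x) - 2*K(x, y) + K(y, y) = 2 - 2*K(x, y) stays in [0, 2], which makes 1.0 a natural "close enough to join" cutoff. The gamma default is a guess:

import numpy

def gaussian_kernel(x, y, gamma=1.0):
    # RBF kernel; returns 1.0 when x == y (gamma default is hypothetical)
    d = numpy.asarray(x, dtype=float) - numpy.asarray(y, dtype=float)
    return numpy.exp(-gamma * numpy.dot(d, d))

def kernel_dist(x, y):
    # Metric induced by the kernel: sqrt(K(x,x) - 2*K(x,y) + K(y,y))
    return (gaussian_kernel(x, x) - 2 * gaussian_kernel(x, y)
            + gaussian_kernel(y, y)) ** 0.5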
Example #3
    def test_cluster_center_resize(self):
        center = numpy.array([0, 0, 1, 1, 0, 1])
        terms = ['test0', 'test1', 'test2', 'test3', 'test4', 'test5']
        cluster = OnlineCluster(center, 1, 1, "test", terms)

        # Test a longer term vector
        new_terms_longer_list = [
            'test0', 'test1', 'test2', 'test3', 'test4', 'test5', 'test6',
            'test7'
        ]
        cluster.resize(new_terms_longer_list)
        # Two zeros are padded onto the end for the unseen terms
        expected_center = numpy.array([0, 0, 1, 1, 0, 1, 0, 0])
        self.assertTrue(numpy.array_equal(expected_center, cluster.center))
        self.assertEqual(new_terms_longer_list, cluster.term_vector)

        # Test a smaller term vector
        new_terms_smaller_list = ['test0', 'test1', 'test2', 'test3']
        cluster.resize(new_terms_smaller_list)
        expected_center = numpy.array([0, 0, 1, 1])
        self.assertTrue(numpy.array_equal(expected_center, cluster.center))
        self.assertEqual(new_terms_smaller_list, cluster.term_vector)

        # Test a longer term vector that is also shuffled
        new_terms_longer_list_shuffled = [
            'test7', 'test0', 'test5', 'test2', 'test1', 'test3', 'test4',
            'test6'
        ]
        cluster.resize(new_terms_longer_list_shuffled)
        expected_center = numpy.array([0, 0, 0, 1, 0, 1, 0, 0])
        self.assertTrue(numpy.array_equal(expected_center, cluster.center))
        self.assertEqual(new_terms_longer_list_shuffled, cluster.term_vector)
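
These assertions pin down resize's contract: each weight follows its term into the new ordering, and terms the cluster has never seen default to weight 0. The project's implementation isn't shown here; a minimal sketch that satisfies all three cases, with the constructor reduced to just the fields resize touches (the real one also takes cluster_id, doc_id, and doc_content):

import numpy

class OnlineCluster(object):
    def __init__(self, center, term_vector):
        self.center = numpy.asarray(center)
        self.term_vector = term_vector

    def resize(self, new_term_vector):
        # Carry each existing term's weight into the new ordering;
        # terms absent from the old vector default to weight 0
        old_weights = dict(zip(self.term_vector, self.center))
        self.center = numpy.array(
            [old_weights.get(term, 0) for term in new_term_vector])
        self.term_vector = new_term_vector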