def plot_scatter(self): ''' Overrides the parent class method. Plots all the data points in 2D. ''' #Create a clusterer document list to get the index of a doc (horrible hack I know) clusterer_document_list = [key for key in self.document_dict.keys()] corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()]) all_terms_vector = list(set(corpus)) table = construct_orange_table(all_terms_vector) meta_col_name="cluster_id" table = add_metas_to_table(table, meta_col_name=meta_col_name) instances = [] for cluster in self.clusters: for doc_id, doc_content in cluster.document_dict.iteritems(): index = clusterer_document_list.index(doc_id) #We use index = 1 to force the function to construct the vector according to all the documents in the collection self.construct_term_doc_matrix(index, doc_content) oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes) oc.resize(all_terms_vector) inst = Orange.data.Instance(table.domain, list(oc.center)) inst[meta_col_name] = str(cluster.id) instances.insert(index, inst) #we have a table with the clusters ids as metas. table.extend(instances) from visualizations.mds import MDS mds = MDS(table) classes_list = [] for c in self.clusters: classes_list.append(c.id) mds.plot(classes_list=classes_list, class_col_name="cluster_id")
def cluster(self, document):
    '''
    Performs clustering for a new document.
    It takes as input a document object from the db and finds the closer
    cluster for it.

    The first document ever seen simply seeds a new cluster; every later
    document is either absorbed by its nearest cluster (kernel distance
    below 1.0) or starts a cluster of its own, after which the two closest
    clusters are merged whenever the cluster budget self.N is exceeded.
    '''
    doc_index = self.add_document(document)
    doc_id = str(document.id)
    doc_content = document.content
    # Side effect: refreshes self.td_matrix / self.attributes for this doc.
    self.construct_term_doc_matrix(index=doc_index, document=doc_content)
    print 'N', len(self.clusters)
    print 'clustering', doc_index
    if doc_index > 0: #ignore the first document
        #e = doc_index
        e = self.td_matrix
        newc=OnlineCluster(a=e,
                           cluster_id=self.cluster_id_counter,
                           doc_id=doc_id,
                           doc_content=doc_content,
                           term_vector=self.attributes)
        #If the new term vector is larger then change all the cluster centers
        #However, if the new term vector is smaller then pad the new cluster's center
        if len(self.clusters) > 0:
            if len(newc.term_vector) > len(self.clusters[0].term_vector):
                self.resize()
            else:
                newc.resize(self.clusters[0].term_vector)
            # Use the (possibly padded) center as the comparison vector.
            e = newc.center
        if len(self.clusters)>0:
            # Compare the new document to each existing cluster
            c=[ ( i, kernel_dist(x.center, e) ) for i,x in enumerate(self.clusters)]
            closest_cluster = min( c , key=operator.itemgetter(1))
            if closest_cluster[1] < 1.0:
                closest=self.clusters[closest_cluster[0]]
                closest.add(e, doc_id, doc_content)
                # invalidate dist-cache for this cluster
                self.updatedist(closest)
            else:
                # make a new cluster for this point
                self.clusters.append(newc)
                self.updatedist(newc)
        if len(self.clusters)>=self.N and len(self.clusters)>1:
            # merge closest two clusters. It doesn't matter which ones, Only the closest
            m=heapq.heappop(self.dist)
            m.x.merge(m.y)
            self.clusters.remove(m.y)
            self.removedist(m.y)
            self.updatedist(m.x)
        self.cluster_id_counter += 1
    else:
        # Very first document: seed the initial cluster directly.
        newc=OnlineCluster(a=self.td_matrix,
                           cluster_id=self.cluster_id_counter,
                           doc_id=doc_id,
                           doc_content=doc_content,
                           term_vector=self.attributes)
        self.clusters.append(newc)
        self.updatedist(newc)
def test_cluster_center_resize(self):
    """OnlineCluster.resize must grow, shrink and reorder the center so
    weights follow their terms; terms not previously seen get weight 0."""
    center = numpy.array([0, 0, 1, 1, 0, 1])
    terms = ['test0', 'test1', 'test2', 'test3', 'test4', 'test5']
    cluster = OnlineCluster(center, 1, 1, "test", terms)

    # Test a longer term vector: existing weights are kept, new terms get 0.
    new_terms_longer_list = ['test0', 'test1', 'test2', 'test3',
                             'test4', 'test5', 'test6', 'test7']
    cluster.resize(new_terms_longer_list)
    expected_center = numpy.array([0, 0, 1, 1, 0, 1, 0, 0])  #2 zeroes in the end
    # Element-wise equality: the old check numpy.sum(expected - actual) == 0
    # passed even when positive and negative differences cancelled out
    # (e.g. a permuted center), so it could not catch reordering bugs.
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_longer_list, cluster.term_vector)

    # Test a smaller term vector: weights of dropped terms are discarded.
    new_terms_smaller_list = ['test0', 'test1', 'test2', 'test3']
    cluster.resize(new_terms_smaller_list)
    expected_center = numpy.array([0, 0, 1, 1])
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_smaller_list, cluster.term_vector)

    # Test a longer term vector and shuffled: weights must follow their
    # terms to their new positions.
    new_terms_longer_listand_shuffled = ['test7', 'test0', 'test5', 'test2',
                                         'test1', 'test3', 'test4', 'test6']
    cluster.resize(new_terms_longer_listand_shuffled)
    expected_center = numpy.array([0, 0, 0, 1, 0, 1, 0, 0])
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_longer_listand_shuffled, cluster.term_vector)
def test_cluster_center_resize(self):
    """OnlineCluster.resize must grow, shrink and reorder the center so
    weights follow their terms; terms not previously seen get weight 0."""
    center = numpy.array([0, 0, 1, 1, 0, 1])
    terms = ['test0', 'test1', 'test2', 'test3', 'test4', 'test5']
    cluster = OnlineCluster(center, 1, 1, "test", terms)

    # Test a longer term vector: existing weights are kept, new terms get 0.
    new_terms_longer_list = ['test0', 'test1', 'test2', 'test3',
                             'test4', 'test5', 'test6', 'test7']
    cluster.resize(new_terms_longer_list)
    expected_center = numpy.array([0, 0, 1, 1, 0, 1, 0, 0])  #2 zeroes in the end
    # Element-wise equality: the old check numpy.sum(expected - actual) == 0
    # passed even when positive and negative differences cancelled out
    # (e.g. a permuted center), so it could not catch reordering bugs.
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_longer_list, cluster.term_vector)

    # Test a smaller term vector: weights of dropped terms are discarded.
    new_terms_smaller_list = ['test0', 'test1', 'test2', 'test3']
    cluster.resize(new_terms_smaller_list)
    expected_center = numpy.array([0, 0, 1, 1])
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_smaller_list, cluster.term_vector)

    # Test a longer term vector and shuffled: weights must follow their
    # terms to their new positions.
    new_terms_longer_listand_shuffled = ['test7', 'test0', 'test5', 'test2',
                                         'test1', 'test3', 'test4', 'test6']
    cluster.resize(new_terms_longer_listand_shuffled)
    expected_center = numpy.array([0, 0, 0, 1, 0, 1, 0, 0])
    self.assertTrue(numpy.array_equal(expected_center, cluster.center))
    self.assertEqual(new_terms_longer_listand_shuffled, cluster.term_vector)
def plot_scatter(self): ''' Overrides the parent class method. Plots all the data points in 2D. ''' #Create a clusterer document list to get the index of a doc (horrible hack I know) clusterer_document_list = [key for key in self.document_dict.keys()] corpus = nltk.TextCollection( [document.tokens for document in self.document_dict.values()]) all_terms_vector = list(set(corpus)) table = construct_orange_table(all_terms_vector) meta_col_name = "cluster_id" table = add_metas_to_table(table, meta_col_name=meta_col_name) instances = [] for cluster in self.clusters: for doc_id, doc_content in cluster.document_dict.iteritems(): index = clusterer_document_list.index(doc_id) #We use index = 1 to force the function to construct the vector according to all the documents in the collection self.construct_term_doc_matrix(index, doc_content) oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes) oc.resize(all_terms_vector) inst = Orange.data.Instance(table.domain, list(oc.center)) inst[meta_col_name] = str(cluster.id) instances.insert(index, inst) #we have a table with the clusters ids as metas. table.extend(instances) from visualizations.mds import MDS mds = MDS(table) classes_list = [] for c in self.clusters: classes_list.append(c.id) mds.plot(classes_list=classes_list, class_col_name="cluster_id")
def cluster(self, document):
    '''
    Performs clustering for a new document.
    It takes as input a document object from the db and finds the closer
    cluster for it.

    The first document ever seen simply seeds a new cluster; every later
    document is either absorbed by its nearest cluster (kernel distance
    below 1.0) or starts a cluster of its own, after which the two closest
    clusters are merged whenever the cluster budget self.N is exceeded.
    '''
    doc_index = self.add_document(document)
    doc_id = str(document.id)
    doc_content = document.content
    # Side effect: refreshes self.td_matrix / self.attributes for this doc.
    self.construct_term_doc_matrix(index=doc_index, document=doc_content)
    print 'N', len(self.clusters)
    print 'clustering', doc_index
    if doc_index > 0: #ignore the first document
        #e = doc_index
        e = self.td_matrix
        newc = OnlineCluster(a=e,
                             cluster_id=self.cluster_id_counter,
                             doc_id=doc_id,
                             doc_content=doc_content,
                             term_vector=self.attributes)
        #If the new term vector is larger then change all the cluster centers
        #However, if the new term vector is smaller then pad the new cluster's center
        if len(self.clusters) > 0:
            if len(newc.term_vector) > len(self.clusters[0].term_vector):
                self.resize()
            else:
                newc.resize(self.clusters[0].term_vector)
            # Use the (possibly padded) center as the comparison vector.
            e = newc.center
        if len(self.clusters) > 0:
            # Compare the new document to each existing cluster
            c = [(i, kernel_dist(x.center, e))
                 for i, x in enumerate(self.clusters)]
            closest_cluster = min(c, key=operator.itemgetter(1))
            if closest_cluster[1] < 1.0:
                closest = self.clusters[closest_cluster[0]]
                closest.add(e, doc_id, doc_content)
                # invalidate dist-cache for this cluster
                self.updatedist(closest)
            else:
                # make a new cluster for this point
                self.clusters.append(newc)
                self.updatedist(newc)
        if len(self.clusters) >= self.N and len(self.clusters) > 1:
            # merge closest two clusters. It doesn't matter which ones, Only the closest
            m = heapq.heappop(self.dist)
            m.x.merge(m.y)
            self.clusters.remove(m.y)
            self.removedist(m.y)
            self.updatedist(m.x)
        self.cluster_id_counter += 1
    else:
        # Very first document: seed the initial cluster directly.
        newc = OnlineCluster(a=self.td_matrix,
                             cluster_id=self.cluster_id_counter,
                             doc_id=doc_id,
                             doc_content=doc_content,
                             term_vector=self.attributes)
        self.clusters.append(newc)
        self.updatedist(newc)