def cluster(self, nclusters=None, npass=100, direction='row',
            initialid=None, do_pca=False, add_means=True,
            clear_nonsig=True):
    '''Cluster ``self.z`` with ``kcluster`` and return the sort order.

    Parameters
    ----------
    nclusters : int or None
        Number of clusters.  If None, the rule of thumb
        ``ceil(sqrt(rows / 2))`` is used, where ``rows`` is
        ``self.z.shape[0]``.
    npass : int
        Passed through to ``kcluster``.
    direction : {'row', 'col'}
        'row' calls ``kcluster`` with ``transpose=1`` and averages ``self.z``
        over ``axis=1``; 'col' uses ``transpose=0`` and ``axis=0``.
    initialid : sequence or None
        Passed through to ``kcluster``.  Overwritten when `do_pca` is true.
    do_pca : bool
        If true, run an SVD on ``self.z.data`` and derive `initialid` from
        the sort order of one slice of ``u`` (modulo `nclusters`).
    add_means : bool
        If true, add a scaled mean of ``self.z`` to each cluster id before
        sorting, so items are also ordered inside each cluster.
    clear_nonsig : bool
        If true, ``self.z`` is modified in place: the diagonal and every
        position where ``self.sigmat == 0`` are set to 0.

    Returns
    -------
    Index array from a stable (mergesort) ``argsort`` of the cluster ids.

    Raises
    ------
    ValueError
        If `direction` is neither 'row' nor 'col'.
    '''
    # Validate first so that a bad argument fails before self.z is modified
    # and before the (potentially expensive) SVD is run.
    if direction == 'row':
        transpose, axis = 1, 1
    elif direction == 'col':
        transpose, axis = 0, 0
    else:
        raise ValueError(
            'direction must be one of "row" or "col"; %s was provided'
            % direction)

    if nclusters is None:
        # rule of thumb for choosing k:
        # http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Rule_of_thumb
        rows = self.z.shape[0]
        nclusters = int(ceil((rows / 2.) ** .5))

    if clear_nonsig:
        # clear z along the diagonal (only the r == c cells are touched, so
        # a single loop over the shorter dimension is enough)
        rows, cols = self.z.shape
        for i in range(min(rows, cols)):
            self.z[i, i] = 0
        self.z[self.sigmat == 0] = 0

    if do_pca:
        print('Performing SVD on %s x %s matrix...' % shape(self.z))
        sys.stdout.flush()
        u, s, v = svd(self.z.data)
        if direction == 'row':
            u = u[:, 0]
        else:
            # NOTE(review): this takes the first *row* of u; for a column
            # ordering one would usually expect v[0, :].  Left unchanged --
            # confirm the intent.
            u = u[0, :]
        sortind = argsort(u, kind='mergesort')
        initialid = sortind % nclusters

    print('Clustering...')
    sys.stdout.flush()
    clusterid, error, nfound = kcluster(
        self.z, transpose=transpose, nclusters=nclusters, npass=npass,
        initialid=initialid)

    if add_means:
        means = self.z.mean(axis=axis)
        # Dividing by (max + 10) shrinks the offsets; for non-negative means
        # they stay below 1, so they only reorder items within a cluster.
        means /= means.max() + 10
        clusterid = clusterid.astype(float)
        clusterid += means

    sortind = argsort(clusterid, kind='mergesort')
    return sortind
def run_kmeans(matrix, k):
    """Run ``kcluster`` on *matrix* with *k* clusters and print size statistics.

    Prints one ``Cluster <id> = <count>`` line per cluster, in order of first
    appearance in the assignment returned by ``kcluster``, followed by the
    variance of the cluster sizes.  Nothing is returned.
    """
    from collections import Counter

    clusterid, error, nfound = kcluster(matrix, nclusters=k)  # change number of clusters

    counts = Counter("Cluster " + str(i) for i in clusterid)
    for label, size in counts.items():
        print(label + " = " + str(size))

    # Two positional arguments on purpose: this keeps the original output
    # format ("variance = " followed by print's own separator).
    print("variance = ", np.var(list(counts.values())))
def cluster(self, assignAndReturnDetails=False, numberOfTopFeatures = 5, algorithmSource='nltk', **kwargs):
    """Cluster ``self.vectors`` into ``self.numberOfClusters`` groups.

    Parameters:
        assignAndReturnDetails: if true, return a dict with the documents
            grouped per cluster, the best features per cluster and the error.
        numberOfTopFeatures: maximum number of (dimension, score) pairs kept
            per cluster; only pairs with a positive score are kept.
        algorithmSource: 'nltk' (``cluster.KMeansClusterer`` with
            ``euclidean_distance``) or 'biopython' (``Bio.Cluster.kcluster``).
        **kwargs: for 'nltk', forwarded to ``KMeansClusterer``; for
            'biopython', only ``repeats`` is read and passed as ``npass``
            (``kcluster``'s own default of 1 is used when it is absent).

    Returns:
        The cluster assignment produced by the backend, or, when
        ``assignAndReturnDetails`` is true,
        ``{'clusters': {clusterId: [docId, ...]}, 'bestFeatures': {...},
        'error': ...}`` where ``error`` is None for the 'nltk' backend.

    Raises:
        ValueError: if ``algorithmSource`` is not a supported backend.
    """
    # Fail with a clear message instead of an UnboundLocalError further down.
    if algorithmSource not in ('nltk', 'biopython'):
        raise ValueError("Unknown algorithmSource %r; expected 'nltk' or 'biopython'" % (algorithmSource,))

    def topFeatures(mean):
        # Pair every component of the centroid with its phrase, rank by
        # score (highest first) and keep the positive ones among the top N.
        phrases = [self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i) for i in range(len(mean))]
        ranked = sorted(zip(phrases, mean), key=itemgetter(1), reverse=True)
        return [(dimension, score) for dimension, score in ranked[:numberOfTopFeatures] if score > 0]

    error = None
    if algorithmSource == 'nltk':
        clusterer = cluster.KMeansClusterer(self.numberOfClusters, euclidean_distance, **kwargs)
        clusters = clusterer.cluster(self.vectors, True)
        means = clusterer.means()
        clusterIds = clusterer.cluster_names()
    else:
        from Bio.Cluster import kcluster, clustercentroids
        clusters, error, _ = kcluster(self.vectors, nclusters=self.numberOfClusters, npass=kwargs.get('repeats', 1))
        means, _ = clustercentroids(self.vectors, self.masks, clusters)
        means = [unitVector(c) for c in means]
        clusterIds = range(len(means))

    bestFeatures = dict((clusterId, topFeatures(mean)) for clusterId, mean in zip(clusterIds, means))

    if assignAndReturnDetails:
        # groupby only merges adjacent items, hence the sort on cluster id.
        documentAssignments = sorted(zip(self.docIds, clusters), key=itemgetter(1))
        clusters = dict((clusterId, [t[0] for t in documents])
                        for clusterId, documents in groupby(documentAssignments, key=itemgetter(1)))
        return {'clusters': clusters, 'bestFeatures': bestFeatures, 'error': error}
    return clusters
def test_kcluster(self):
    """Check kcluster's argument validation and its result on two data sets."""
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import kcluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import kcluster

    nclusters = 3

    def check(data, weight, correct):
        # Every element is present, so the mask is all ones.
        mask = numpy.ones(data.shape, int)
        common = dict(nclusters=nclusters, mask=mask, weight=weight,
                      transpose=0, npass=100)
        # TODO - Use a context manager here once we drop Python 2.6
        # Method should be one letter:
        self.assertRaises(ValueError, kcluster, data,
                          method="any", dist="e", **common)
        # Distance should be one letter:
        self.assertRaises(ValueError, kcluster, data,
                          method="a", dist="euclidean", **common)
        clusterid, error, nfound = kcluster(data, method='a', dist='e',
                                            **common)
        self.assertEqual(len(clusterid), len(data))
        # Cluster numbers are arbitrary: translate each expected label into
        # the number kcluster gave to that label's first member.
        mapping = [clusterid[correct.index(label)]
                   for label in range(nclusters)]
        for found, label in zip(clusterid, correct):
            self.assertEqual(found, mapping[label])

    # First data set
    check(numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                       [3.1, 3.2, 1.3, 2.4, 1.5],
                       [4.1, 2.2, 0.3, 5.4, 0.5],
                       [12.1, 2.0, 0.0, 5.0, 0.0]]),
          numpy.array([1, 1, 1, 1, 1]),
          [0, 1, 1, 2])

    # Second data set
    check(numpy.array([[1.1, 1.2],
                       [1.4, 1.3],
                       [1.1, 1.5],
                       [2.0, 1.5],
                       [1.7, 1.9],
                       [1.7, 1.9],
                       [5.7, 5.9],
                       [5.7, 5.9],
                       [3.1, 3.3],
                       [5.4, 5.3],
                       [5.1, 5.5],
                       [5.0, 5.5],
                       [5.1, 5.2]]),
          numpy.array([1, 1]),
          [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1])
def test_kcluster(self):
    """Run kcluster on two small data sets and compare with the known grouping."""
    if TestCluster.module == "Bio.Cluster":
        from Bio.Cluster import kcluster
    elif TestCluster.module == "Pycluster":
        from Pycluster import kcluster

    nclusters = 3

    first_data = numpy.array(
        [
            [1.1, 2.2, 3.3, 4.4, 5.5],
            [3.1, 3.2, 1.3, 2.4, 1.5],
            [4.1, 2.2, 0.3, 5.4, 0.5],
            [12.1, 2.0, 0.0, 5.0, 0.0],
        ]
    )
    second_data = numpy.array(
        [
            [1.1, 1.2],
            [1.4, 1.3],
            [1.1, 1.5],
            [2.0, 1.5],
            [1.7, 1.9],
            [1.7, 1.9],
            [5.7, 5.9],
            [5.7, 5.9],
            [3.1, 3.3],
            [5.4, 5.3],
            [5.1, 5.5],
            [5.0, 5.5],
            [5.1, 5.2],
        ]
    )
    # (data, expected grouping) pairs, checked in order.
    cases = [
        (first_data, [0, 1, 1, 2]),
        (second_data, [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1]),
    ]

    for data, correct in cases:
        n_rows, n_cols = data.shape
        # Unit weight per column and an all-ones mask (no missing values).
        weight = numpy.ones(n_cols, int)
        mask = numpy.ones((n_rows, n_cols), int)
        clusterid, error, nfound = kcluster(
            data,
            nclusters=nclusters,
            mask=mask,
            weight=weight,
            transpose=0,
            npass=100,
            method="a",
            dist="e",
        )
        self.assertEqual(len(clusterid), len(data))
        # Cluster numbers are arbitrary, so map each expected label onto the
        # number assigned to the first item carrying that label.
        mapping = [clusterid[correct.index(label)] for label in range(nclusters)]
        for found, label in zip(clusterid, correct):
            self.assertEqual(found, mapping[label])
#!/usr/bin/env python import sys, re from collections import defaultdict from numpy import array,zeros from Bio.Cluster import kcluster import database if __name__ == "__main__": # test(1000) for db_name in sys.argv[1:]: sources = [] units = [] sources = database.read_db(db_name, units, max_units_read=1000) X = array([[getattr(u,x) for x in ['length','frequency_min', 'frequency_max', 'frequency_mean']] + [x for x in u.freq_fft] for u in units]) n_clusters = 200 clusterids, error, nfound = kcluster(X, n_clusters) sources_cluster = defaultdict(list) species_cluster = defaultdict(lambda:zeros(n_clusters, 'i')) total_cluster = zeros(n_clusters, 'i') for i in range(len(units)): source = units[i].source clusterid = clusterids[i] total_cluster[clusterid] += 1 sources_cluster[source].append(clusterid) species_cluster[source.scientific_name()][clusterid] += 1 for source in sources: print source, sources_cluster[source] for scientific_name in sorted(species_cluster.keys()): print "%-16s " % (re.sub(r'^(.)[a-z]*', r'\1', scientific_name)), " ".join(['%2d' % i for i in species_cluster[scientific_name]]) c = [] for clusterid in range(n_clusters):
def test_kcluster(module):
    """Run ``kcluster`` from *module* on two small data sets and print the outcome.

    *module* selects the implementation: 'Bio.Cluster' or 'Pycluster'.
    For each data set the number of returned cluster ids is printed, followed
    by whether the clustering matches the expected grouping.

    Raises ValueError if *module* is not one of the two supported names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import kcluster
    elif module == 'Pycluster':
        from Pycluster import kcluster
    else:
        # String exceptions are not valid; raise a real exception type.
        raise ValueError('Unknown module name: %s' % (module,))
    print("test_kcluster")

    nclusters = 3

    def run(title, data, mask, weight, correct):
        # Cluster one data set and report whether the grouping is as expected.
        print(title)
        clusterid, error, nfound = kcluster(data, nclusters=nclusters,
                                            mask=mask, weight=weight,
                                            transpose=0, npass=100,
                                            method='a', dist='e')
        print("Number of cluster ids is %d (should be %d)"
              % (len(clusterid), len(data)))
        # Cluster numbers are arbitrary: map each expected label onto the
        # number assigned to the first item carrying that label.
        mapping = [clusterid[correct.index(i)] for i in range(nclusters)]
        same = all(clusterid[i] == mapping[correct[i]]
                   for i in range(len(clusterid)))
        if same:
            print("Correct clustering solution found.")
        else:
            print("Wrong clustering solution found.")

    # First data set
    weight1 = array([1, 1, 1, 1, 1])
    data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5],
                   [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5],
                   [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = array([[1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1]])

    # Second data set
    weight2 = array([1, 1])
    data2 = array([[1.1, 1.2],
                   [1.4, 1.3],
                   [1.1, 1.5],
                   [2.0, 1.5],
                   [1.7, 1.9],
                   [1.7, 1.9],
                   [5.7, 5.9],
                   [5.7, 5.9],
                   [3.1, 3.3],
                   [5.4, 5.3],
                   [5.1, 5.5],
                   [5.0, 5.5],
                   [5.1, 5.2]])
    mask2 = array([[1, 1]] * 13)

    run("First data set", data1, mask1, weight1, [0, 1, 1, 2])
    run("Second data set", data2, mask2, weight2,
        [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1])
    print("")
# Sub-model that outputs the activations of layer "dense_27" of cnnGenModel.
intermediate_layer_model = Model(
    inputs=cnnGenModel.input,
    outputs=cnnGenModel.get_layer("dense_27").output)

# One predict() call per index i, fed with the i-th entry of every element of
# x_train; the first row of each prediction is kept.
reStruMovieList = [
    intermediate_layer_model.predict([[col[i]] for col in x_train])[0]
    for i in range(len(x_train[0]))
]
reStruMovieArr = np.array(reStruMovieList)

# Pairwise cosine distance (1 - cosine similarity), computed once because it
# does not depend on the number of clusters.  The denominator must be the
# outer product of the row norms; np.dot of the two 1-D norm vectors would be
# a single scalar.  silhouette_score(metric="precomputed") expects a
# non-negative matrix with a zero diagonal, so rounding noise is clipped away.
rowNorms = np.sqrt(np.sum(reStruMovieArr * reStruMovieArr, axis=1))
cosineSim = np.dot(reStruMovieArr, reStruMovieArr.T) / np.outer(rowNorms, rowNorms)
clusterMat = np.clip(1.0 - cosineSim, 0.0, None)
np.fill_diagonal(clusterMat, 0.0)

ssList = []
clusterListList = []
clusterRange = range(2, 19, 2)
for i in tqdm.tqdm(clusterRange):
    clusterList = kcluster(reStruMovieArr, nclusters=i, dist="u")[0].tolist()
    clusterListList.append(clusterList)
    ss = metrics.silhouette_score(clusterMat, clusterList, metric="precomputed")
    ssList.append(ss)

# With a real distance matrix a higher silhouette score means a better
# clustering, so the best cluster count is the one with the maximum score.
bestIndex = ssList.index(max(ssList))
bestClu = list(clusterRange)[bestIndex]
# Historical names, kept so that code relying on them still works.
minIndex = bestIndex
minClu = bestClu

plt.plot(np.array(list(clusterRange)), np.array(ssList))
plt.show()
print(clusterListList[1])
# encoding=utf-8
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from Bio.Cluster import kcluster

# Simulated document collection
corpus = [
    'I like great basketball game',
    'This video game is the best action game I have ever played',
    'I really really like basketball',
    'How about this movie? Is the plot great?',
    'Do you like RPG game?',
    'You can try this FPS game',
    'The movie is really great, so great! I enjoy the plot'
]

# Convert the words of the texts into a vocabulary and the matching count vectors
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Build the tf-idf values from the counts computed above (no need to fit the
# vectorizer a second time)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectors)

# Dense vector of every document
tfidf_array = tfidf.toarray()

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both and keep `words` a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Cluster the documents, using the cosine of the angle between the vectors as
# the similarity measure.  The keyword is `dist`; `dict` is not a kcluster
# argument and raises TypeError.
clusterid, error, nfound = kcluster(tfidf_array, nclusters=3, dist='u')
print(clusterid)
# Range of cluster counts to evaluate: range(clusterStarter, clusterEnder, clusterStep).
clusterStarter = 2
clusterEnder = 9
clusterStep = 2

print("loading data ...")
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open("structuredData/reStruMovieList.pkl", "rb") as reStruMovieListFile:
    reStruMovieList = pkl.load(reStruMovieListFile)

clusterRange = range(clusterStarter, clusterEnder, clusterStep)
reStruMovieArr = np.array(reStruMovieList)

# Pairwise cosine distance (1 - cosine similarity), computed once because it
# does not depend on the number of clusters.  The denominator must be the
# outer product of the row norms; np.dot of the two 1-D norm vectors would be
# a single scalar.  silhouette_score(metric="precomputed") expects a
# non-negative matrix with a zero diagonal, so rounding noise is clipped away.
rowNorms = np.sqrt(np.sum(reStruMovieArr * reStruMovieArr, axis=1))
cosineSim = np.dot(reStruMovieArr, reStruMovieArr.T) / np.outer(rowNorms, rowNorms)
clusterMat = np.clip(1.0 - cosineSim, 0.0, None)
np.fill_diagonal(clusterMat, 0.0)

# NOTE(review): ssList and clusterListList are not initialised in this
# section -- presumably they are created earlier in the file; confirm.
for i in tqdm.tqdm(clusterRange):
    clusterList = kcluster(reStruMovieArr, nclusters=i, dist="u", npass=150)[0].tolist()
    clusterListList.append(clusterList)
    ss = metrics.silhouette_score(clusterMat, clusterList, metric="precomputed")
    ssList.append(ss)

# Highest silhouette score = best cluster count.
maxIndex = ssList.index(max(ssList))
maxClu = list(clusterRange)[maxIndex]

print("developing figures...")
plt.plot(np.array(list(clusterRange)), np.array(ssList))
clusterListArr = np.array(clusterListList)