def cluster(self, nclusters=None, npass=100, direction='row', initialid=None,
            do_pca=False, add_means=True, clear_nonsig=True):
    '''Cluster ``self.z`` with ``kcluster`` and return the resulting sort order.

    Parameters
    ----------
    nclusters : int or None
        Number of clusters.  When None it is derived from the number of rows
        of ``self.z`` with the ``sqrt(rows / 2)`` rule of thumb.
    npass : int
        Forwarded unchanged to ``kcluster``.
    direction : {'row', 'col'}
        Selects the ``transpose`` flag given to ``kcluster`` and the axis
        along which the means are taken.
    initialid : sequence or None
        Forwarded to ``kcluster``.  Replaced by an SVD-derived assignment
        when ``do_pca`` is true.
    do_pca : bool
        When true, seed the clustering from the ordering of one vector of the
        SVD of ``self.z.data``.
    add_means : bool
        When true, add a scaled copy of the per-axis means to the cluster ids
        before sorting, so that ties inside a cluster are ordered by mean.
    clear_nonsig : bool
        When true, zero the diagonal of ``self.z`` and every entry where
        ``self.sigmat == 0``.  This modifies ``self.z`` in place.

    Returns
    -------
    The indices (stable mergesort ``argsort``) that sort the cluster ids.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``'row'`` nor ``'col'``.
    '''
    # Validate first so that a bad argument fails before self.z is modified
    # and before any expensive SVD / clustering work is started.
    # NOTE(review): 'row' maps to transpose=1 while the means use axis=1;
    # confirm this pairing against kcluster's transpose convention.
    if direction == 'row':
        transpose = 1
        axis = 1
    elif direction == 'col':
        transpose = 0
        axis = 0
    else:
        raise ValueError('direction must be one of "row" or "col"; %s was provided' % direction)

    if nclusters is None:
        # rule of thumb for choosing k:
        # http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Rule_of_thumb
        rows = self.z.shape[0]
        nclusters = int(ceil((rows/2.)**.5))

    if clear_nonsig:
        # Clear z along the diagonal; one pass over the diagonal is enough,
        # there is no need to visit every (row, col) pair.
        rows, cols = self.z.shape
        for i in range(min(rows, cols)):
            self.z[i, i] = 0
        self.z[self.sigmat==0] = 0

    if do_pca:
        print('Performing SVD on %s x %s matrix...' % (shape(self.z)))
        sys.stdout.flush()
        u, s, v = svd(self.z.data)
        # NOTE(review): the 'col' case reads a row of u rather than of v;
        # kept as in the original, confirm that this is intended.
        if direction == 'row':
            u = u[:,0]
        else:
            u = u[0,:]
        sortind = argsort(u, kind='mergesort')
        # Deal the SVD ordering round-robin over the clusters as a seed.
        initialid = sortind % nclusters

    print('Clustering...')
    sys.stdout.flush()
    clusterid, error, nfound = kcluster(self.z, transpose=transpose, nclusters=nclusters,
                                        npass=npass, initialid=initialid)

    if add_means:
        means = self.z.mean(axis=axis)
        # Scale by (max + 10) before adding the means to the integer ids.
        means /= means.max()+10
        clusterid = clusterid.astype(float)
        clusterid += means

    sortind = argsort(clusterid, kind='mergesort')
    return sortind
Пример #2
0
def run_kmeans(matrix, k):
    """Cluster ``matrix`` into ``k`` clusters and print the cluster sizes.

    Calls ``kcluster(matrix, nclusters=k)``, prints one ``Cluster <id> = <size>``
    line per cluster (in order of first appearance in the assignment) and then
    the variance of the cluster sizes.  Nothing is returned.
    """
    clusterid, error, nfound = kcluster(matrix, nclusters=k)

    # Tally how many items were assigned to each cluster.
    counts = {}
    for i in clusterid:
        label = "Cluster " + str(i)
        counts[label] = counts.get(label, 0) + 1

    for label, size in counts.items():
        print(label + " = " + str(size))
    # A low variance means the items are spread evenly over the clusters.
    print("variance = ", np.var(list(counts.values())))
Пример #3
0
 def cluster(self, assignAndReturnDetails=False, numberOfTopFeatures = 5, algorithmSource='nltk', **kwargs):
     """Cluster ``self.vectors`` into ``self.numberOfClusters`` clusters.

     Parameters:
         assignAndReturnDetails: when true, return a dict with the document
             ids grouped per cluster, the best features per cluster and the
             error value; otherwise return the raw cluster assignment.
         numberOfTopFeatures: how many of the highest-scoring dimensions of
             each cluster mean are considered for ``bestFeatures``.
         algorithmSource: ``'nltk'`` (``cluster.KMeansClusterer`` with
             ``euclidean_distance``) or ``'biopython'`` (``Bio.Cluster.kcluster``).
         **kwargs: forwarded to ``KMeansClusterer`` for ``'nltk'``.  For
             ``'biopython'`` only ``repeats`` is read and passed as ``npass``
             (1 when omitted).

     Returns:
         The cluster assignment as returned by the backend, or, when
         ``assignAndReturnDetails`` is true,
         ``{'clusters': {clusterId: [docId, ...]}, 'bestFeatures': {...}, 'error': ...}``.
         ``error`` stays None for the nltk backend.

     Raises:
         ValueError: if ``algorithmSource`` is not one of the two backends.
     """
     def topFeatures(mean):
         # Label every coordinate of the mean, rank by score (highest first),
         # keep the first numberOfTopFeatures and drop non-positive scores.
         labels = [self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i) for i in range(len(mean))]
         ranked = sorted(zip(labels, mean), key=itemgetter(1), reverse=True)
         return [(dimension, score) for dimension, score in ranked[:numberOfTopFeatures] if score>0]

     bestFeatures, error = {}, None
     if algorithmSource=='nltk':
         clusterer = cluster.KMeansClusterer(self.numberOfClusters, euclidean_distance, **kwargs)
         clusters = clusterer.cluster(self.vectors, True)
         means = clusterer.means()
         for clusterName, mean in zip(clusterer.cluster_names(), means):
             bestFeatures[clusterName] = topFeatures(mean)
     elif algorithmSource=='biopython':
         from Bio.Cluster import kcluster, clustercentroids
         # 'repeats' is optional here, mirroring the nltk clusterer where it is optional too.
         clusters, error, _ = kcluster(self.vectors, nclusters=self.numberOfClusters, npass=kwargs.get('repeats', 1))
         means, _ = clustercentroids(self.vectors, self.masks, clusters)
         means = [unitVector(c) for c in means]
         for clusterIndex, mean in enumerate(means):
             bestFeatures[clusterIndex] = topFeatures(mean)
     else:
         # Previously this fell through and failed later on the unbound 'clusters'.
         raise ValueError('algorithmSource must be "nltk" or "biopython"; %r was provided' % (algorithmSource,))

     if not assignAndReturnDetails:
         return clusters

     # groupby needs its input sorted by the grouping key (the cluster id).
     documentAssignments = sorted(zip(self.docIds, clusters), key=itemgetter(1))
     clusters = dict((clusterId, [t[0] for t in documents]) for clusterId, documents in groupby(documentAssignments, key=itemgetter(1)))
     return {'clusters': clusters, 'bestFeatures': bestFeatures, 'error': error}
Пример #4
0
    def test_kcluster(self):
        """Exercise kcluster on two small data sets.

        For each data set: multi-letter ``method`` / ``dist`` arguments must
        raise ValueError, one cluster id must come back per data row, and the
        partition must equal the expected one up to a relabelling of clusters.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import kcluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import kcluster

        nclusters = 3

        first_data = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                                  [3.1, 3.2, 1.3, 2.4, 1.5],
                                  [4.1, 2.2, 0.3, 5.4, 0.5],
                                  [12.1, 2.0, 0.0, 5.0, 0.0]])
        first_correct = [0, 1, 1, 2]

        second_data = numpy.array([[1.1, 1.2],
                                   [1.4, 1.3],
                                   [1.1, 1.5],
                                   [2.0, 1.5],
                                   [1.7, 1.9],
                                   [1.7, 1.9],
                                   [5.7, 5.9],
                                   [5.7, 5.9],
                                   [3.1, 3.3],
                                   [5.4, 5.3],
                                   [5.1, 5.5],
                                   [5.0, 5.5],
                                   [5.1, 5.2]])
        second_correct = [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1]

        cases = ((first_data, first_correct), (second_data, second_correct))
        for data, correct in cases:
            nrows, ncols = data.shape
            # Unit weights per column and an all-ones mask shaped like data.
            weight = numpy.array([1] * ncols)
            mask = numpy.array([[1] * ncols] * nrows, int)

            # TODO - Use a context manager here once we drop Python 2.6
            # Method should be one letter:
            self.assertRaises(ValueError, kcluster, data,
                              nclusters=nclusters, mask=mask, weight=weight,
                              transpose=0, npass=100,
                              method="any", dist="e")

            # Distance should be one letter:
            self.assertRaises(ValueError, kcluster, data,
                              nclusters=nclusters, mask=mask, weight=weight,
                              transpose=0, npass=100,
                              method="a", dist="euclidean")

            clusterid, error, nfound = kcluster(data, nclusters=nclusters,
                                                mask=mask, weight=weight,
                                                transpose=0, npass=100,
                                                method='a', dist='e')
            self.assertEqual(len(clusterid), len(data))

            # Cluster numbering is arbitrary: translate each expected label
            # to the id kcluster gave to that label's first member.
            mapping = [clusterid[correct.index(label)] for label in range(nclusters)]
            for found, expected in zip(clusterid, correct):
                self.assertEqual(found, mapping[expected])
Пример #5
0
    def test_kcluster(self):
        """Check that kcluster recovers the expected partition of two data sets.

        Cluster ids are compared up to relabelling, because the numbering
        kcluster assigns to the clusters is arbitrary.
        """
        if TestCluster.module == "Bio.Cluster":
            from Bio.Cluster import kcluster
        elif TestCluster.module == "Pycluster":
            from Pycluster import kcluster

        nclusters = 3

        def check(data, correct):
            # Unit weight per column and an all-ones mask shaped like data.
            nrows, ncols = data.shape
            weight = numpy.array([1] * ncols)
            mask = numpy.array([[1] * ncols] * nrows, int)

            clusterid, error, nfound = kcluster(
                data, nclusters=nclusters, mask=mask, weight=weight, transpose=0, npass=100, method="a", dist="e"
            )
            self.assertEqual(len(clusterid), len(data))

            # Map every expected label onto the id given to its first member.
            mapping = [clusterid[correct.index(label)] for label in range(nclusters)]
            for found, expected in zip(clusterid, correct):
                self.assertEqual(found, mapping[expected])

        # First data set
        check(
            numpy.array(
                [
                    [1.1, 2.2, 3.3, 4.4, 5.5],
                    [3.1, 3.2, 1.3, 2.4, 1.5],
                    [4.1, 2.2, 0.3, 5.4, 0.5],
                    [12.1, 2.0, 0.0, 5.0, 0.0],
                ]
            ),
            [0, 1, 1, 2],
        )

        # Second data set
        check(
            numpy.array(
                [
                    [1.1, 1.2],
                    [1.4, 1.3],
                    [1.1, 1.5],
                    [2.0, 1.5],
                    [1.7, 1.9],
                    [1.7, 1.9],
                    [5.7, 5.9],
                    [5.7, 5.9],
                    [3.1, 3.3],
                    [5.4, 5.3],
                    [5.1, 5.5],
                    [5.0, 5.5],
                    [5.1, 5.2],
                ]
            ),
            [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1],
        )
Пример #6
0
#!/usr/bin/env python
import sys, re
from collections import defaultdict 
from numpy import array,zeros 
from Bio.Cluster import kcluster 
import database

# Script entry point: for every database named on the command line, cluster
# its units with kcluster and print per-source / per-species cluster usage.
# NOTE(review): this snippet is cut off after the last `for` header below.
if __name__ == "__main__":
#   test(1000)
    for db_name in sys.argv[1:]:
        sources = []
        units = []
        # `units` is passed in empty and iterated afterwards, so read_db
        # presumably fills it in place -- verify against database.read_db.
        sources = database.read_db(db_name, units, max_units_read=1000)
        # One feature row per unit: four scalar attributes followed by the
        # values of the unit's freq_fft.
        X = array([[getattr(u,x) for x in ['length','frequency_min', 'frequency_max', 'frequency_mean']] + [x for x in u.freq_fft] for u in units])
        n_clusters = 200
        clusterids, error, nfound = kcluster(X, n_clusters)
        # source -> list of cluster ids of its units
        sources_cluster = defaultdict(list)
        # scientific name -> per-cluster unit counts
        species_cluster = defaultdict(lambda:zeros(n_clusters, 'i'))
        # per-cluster unit counts over all sources
        total_cluster = zeros(n_clusters, 'i')
        for i in range(len(units)):
            source = units[i].source
            clusterid = clusterids[i]
            total_cluster[clusterid] += 1
            sources_cluster[source].append(clusterid)
            species_cluster[source.scientific_name()][clusterid] += 1
        for source in sources:
            print source, sources_cluster[source]
        for scientific_name in sorted(species_cluster.keys()):
            # The regex keeps the first character and drops the lowercase
            # letters that directly follow it, shortening the leading word.
            print "%-16s " % (re.sub(r'^(.)[a-z]*', r'\1', scientific_name)), " ".join(['%2d' % i for i in species_cluster[scientific_name]])
        c = []
        # The body of this loop is not part of the visible source.
        for clusterid in range(n_clusters):
Пример #7
0
def test_kcluster(module):
  """Run kcluster from ``module`` on two small data sets and print the outcome.

  ``module`` must be ``'Bio.Cluster'`` or ``'Pycluster'``; kcluster is imported
  from it.  For each data set the number of returned cluster ids is printed,
  followed by whether the expected partition (up to a relabelling of the
  clusters) was found.  Nothing is returned.

  Raises ValueError for any other module name.
  """
  if module=='Bio.Cluster':
    from Bio.Cluster import kcluster
  elif module=='Pycluster':
    from Pycluster import kcluster
  else:
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise ValueError('Unknown module name: %s' % (module,))

  def run_and_report(data, mask, weight, correct):
    # Cluster one data set and print how it compares with `correct`.
    clusterid, error, nfound = kcluster (data, nclusters=nclusters, mask=mask, weight=weight, transpose=0, npass=100, method='a', dist='e')
    print("Number of cluster ids is %d (should be %d)" % (len(clusterid), len(data)))
    # Cluster numbering is arbitrary: map every expected label to the id
    # that kcluster gave to the first member of that cluster.
    mapping = [clusterid[correct.index(i)] for i in range(nclusters)]
    same = all(clusterid[i]==mapping[correct[i]] for i in range(len(clusterid)))
    if same:
      print("Correct clustering solution found.")
    else:
      print("Wrong clustering solution found.")

  print("test_kcluster")
  nclusters = 3

  # First data set
  weight1 =  array([1,1,1,1,1])
  data1   =  array([[ 1.1, 2.2, 3.3, 4.4, 5.5],
                    [ 3.1, 3.2, 1.3, 2.4, 1.5],
                    [ 4.1, 2.2, 0.3, 5.4, 0.5],
                    [12.1, 2.0, 0.0, 5.0, 0.0]])
  mask1 =  array([[ 1, 1, 1, 1, 1],
                  [ 1, 1, 1, 1, 1],
                  [ 1, 1, 1, 1, 1],
                  [ 1, 1, 1, 1, 1]])

  # Second data set
  weight2 =  array([1,1])
  data2 = array([[ 1.1, 1.2 ],
                 [ 1.4, 1.3 ],
                 [ 1.1, 1.5 ],
                 [ 2.0, 1.5 ],
                 [ 1.7, 1.9 ],
                 [ 1.7, 1.9 ],
                 [ 5.7, 5.9 ],
                 [ 5.7, 5.9 ],
                 [ 3.1, 3.3 ],
                 [ 5.4, 5.3 ],
                 [ 5.1, 5.5 ],
                 [ 5.0, 5.5 ],
                 [ 5.1, 5.2 ]])
  mask2 = array([[ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ],
                 [ 1, 1 ]])

  # test first data set
  print("First data set")
  run_and_report(data1, mask1, weight1, [0,1,1,2])

  # test second data set
  print("Second data set")
  run_and_report(data2, mask2, weight2, [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1])
  # Trailing blank line, as the original bare print statement produced.
  print("")
Пример #8
0
    # Fragment of a function body (its header is not part of the visible
    # source).  Builds a sub-model that exposes the output of layer
    # "dense_27" of cnnGenModel.
    intermediate_layer_model = Model(
        inputs=cnnGenModel.input,
        outputs=cnnGenModel.get_layer("dense_27").output)
    # For every index i, feed the i-th entry of each element of x_train
    # through the sub-model and keep the first prediction.
    reStruMovieList = [
        intermediate_layer_model.predict([[col[i]] for col in x_train])[0]
        for i in range(len(x_train[0]))
    ]

    ssList = []  # silhouette score per tried cluster count
    clusterListList = []  # cluster assignment per tried cluster count
    clusterRange = range(2, 19, 2)
    for i in tqdm.tqdm(clusterRange):
        # Does not depend on i; rebuilt on every iteration.
        reStruMovieArr = np.array(reStruMovieList)
        #         clusterModel=KMeans(n=i)
        #         clusterList=clusterModel.fit_predict(reStruMovieArr).tolist()
        clusterList = kcluster(reStruMovieArr, nclusters=i,
                               dist="u")[0].tolist()
        clusterListList.append(clusterList)
        # Negated Gram matrix divided by a normaliser, used below as the
        # "precomputed" distance matrix.
        # NOTE(review): assuming reStruMovieArr is 2-D, both np.dot arguments
        # in the denominator are 1-D, so the denominator is one scalar rather
        # than per-pair norm products (np.outer would give cosine
        # similarity) -- confirm the intent.
        clusterMat=-np.dot(reStruMovieArr,reStruMovieArr.T)/\
                    np.dot(np.sqrt(np.sum(reStruMovieArr*reStruMovieArr,axis=1)),\
                           np.sqrt(np.sum(reStruMovieArr*reStruMovieArr,axis=1)))
        ss = metrics.silhouette_score(clusterMat,
                                      clusterList,
                                      metric="precomputed")
        ssList.append(ss)
    # NOTE(review): this picks the cluster count with the LOWEST silhouette
    # score -- confirm that min (not max) is intended.
    minIndex = ssList.index(min(ssList))
    minClu = list(clusterRange)[minIndex]

    # Silhouette score against the number of clusters.
    plt.plot(np.array(list(clusterRange)), np.array(ssList))
    plt.show()

    # Assignment for the second tried cluster count.
    print(clusterListList[1])
Пример #9
0
# encoding=utf-8
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from Bio.Cluster import kcluster

# Simulated document collection
corpus = [
    'I like great basketball game',
    'This video game is the best action game I have ever played',
    'I really really like basketball',
    'How about this movie? Is the plot great?',
    'Do you like RPG game?',
    'You can try this FPS game',
    'The movie is really great, so great! I enjoy the plot'
]

# Turn the words of the texts into a vocabulary and the matching count vectors
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Build the tf-idf values from the counts computed above; the vectorizer
# does not need to be fitted a second time.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectors)

# Dense tf-idf vector of every document
tfidf_array = tfidf.toarray()
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Cluster the documents, using the cosine of the angle between the vectors
# as the similarity measure.  The keyword is `dist`; the former `dict='u'`
# was rejected by kcluster as an unknown argument.
clusterid, error, nfound = kcluster(tfidf_array, nclusters=3, dist='u')
print(clusterid)
    # Fragment of a function body (its header is not part of the visible
    # source).  Tries several cluster counts and scores each clustering with
    # the silhouette score.
    clusterStarter = 2
    clusterEnder = 9
    clusterStep = 2

    print("loading data ...")
    # NOTE(review): pickle executes code on load; only safe if this file is
    # trusted.
    with open("structuredData/reStruMovieList.pkl",
              "rb") as reStruMovieListFile:
        reStruMovieList = pkl.load(reStruMovieListFile)

    # With the values above the tried cluster counts are 2, 4, 6 and 8.
    # clusterListList and ssList are appended to below but are not defined
    # in the visible part -- presumably initialised earlier in the function.
    clusterRange = range(clusterStarter, clusterEnder, clusterStep)
    for i in tqdm.tqdm(clusterRange):
        # Does not depend on i; rebuilt on every iteration.
        reStruMovieArr = np.array(reStruMovieList)
        #         clusterModel=KMeans(n=i)
        #         clusterList=clusterModel.fit_predict(reStruMovieArr).tolist()
        clusterList = kcluster(reStruMovieArr,
                               nclusters=i,
                               dist="u",
                               npass=150)[0].tolist()
        clusterListList.append(clusterList)
        # Negated Gram matrix divided by a normaliser, used below as the
        # "precomputed" distance matrix.
        # NOTE(review): assuming reStruMovieArr is 2-D, both np.dot arguments
        # in the denominator are 1-D, so the denominator is one scalar rather
        # than per-pair norm products (np.outer would give cosine
        # similarity) -- confirm the intent.
        clusterMat=-np.dot(reStruMovieArr,reStruMovieArr.T)/\
                    np.dot(np.sqrt(np.sum(reStruMovieArr*reStruMovieArr,axis=1)),\
                           np.sqrt(np.sum(reStruMovieArr*reStruMovieArr,axis=1)))
        ss = metrics.silhouette_score(clusterMat,
                                      clusterList,
                                      metric="precomputed")
        ssList.append(ss)
    # Cluster count with the highest silhouette score.
    maxIndex = ssList.index(max(ssList))
    maxClu = list(clusterRange)[maxIndex]

    print("developing figures...")
    # Silhouette score against the number of clusters.
    plt.plot(np.array(list(clusterRange)), np.array(ssList))
    clusterListArr = np.array(clusterListList)