def cluster(self):
    # `cluster` here is nltk.cluster; VectorGenerator is a project helper that
    # draws a random Gaussian unit vector of the given dimensionality.
    for i in range(self.numberOfClusters):
        self.means.append(
            VectorGenerator.getRandomGaussianUnitVector(
                len(self.vectors[0]), 4, 1).values())
    clusterer = cluster.EMClusterer(self.means, bias=0.1)
    return clusterer.cluster(self.vectors, True, trace=True)
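
# A minimal standalone sketch of the same pattern, assuming only numpy and
# nltk.cluster are available; the hard-coded vectors and the numpy-normalised
# random means below are illustrative stand-ins for self.vectors and the
# VectorGenerator helper, which is not shown here.
import numpy
from nltk import cluster

vectors = [numpy.array(v) for v in [[1.0, 1.0], [1.5, 0.5], [8.0, 8.0]]]
number_of_clusters = 2
means = []
for _ in range(number_of_clusters):
    v = numpy.random.randn(len(vectors[0]))
    means.append(v / numpy.linalg.norm(v))  # one random unit vector per cluster
clusterer = cluster.EMClusterer(means, bias=0.1)
assignments = clusterer.cluster(vectors, True, trace=True)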
import os

import numpy as np
from nltk import cluster


def batch_em_cluster(read_directory, write_directory1, write_directory2):
    # quick_write_list_to_text and write_matrix_to_text are project helpers
    # for dumping a list / a matrix to a text file.
    # Count the input files, which are named 1.txt, 2.txt, ...
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])
    cluster_number = 8
    init_mu = 0.1
    init_sigma = 1.0
    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]
        # draw a random Gaussian initial mean for each cluster
        init_means = []
        for j in range(cluster_number):
            init_means.append(
                init_sigma * np.random.randn(data_dimension) + init_mu)
        cluster_model = cluster.EMClusterer(init_means, bias=0.1)
        cluster_tag = cluster_model.cluster(vsm, True, trace=False)
        cluster_tag_to_string = [str(x) for x in cluster_tag]
        center_data = cluster_model._means  # fitted means (private attribute)
        quick_write_list_to_text(
            cluster_tag_to_string,
            write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(
            center_data,
            write_directory2 + '/' + str(i + 1) + '.txt')
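
# Hypothetical invocation; the directory names below are placeholders. The
# function reads 1.txt, 2.txt, ... from the input directory and writes one
# cluster-tag file and one cluster-center file per input matrix.
if __name__ == '__main__':
    batch_em_cluster('data/vsm', 'data/cluster_tags', 'data/cluster_centers')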
from numpy import array
from nltk import cluster


def gmm_cluster_docs(docs, nclusters=3, svd_d=5):
    # Gaussian mixture model. First convert each document's (fname, count)
    # frequency list to a numeric feature vector, dropping documents whose
    # counts are all zero.
    import random
    dv = [(id, array([count for fname, count in dfreq]))
          for id, dfreq in docs.items()
          if sum(count for fname, count in dfreq) > 0]
    n_features = len(dv[0][1])
    # one random initial mean per cluster
    rand_means = [
        array([random.random() for i in range(n_features)])
        for j in range(nclusters)
    ]
    kmc = cluster.EMClusterer(
        rand_means, normalise=True)  # ,svd_dimensions=svd_d) ## svd is horribly
    kmc.cluster([fv for id, fv in dv])
    classes_by_jid = {id: kmc.classify(fv) for id, fv in dv}
    return dv, classes_by_jid, kmc
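
# Hypothetical input for gmm_cluster_docs: a dict mapping a document id to a
# list of (feature_name, count) pairs, matching what the comprehension above
# unpacks. Four toy documents; purely illustrative, and the cluster labels
# depend on the random initial means.
toy_docs = {
    'doc1': [('apple', 3), ('pear', 1)],
    'doc2': [('apple', 0), ('pear', 5)],
    'doc3': [('apple', 4), ('pear', 0)],
    'doc4': [('apple', 1), ('pear', 4)],
}
dv, classes_by_jid, model = gmm_cluster_docs(toy_docs, nclusters=2)
print(classes_by_jid)  # e.g. {'doc1': 0, 'doc3': 0, 'doc2': 1, 'doc4': 1}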
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """
    import numpy
    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze
    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:       ', clusters)
    print()

    for c in range(2):
        print('Cluster:', c)
        print('Prior:  ', clusterer._priors[c])
        print('Mean:   ', clusterer._means[c])
        print('Covar:  ', clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print('classification_probdist(%s):' % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100))
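
# Conventional entry-point guard so the demo runs when the file is executed
# directly.
if __name__ == '__main__':
    demo()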
""" Non-interactive demonstration of the clusterers with simple 2-D data. """ from nltk import cluster import numpy # example from figure 14.10, page 519, Manning and Schutze vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] means = [[4, 2], [4, 2.01]] clusterer = cluster.EMClusterer(means, bias=0.1) clusters = clusterer.cluster(vectors, True, trace=True) print('Clustered:', vectors) print('As: ', clusters) print() for c in range(2): print('Cluster:', c) print('Prior: ', clusterer._priors[c]) print('Mean: ', clusterer._means[c]) print('Covar: ', clusterer._covariance_matrices[c]) print() # classify a new vector vector = numpy.array([2, 2]) print('classify(%s):' % vector, end=' ') print(clusterer.classify(vector))