def grade3():
    """Grade a ``KMeans`` implementation against three fixed scenarios.

    Every scenario seeds ``cluster_centers`` by hand, calls ``train`` and
    awards its marks only when the resulting centers match the expected ones
    (``np.allclose``) and ``train`` returns the expected value.

    Returns:
        The accumulated marks (0.5 + 0.5 + 1 when everything passes). If the
        implementation raises, ``'Error in k-means'`` is printed and the
        marks earned so far are returned.
    """
    marks = 0
    try:
        scenarios = (
            # data, initial centers, expected centers, n_clusters,
            # second train() argument, expected train() result, marks
            (np.array([[i, i] for i in range(5)]),
             np.array([[1., 1.], [2., 2.], [3., 3.]]),
             np.array([[0.5, 0.5], [2.0, 2.0], [3.5, 3.5]]),
             3, 1, 0, 0.5),
            (np.array([[i + 1, i * 2.3] for i in range(5)]),
             np.array([[5., 1.], [-1., 2.], [3., 6.]]),
             np.array([[5, 1], [1.5, 1.15], [4.0, 6.8999999999999995]]),
             3, 1, 0, 0.5),
            (np.array([[i + 1, i * 2.3] for i in range(3)]),
             np.array([[5, 1], [-1., 2]]),
             np.array([[3.0, 4.6], [1.5, 1.15]]),
             2, 5, 1, 1),
        )
        for (data, centers, expected, n_clusters,
             train_arg, expected_it, worth) in scenarios:
            kmeans = KMeans(D=2, n_clusters=n_clusters)
            kmeans.cluster_centers = centers
            it = kmeans.train(data, train_arg)
            if np.allclose(kmeans.cluster_centers, expected) and it == expected_it:
                marks += worth
    except Exception:
        # Deliberately broad: any failure in the graded implementation costs
        # the remaining marks. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        print('Error in k-means')
    return marks
def test02_non_fitted_model_raises_not_fitted_error_message(self):
    """Predicting with an unfitted model must raise the not-fitted message."""
    model = KMeans(k=2)
    # assertRaises replaces the try / self.fail() / except Exception pattern.
    # There, the AssertionError raised by self.fail() was itself caught by
    # the except clause, so a missing exception surfaced as a confusing
    # message-comparison failure.
    with self.assertRaises(Exception) as caught:
        model.predict(np.array([[1, 0], [0, 1]]))
    self.assertEqual(str(caught.exception), KMeans.NOT_FITTED_ERROR_MESSAGE)
def kmeans_image_compression():
    """Run K-means on the pixels of ``baboon.tiff`` and store the outcome.

    Reads the image, divides it by 255, fits a 16-cluster ``KMeans`` on the
    per-pixel colour rows, rebuilds an image through ``transform_image`` and
    reports the squared error per pixel. The rebuilt image is written to
    ``plots/compressed_baboon.png`` and all intermediate results to
    ``results/k_means_compression.npz``.
    """
    image = plt.imread('baboon.tiff')
    height, width = image.shape[:2]
    n_pixels = height * width
    image = image / 255

    # One row per pixel, three colour columns.
    pixels = image.reshape(n_pixels, 3)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, n_updates = k_means.fit(pixels)
    print('RGB centroids computed in {} iteration'.format(n_updates))

    compressed = transform_image(image, centroids)
    assert compressed.shape == image.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((image - compressed)**2) / n_pixels
    print('Mean square error per pixel is {}'.format(mse))

    plt.imsave('plots/compressed_baboon.png', compressed)
    np.savez('results/k_means_compression.npz',
             im=image,
             centroids=centroids,
             step=n_updates,
             new_image=compressed,
             pixel_error=mse)
def _initialise_prams(self, X):
    """Derive initial mixture parameters from a K-means partition of ``X``.

    A ``KMeans`` model with ``self.k`` clusters is fitted on ``X``; each
    distinct predicted label then contributes one mixing weight, one mean
    and one covariance matrix.

    Args:
        X: 2-D array, one row per sample.

    Returns:
        Tuple ``(initial_pi, initial_mean, initial_cov)`` with shapes
        ``(k,)``, ``(k, n_columns)`` and ``(k, n_columns, n_columns)``.
        Slots for which K-means produced no label stay all-zero.
    """
    # Get initial clusters using Kmeans
    kmeans = KMeans(k=self.k, max_iters=500)
    kmeans.fit(X)
    kmeans_preds = kmeans.predict(X)

    N, col_length = X.shape
    mixture_labels = np.unique(kmeans_preds)

    initial_mean = np.zeros((self.k, col_length))
    initial_cov = np.zeros((self.k, col_length, col_length))
    initial_pi = np.zeros(self.k)

    for index, mixture_label in enumerate(mixture_labels):
        members = X[kmeans_preds == mixture_label]
        Nk = members.shape[0]

        # Initial pi: fraction of samples assigned to this cluster
        initial_pi[index] = Nk / N

        # Initial mean
        initial_mean[index, :] = np.mean(members, axis=0)

        # Initial covariance
        # NOTE(review): the scatter matrix is additionally scaled by the
        # mixing weight, which shrinks it relative to the plain sample
        # covariance -- confirm this is intended.
        de_meaned = members - initial_mean[index, :]
        initial_cov[index] = np.dot(initial_pi[index] * de_meaned.T,
                                    de_meaned) / Nk

    # The weights are a sum of floating-point ratios; an exact `== 1` test
    # can fail from rounding alone, so compare with a tolerance.
    assert np.isclose(np.sum(initial_pi), 1)
    return initial_pi, initial_mean, initial_cov
def do_KMeans_clustering(N_cluster, X, device):
    """
    Label the training data with KMeans and derive oversampling factors.

    Input:
        N_cluster: number of clusters estimated by Gap Statistics
        X: training data for the input layer
        device: torch device the clustering runs on

    Output:
        cluster_label: label assigned to every point (moved to the CPU)
        over_coef: int32 tensor with one factor per cluster, equal to
            (size of the largest cluster / size of this cluster) truncated
            to an integer and capped at 10. It is used by the oversampling
            method to add points in the less dense cluster regions.
    """
    X = X.to(device)

    # Instantiating kmeans object
    kmeans = KMeans(n_clusters=N_cluster, mode='euclidean', verbose=1)
    cluster_label = kmeans.fit_predict(X)

    # Size of each cluster (number of points assigned to its centroid)
    cluster_size = torch.zeros(N_cluster, dtype=torch.int32, device=device)
    for cluster in range(N_cluster):
        cluster_size[cluster] = (cluster_label == cluster).sum()

    # Clamp BEFORE casting to int32: an empty cluster yields an infinite
    # ratio, and casting inf to an integer is undefined, so the old
    # "cast, then cap at 10" order could leave garbage in over_coef.
    ratio = cluster_size.max() / cluster_size
    over_coef = torch.clamp(ratio, max=10).to(torch.int32)

    return cluster_label.cpu(), over_coef.cpu()
def main():
    """Predict mortality at latitude 20 and plot a two-way split of the locations.

    Loads ``data/skincancer.csv``, fits ``LinearRegression`` on latitude
    versus mortality, then feeds (latitude, longitude) points to ``KMeans``,
    splits them in two and shows both clusters with their centers.
    """
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort, lat, lon = data.Mort.values, data.Lat.values, data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for latitude, longitude in zip(lat, lon):
        k_means.points.append(Point(latitude, longitude))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    coloured_clusters = (
        (k_means._clusters[0], "#ff0000"),
        (k_means._clusters[1], "#00ff00"),
    )

    # NOTE(Jovan): Member points first, one colour per cluster
    for cluster, colour in coloured_clusters:
        for p in cluster.points:
            ax.scatter(p.x, p.y, c=colour)

    # NOTE(Jovan): Plot cluster centers
    for cluster, colour in coloured_clusters:
        center = cluster.center
        ax.scatter(center.x, center.y, marker="P", c=colour)

    plt.show()
def test_fit_with_different_initial_centroids(self):
    """Fitting with seed 0 must yield the expected labels and centroids."""
    k_means = KMeans(num_clusters=self.num_clusters, seed=0)
    k_means.fit(self.data)

    expected_labels = [0, 0, 0, 1, 1, 1]
    self.assertEqual(expected_labels, k_means.labels_)

    expected_centroids = [[-1.6666667, -1.6666667],
                          [1.6666667, 1.6666667]]
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
def test_cluster_points_two_cluster(self):
    """Check which data points ``cluster_points`` puts in each of two clusters.

    A two-point ``DataVector`` is passed to ``cluster_points`` (presumably
    as the cluster centers -- confirm against the KMeans class) and the
    coordinates of the grouped points are compared with fixed expectations.
    """
    def make_point(*coordinates):
        # Build a DataPoint by adding one dimension per coordinate.
        point = datapoint.DataPoint()
        for value in coordinates:
            point.add_dimension(value)
        return point

    test_vector = self.create_test_data_vector()
    kmeans = KMeans(test_vector, 2)

    seeds = [make_point(1.1, 2.1, 3.1), make_point(3.1, 1.1, 2.1)]
    test_cluster = datapoint.DataVector()
    for seed in seeds:
        test_cluster.add_point(seed)

    # (cluster index, position inside the cluster, expected coordinates)
    expectations = (
        (0, 0, [1.0, 2.0, 3.0]),
        (0, 1, [2.0, 3.0, 1.0]),
        (1, 0, [3.0, 1.0, 2.0]),
    )
    for cluster_idx, point_idx, coordinates in expectations:
        # cluster_points is called afresh for every check, as before.
        clusters = kmeans.cluster_points(test_cluster)
        self.assertEqual(
            coordinates,
            clusters[cluster_idx].data_points[point_idx].coordinates)
def _init_components(self, points, K, **kwargs):  # [5pts]
    """
    Initialise the mixture components from a K-means clustering.

    Args:
        points: NxD numpy array, the observations
        K: number of components
        kwargs: any other args you want (currently unused)
    Return:
        pi: numpy array of length K, the prior of each component (uniform)
        mu: KxD numpy array, the center for each gaussian (K-means centers)
        sigma: KxDxD numpy array, the full covariance matrix of each
            gaussian, estimated from the points K-means assigned to it
    """
    n_dims = points.shape[1]
    clusters_idx, mu, _ = KMeans()(points, K, max_iters=10000, verbose=False)

    # A uniform prior is always a valid distribution. The previous
    # `i / K` values did not sum to 1 and gave component 0 zero weight.
    pi = np.full(K, 1.0 / K)

    sigma = np.zeros((K, n_dims, n_dims))
    for k in range(K):
        # Boolean mask selects the cluster's members; the old
        # len(np.where(...)) measured the tuple length, which is always 1.
        members = points[clusters_idx == k]
        if len(members) > 1:
            centered = members - mu[k]
            sigma[k] = np.dot(centered.T, centered) / len(members)
        else:
            # Too few points to estimate a covariance: fall back to identity.
            sigma[k] = np.eye(n_dims)

    print("sigma shape {}".format(sigma.shape))
    return pi, mu, sigma
def score(self):
    """Score every i-vector set in ``self.ivecs`` against speaker centroids.

    For a set with a known ``num_speakers`` the centroids come from a
    scikit-learn K-means whose centers seed the PLDA-aware ``KMeans``;
    otherwise ``get_num_speakers`` supplies them. Scores are produced by
    ``self.plda.score`` when ``self.norm_list`` is None and by
    ``self.s_norm`` otherwise. Empty sets are skipped with a warning.

    :returns: scores keyed by the normalised path of each set's name
    :rtype: dict
    """
    scores_dict = {}
    for ivecset in self.ivecs:
        name = os.path.normpath(ivecset.name)
        ivecs = ivecset.get_all()
        loginfo('[Diarization.score] Scoring {} ...'.format(name))
        size = ivecset.size()

        if size <= 0:
            logwarning(
                '[Diarization.score] No i-vectors to score in {}.'.format(
                    ivecset.name))
            continue

        if ivecset.num_speakers is None:
            num_speakers, centroids = self.get_num_speakers(ivecs)
        else:
            # Never ask for more clusters than there are i-vectors.
            num_speakers = min(ivecset.num_speakers, size)
            sklearnkmeans = sklearnKMeans(n_clusters=num_speakers).fit(ivecs)
            refined = KMeans(sklearnkmeans.cluster_centers_, num_speakers,
                             self.plda)
            centroids = refined.fit(ivecs)

        if self.norm_list is None:
            scores_dict[name] = self.plda.score(
                ivecs, centroids, self.scale, self.shift)
        else:
            scores_dict[name] = self.s_norm(ivecs, centroids)
    return scores_dict
def get_num_speakers(self, ivecs, min_speakers=2, max_speakers=6):
    """ Estimate the number of speakers with the pretrained model.

        For every candidate count the i-vectors are clustered, the centroids
        are scored against each other with ``s_norm`` and the strictly
        lower-triangular scores are turned into features. ``self.model.test``
        is run on all feature sets and the candidate with the largest
        column sum of its output is chosen.

        :param ivecs: input i-vectors
        :type ivecs: numpy.array
        :param min_speakers: minimal number of speakers from model
        :type min_speakers: int
        :param max_speakers: maximal number of speakers from model
        :type max_speakers: int
        :returns: estimated number of speakers and KMeans centroid
        :rtype: tuple
    """
    centroids_list = []
    features = []
    for candidate in range(min_speakers, max_speakers + 1):
        sklearnkmeans = sklearnKMeans(n_clusters=candidate).fit(ivecs)
        refined = KMeans(sklearnkmeans.cluster_centers_, candidate, self.plda)
        centroids = refined.fit(ivecs)
        centroids_list.append(centroids)

        # Pairwise centroid scores below the diagonal (no self-scores).
        pair_scores = self.s_norm(centroids, centroids)[
            np.tril_indices(candidate, -1)]
        features.append(Normalization.get_features(pair_scores))

    votes = np.sum(self.model.test(features, prob=True), axis=0)
    best = np.argmax(votes)
    return best + min_speakers, centroids_list[best]
def test_assign_points(self):
    """
    Tests the assign_points method of the KMeans class.
    """
    X, _, centers = generate_cluster_samples()
    n_samples = X.shape[0]
    k = centers.shape[0]

    kmeans = KMeans(k, N_ITER)
    # Fix the centers by hand so the assignment is deterministic.
    kmeans.cluster_centers = centers
    assignments, distances = kmeans.assign_points(X)

    # Both outputs must be flat arrays with one entry per sample.
    for result in (assignments, distances):
        self.assertEqual(result.ndim, 1)
        self.assertEqual(result.shape[0], n_samples)

    # Only valid cluster indices (0 <= idx < k) may appear.
    in_range = np.logical_and(assignments < k, assignments >= 0)
    self.assertTrue(np.all(in_range))

    # Each consecutive block of samples must land in its own cluster.
    expected_blocks = (
        (slice(None, 25), 0),
        (slice(25, 50), 1),
        (slice(50, 75), 2),
        (slice(75, None), 3),
    )
    for block, cluster in expected_blocks:
        self.assertTrue(np.all(assignments[block] == cluster))
def test_initialize(self):
    """
    Tests the initialize_clusters method of the KMeans class.
    """
    k, n_samples, n_features = 3, 100, 10

    for _ in range(N_TRIALS):
        X = np.random.randn(n_samples, n_features)
        kmeans = KMeans(k, N_ITER)
        kmeans.initialize_clusters(X)

        # The center matrix must be k x n_features.
        centers = kmeans.cluster_centers
        self.assertEqual(centers.ndim, 2)
        self.assertEqual(centers.shape[0], k)
        self.assertEqual(centers.shape[1], n_features)

        # Every center has to be one of the points in X: the distances
        # returned by find_smallest_distances for X and the centers must
        # all be (nearly) zero.
        for d in find_smallest_distances(X, kmeans.cluster_centers):
            self.assertAlmostEqual(d, 0.0)
def test_whole(self):
    """
    Tests fitting end to end: the best of N_TRIALS models must reach a low
    score and recover the expected centers.
    """
    X, _, centers = generate_cluster_samples()
    n_samples = X.shape[0]
    k = centers.shape[0]

    # Run N_TRIALS fits and keep the model with the lowest score.
    best_model = None
    for _ in range(N_TRIALS):
        candidate = KMeans(k, N_ITER)
        candidate.fit(X)
        if best_model is None or candidate.score(X) < best_model.score(X):
            best_model = candidate

    # Average score per sample must be small.
    sum_squared_errors = best_model.score(X)
    self.assertLess(sum_squared_errors / n_samples, EPS)

    # Every distance reported between fitted and expected centers must be small.
    for distance in find_smallest_distances(best_model.cluster_centers,
                                            centers):
        self.assertLess(distance, EPS)
def test_update_centers(self):
    """
    Tests that update_centers rebuilds the centers from the assignments.
    """
    X, _, centers = generate_cluster_samples()
    k = centers.shape[0]
    n_features = X.shape[1]

    kmeans = KMeans(k, N_ITER)
    # Fix the centers by hand so the assignment is deterministic.
    kmeans.cluster_centers = centers
    assignments, distances = kmeans.assign_points(X)
    assignments = kmeans.reinitialize_empty_clusters(
        X, assignments, distances)

    # Wipe the centers so update_centers has to recompute them.
    kmeans.cluster_centers = np.zeros((k, n_features))
    kmeans.update_centers(X, assignments)

    # Norm of the difference between estimated and real centers,
    # averaged over the k clusters.
    error = np.linalg.norm(kmeans.cluster_centers - centers) / k
    self.assertLess(error, EPS)
def B4(pca=False):
    '''
    Evaluate using NMI and visualize in 2D.

    For each embedding file a KMeans model is fitted with its own cluster
    count, the third value of get_evals() is stored as the NMI, and up to
    1000 randomly chosen points are scattered with one random colour per
    cluster. The `pca` argument is not used by this function.
    '''
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv',
    ]
    cluster_counts = [8, 4, 2]
    nmi = zeros(len(fnames))

    for i, (k, fname) in enumerate(zip(cluster_counts, fnames)):
        raw = genfromtxt(fname, delimiter=',')
        X = raw[:, 2:]
        y = get_normalized_labels(raw[:, 1])

        kmeans = KMeans(n_clusters=k)
        ind = kmeans.fit(X, y)
        _, _, nmi[i] = kmeans.get_evals()

        figure()
        # Random subset of at most 1000 rows; drawn before the colours so
        # the random calls happen in the same order as before.
        chosen = permutation(X.shape[0])[:1000]
        colors = rand(k, 3)[ind[chosen], :]
        scatter(X[chosen, 0], X[chosen, 1], c=colors, alpha=0.9, s=30)

    print(fnames)
    print("NMI =", nmi)
    show()
def kmeans_trials(k=3, r=1):
    """Train ``r`` K-means models on the module-level ``data`` and plot them.

    Models are ranked by the last value of the error history returned by
    ``train``. Every trial is plotted in ascending order of that value,
    followed by one more plot of the best model.

    Args:
        k: number of clusters passed to ``KMeans`` and ``plotKClusters``.
        r: number of independent models to train.
    """
    # Create and train r models for trials
    models = [KMeans(k, data) for _ in range(r)]
    error_histories = [model.train(data) for model in models]

    # Rank the models by final sum-of-squares error, lowest first
    ranked = [(history[-1], model)
              for history, model in zip(error_histories, models)]
    ranked.sort(key=lambda pair: pair[0])

    # Plot trial results
    for trial_number, (sse, model) in enumerate(ranked, start=1):
        plt.title(f'Trial {trial_number} Cluster Assignments (SSE={round(sse, 2)})')
        plotKClusters(model, k, data)
        plt.show()

    # Show best model from r trials
    best_sse, best_model = ranked[0]
    plt.title(f"Best model (SSE={round(best_sse, 2)})")
    plotKClusters(best_model, k, data)
    plt.show()
def B1(pca=False):
    '''
    Plot WC_SSD and SC over K.

    Fits KMeans for every K on every embedding file, keeps the first two
    values of get_evals() as WC_SSD and SC, and draws one figure per
    metric with one curve per file. The `pca` argument is not used by this
    function.
    '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv',
    ]
    grid_shape = (len(fnames), len(K))
    wc_ssd_val = zeros(grid_shape)
    sc_val = zeros(grid_shape)

    for row, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for col, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[row, col], sc_val[row, col], _ = kmeans.get_evals()

    # Plot WC_SSD
    figure()
    for fname, curve in zip(fnames, wc_ssd_val):
        plot(K, curve, label=fname)
    legend()
    title('WC_SSD v.s. K')

    # Plot SC
    figure()
    for fname, curve in zip(fnames, sc_val):
        plot(K, curve, label=fname)
    legend()
    title('SC v.s. K')

    show()
def test_predict(self):
    """A model fitted with seed 1 must label the four probe samples 0, 1, 0, 1."""
    k_means = KMeans(num_clusters=self.num_clusters, seed=1)
    k_means.fit(self.data)

    probes = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
    predictions = k_means.predict(probes)

    self.assertEqual([0, 1, 0, 1], predictions)
def cluster_colors(img, n_clusters):
    """Cluster the colours of ``img`` and show the image repainted with centroids.

    The image is converted from BGR to RGB, flattened to one colour row per
    pixel and clustered. Every pixel is then painted with the centroid of
    its predicted cluster, and the result is displayed with a legend that
    maps cluster names to centroid colours.

    Args:
        img: image array as accepted by ``cv.cvtColor`` with
            ``cv.COLOR_BGR2RGB``.
        n_clusters: number of colour clusters.

    Returns:
        The fitted ``KMeans`` object.
    """
    kmeans = KMeans(n_clusters=n_clusters)
    color_vectors = cv.cvtColor(img, cv.COLOR_BGR2RGB).reshape([-1, 3])
    centroids = kmeans.fit(color_vectors)
    labels = kmeans.predict(color_vectors)
    label_map = labels.reshape(img.shape[:-1])

    # Paint each pixel with the centroid of its cluster.
    cluster_img = np.zeros(img.shape)
    for cluster in range(n_clusters):
        cluster_img[label_map == cluster] = centroids[cluster]
    cluster_img = cluster_img.astype(np.uint8)

    plt.figure(figsize=(10, 10))
    plt.imshow(cluster_img)

    legend_entries = []
    for cluster in range(n_clusters):
        legend_entries.append(
            mpatches.Patch(color=centroids[cluster] / 255,
                           label="Cluster {}".format(cluster)))
    plt.legend(handles=legend_entries,
               bbox_to_anchor=(1.05, 1),
               loc=2,
               borderaxespad=0.)
    plt.show()
    return kmeans
def test_select_initial_centroids(self):
    """With seed 3, the initial centroids picked after a fit must be the expected pair."""
    k_means = KMeans(num_clusters=self.num_clusters, seed=3)
    k_means.fit(self.data)

    initial_centroids = k_means._select_initial_centroids(self.data)

    self.assertEqual([[2, 1], [-1, -2]], initial_centroids)
    self.assertEqual(self.num_clusters, len(initial_centroids))
def spectral(X, sigma, k, centroids):
    """
    Ng spectral clustering algorithm.

    :param X: data points, one row per sample
    :param sigma: parameter forwarded to ``get_L``
    :param k: parameter forwarded to ``get_L``
    :param centroids: third argument handed to ``KMeans`` (presumably the
        initial centroids -- confirm against the KMeans class)
    :return: accu, the clustering accuracy
    """
    (n, d) = X.shape
    L_sym, _ = get_L(X, k, sigma)

    # NOTE(review): if L_sym is guaranteed symmetric, np.linalg.eigh would
    # be the more robust solver (real, sorted output) -- confirm first.
    eig, eigvec = np.linalg.eig(L_sym)  # eigenvectors are stored column-wise
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]

    # Scale every row of U to unit length in one vectorised step; the old
    # double loop recomputed the row norm for every single element.
    # Writing into a preallocated float array keeps the previous behaviour
    # of dropping any imaginary part np.linalg.eig may return.
    Y = np.zeros(U.shape)
    Y[:] = U / np.linalg.norm(U, axis=1, keepdims=True)

    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels

    # The accuracy treats samples [0, 100) as one true class and the rest
    # as the other; the class ids are aligned via the label of sample 0.
    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' % (sigma, k, accu))
    return accu
def main():
    """Scaffold of a two-stage HMM tagger built on word clusters.

    Trains ``Word2Vec`` on the Brown corpus, clusters the word vectors with
    ``KMeans`` and wires two ``HMM`` objects (word -> cluster and
    cluster -> tag). The probability tables are still placeholders (None).
    """
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(token) for token in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None  # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None  # count from tagged data
    p_initial_tag = None  # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster,
                           p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag,
                          p_cluster_given_tag)

    # Decode words into clusters, then clusters into tags.
    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def main():
    """Cluster a CSV dataset with K-means and print the score of the clusters."""
    # Other datasets available: "./data/self_test_petit.csv", "./data/iris.csv"
    filepath = "./data/self_test.csv"

    # load the data
    data, labels = load_dataset(filepath)

    # set up the KMeans object
    kmeans = KMeans(
        n_clusters=3,
        max_iter=100,
        early_stopping=True,
        tol=1e-6,
        display=True,
    )

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    purity = kmeans.score(data, labels)
    print("Pureté : {}".format(purity))

    input("Press any key to exit...")
def kmeans_image_compression():
    """Run K-means on the pixels of ``baboon.tiff`` and save the rebuilt image.

    Reads the image, divides it by 255, fits a 16-cluster ``KMeans`` on the
    per-pixel colour rows, rebuilds an image through ``transform_image``,
    prints the squared error per pixel and writes the result to
    ``plots/compressed_baboon.png``.
    """
    print("[+] K-Means Image Compression")
    image = plt.imread('baboon.tiff')
    height, width = image.shape[:2]
    n_pixels = height * width
    image = image / 255

    # One row per pixel, three colour columns.
    pixels = image.reshape(n_pixels, 3)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, n_updates = k_means.fit(pixels)
    print('[+] RGB centroids computed in {} iteration'.format(n_updates))

    compressed = transform_image(image, centroids)
    assert compressed.shape == image.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((image - compressed)**2) / n_pixels
    print('[+] Mean square error per pixel is {}\n'.format(mse))
    plt.imsave('plots/compressed_baboon.png', compressed)
def initialize_clusters(X, n_clusters):
    """
    Build the initial list of mixture clusters for the data matrix X.

    Parameter:
        X: Input feature matrix
        n_clusters: Number of clusters we are trying to classify

    Return:
        cluster: List of n_clusters dicts. Each holds a uniform weight
            'w_k', a mean 'mu_k' taken from the KMeans centers and an
            identity covariance 'cov_k'.
    """
    # We use the KMeans centroids to initialise the GMM
    # NOTE(review): KMeans() is built without n_clusters, so it must produce
    # at least n_clusters centers by default -- confirm.
    centers = KMeans().fit(X).centers

    return [
        {
            'w_k': 1.0 / n_clusters,
            'mu_k': centers[i],
            'cov_k': np.identity(X.shape[1], dtype=np.float64),
        }
        for i in range(n_clusters)
    ]
def kmeans_builder(centroid_func):
    """Run K-means on the 9-cluster toy dataset and validate its outputs.

    Saves scatter plots of the dataset (with real labels, plain, and with
    the predicted membership plus centroids) under ``plots/`` and asserts
    the shapes and types of what ``KMeans.fit`` returns.

    Args:
        centroid_func: passed through as the second argument of
            ``KMeans.fit`` (presumably the centroid initialisation routine
            -- confirm against the KMeans class).
    """
    samples_per_cluster = 50
    n_cluster = 9
    n_samples = samples_per_cluster * n_cluster

    x, y = toy_dataset(n_cluster, samples_per_cluster)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x, centroid_func)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    # Report the EXPECTED size; the message used to echo the actual length,
    # which made a failure read as if the wrong length were correct.
    assert membership.shape == (n_samples,), \
        'membership for toy dataset should be a vector of size {}'.format(n_samples)

    assert isinstance(i, int) and i > 0, \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')
def __init__(self,
             n_cluster: int,
             data: np.ndarray,
             use_kmeans: bool = False,
             w: float = 0.9,
             c1: float = 0.5,
             c2: float = 0.3,
             flag: int = 1,
             weights: list = None):
    """Set up centroids, best-so-far records and velocity.

    Args:
        n_cluster: number of centroids.
        data: samples; rows are drawn at random as starting centroids.
        use_kmeans: when True, replace the random start with the centroids
            of a ``KMeans`` fit (``init_pp=False``).
        w, c1, c2: stored as ``_w``, ``_c1``, ``_c2`` (presumably inertia
            and acceleration coefficients -- confirm against the update step).
        flag: odd values score with ``calc_sse``; even values score with
            ``calc_sse2``, which also receives ``weights``.
        weights: forwarded to ``calc_sse2`` only.
    """
    # The random draw happens even when K-means overrides it afterwards.
    picked = np.random.choice(list(range(len(data))), n_cluster)
    self.centroids = data[picked].copy()

    if use_kmeans:
        kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
        kmeans.fit(data)
        self.centroids = kmeans.centroid.copy()

    self.best_position = self.centroids.copy()
    self.best_score = quantization_error(
        self.centroids, self._predict(data), data)

    self.flag = flag
    uses_plain_sse = self.flag % 2 == 1
    if uses_plain_sse:
        self.best_sse = calc_sse(self.centroids, self._predict(data), data)
    else:
        self.best_sse = calc_sse2(
            self.centroids, self._predict(data), data, weights)

    self.velocity = np.zeros_like(self.centroids)
    self._w, self._c1, self._c2 = w, c1, c2
def kmeans_toy():
    """Run K-means on the 4-cluster toy dataset, validate and store the result.

    Saves scatter plots of the dataset (with real labels, plain, and with
    the predicted membership plus centroids) under ``plots/``, asserts the
    shapes and types of what ``KMeans.fit`` returns and writes everything
    to ``results/k_means_toy.npz``.
    """
    n_cluster = 4
    x, y = toy_dataset(4)
    xs, ys = x[:, 0], x[:, 1]

    fig = Figure()
    fig.ax.scatter(xs, ys, c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(xs, ys)
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'

    assert type(i) == int and i > 0, \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(xs, ys, c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz',
             centroids=centroids,
             step=i,
             membership=membership,
             y=y)
def cluster_newsgroups():
    """ Cluster newsgroup categories.

    Builds TF-IDF vectors for every category, prints the N highest-weighted
    dictionary entries per category, fits a K-cluster KMeans on the vectors,
    prints the label of each category and finally shows the similarity
    matrix plot.
    """
    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary

    categories = sorted(corpus.keys())

    # Every print below passes a single pre-formatted string: that prints
    # the same text under Python 2 (where the old print statements lived)
    # and is valid syntax under Python 3.
    N = 6
    print("\n{}-Most Common Words".format(N))
    for index, category in enumerate(categories):
        # argpartition isolates the N largest weights; argsort then orders
        # just those N, and [::-1] puts the largest first.
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print("{:>24} {}".format(category, dictionary[nlargest]))
    print("")

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)
    labels = km.labels_

    print("\nKMeans Label Assignment, K = {}".format(K))
    for category, label in zip(categories, labels):
        print("{} {}".format(int(label), category))

    simMatrix(newsgroups).plot().show()