def kmeans_builder(centroid_func):
    samples_per_cluster = 50
    n_cluster = 9

    x, y = toy_dataset(n_cluster, samples_per_cluster)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x, centroid_func)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
         .format(n_cluster))
    assert membership.shape == (samples_per_cluster * n_cluster,), \
        'membership for toy dataset should be a vector of size {}'.format(
            samples_per_cluster * n_cluster)
    assert type(i) == int and i > 0, \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

def kmeans_image_compression():
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255  # normalize RGB values to [0, 1]

    data = im.reshape(N * M, 3)
    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)
    print('RGB centroids computed in {} iterations'.format(i))

    new_im = transform_image(im, centroids)
    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('Mean square error per pixel is {}'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)

    np.savez('results/k_means_compression.npz', im=im, centroids=centroids,
             step=i, new_image=new_im, pixel_error=mse)

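# `transform_image` is not shown in this snippet; a minimal sketch, assuming it
# simply replaces every pixel with its nearest centroid colour, could look like
# the following (the name and signature come from the call above, the body is
# an assumption):
import numpy as np

def transform_image(im, centroids):
    # Sketch only: map each pixel to the closest centroid colour.
    h, w, c = im.shape
    flat = im.reshape(-1, c)
    # squared distance from every pixel to every centroid, shape (h*w, K)
    d2 = ((flat[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    return centroids[d2.argmin(axis=1)].reshape(h, w, c)
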
def B4(pca=False):
    ''' Evaluate using NMI and visualize in 2D. '''
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    nmi = zeros(len(fnames))
    for i, k, fname in zip([0, 1, 2], [8, 4, 2], fnames):
        raw = genfromtxt(fname, delimiter=',')
        X = raw[:, 2:]
        y = get_normalized_labels(raw[:, 1])
        kmeans = KMeans(n_clusters=k)
        ind = kmeans.fit(X, y)
        _, _, nmi[i] = kmeans.get_evals()

        figure()
        perm = permutation(X.shape[0])[:1000]
        X = X[perm]
        ind = ind[perm]
        colors = rand(k, 3)[ind, :]
        scatter(X[:, 0], X[:, 1], c=colors, alpha=0.9, s=30)

    print(fnames)
    print("NMI =", nmi)
    show()

def test_predict(self):
    test_samples = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
    expected_predictions = [0, 1, 0, 1]
    k_means = KMeans(num_clusters=self.num_clusters, seed=1)
    k_means.fit(self.data)
    predictions = k_means.predict(test_samples)
    self.assertEqual(expected_predictions, predictions)

def __init__(self, K_max, embedding_mats, vec_ids_dict, durations_dict,
             landmarks_dict, n_slices_min=0, n_slices_max=20, min_duration=0,
             p_boundary_init=0.5, init_assignments="rand", wip=0):

    # Attributes from parameters
    self.n_slices_min = n_slices_min
    self.n_slices_max = n_slices_max
    self.wip = wip

    # Process embeddings into a single matrix, and vec_ids into a list
    # (entry for each utterance)
    embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
        embedding_mats, vec_ids_dict  # , n_slices_min=n_slices_min
        )
    self.ids_to_utterance_labels = ids_to_utterance_labels
    N = embeddings.shape[0]

    # Initialize `utterances`
    lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
    landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
    durations = [durations_dict[i] for i in ids_to_utterance_labels]
    self.utterances = Utterances(
        lengths, vec_ids, durations, landmarks,
        p_boundary_init=p_boundary_init, n_slices_min=n_slices_min,
        n_slices_max=n_slices_max, min_duration=min_duration
        )

    # Embeddings in the initial segmentation
    init_embeds = []
    for i in range(self.utterances.D):
        init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
    init_embeds = np.array(init_embeds, dtype=int)
    init_embeds = init_embeds[np.where(init_embeds != -1)]
    print("No. initial embeddings: {}".format(init_embeds.shape[0]))

    # Initialize the K-means components
    assignments = -1 * np.ones(N, dtype=int)
    if init_assignments == "rand":
        assignments[init_embeds] = np.random.randint(0, K_max, len(init_embeds))
    elif init_assignments == "spread":
        n_init_embeds = len(init_embeds)
        assignment_list = (
            list(range(K_max)) * int(np.ceil(float(n_init_embeds) / K_max))
            )[:n_init_embeds]
        random.shuffle(assignment_list)
        assignments[init_embeds] = np.array(assignment_list)
    self.acoustic_model = KMeans(embeddings, K_max, assignments)

def cluster_colors(img, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    color_vectors = cv.cvtColor(img, cv.COLOR_BGR2RGB).reshape([-1, 3])
    centroids = kmeans.fit(color_vectors)
    labels = kmeans.predict(color_vectors)
    pred = labels.reshape(img.shape[:-1])

    # Initialize img for clusters
    cluster_img = np.zeros(img.shape)
    for i in range(n_clusters):
        cluster_img[np.where(pred == i)] = centroids[i]
    cluster_img = cluster_img.astype(np.uint8)

    plt.figure(figsize=(10, 10))
    plt.imshow(cluster_img)
    colors = ["Cluster {}".format(i) for i in range(n_clusters)]
    patches = [
        mpatches.Patch(color=centroids[i] / 255, label=colors[i])
        for i in range(len(colors))
    ]
    plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
    return kmeans

def kmeans_toy():
    x, y = toy_dataset(4)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    n_cluster = 4
    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
         .format(n_cluster))
    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'
    assert type(i) == int and i > 0, \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz', centroids=centroids, step=i,
             membership=membership, y=y)

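# The `toy_dataset` helper used above is not included here. A minimal sketch,
# assuming it draws `samples_per_cluster` 2-D Gaussian points around each of
# `n_cluster` random centers (the signature follows the two calls above; the
# body is an assumption):
import numpy as np

def toy_dataset(n_cluster, samples_per_cluster=50):
    # Sketch only: n_cluster Gaussian blobs in 2-D with integer labels.
    rng = np.random.RandomState(42)
    centers = rng.uniform(-10, 10, size=(n_cluster, 2))
    x = np.vstack([c + rng.randn(samples_per_cluster, 2) for c in centers])
    y = np.repeat(np.arange(n_cluster), samples_per_cluster)
    return x, y
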
def test_fit_with_different_initial_centroids(self):
    expected_labels = [0, 0, 0, 1, 1, 1]
    expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
    k_means = KMeans(num_clusters=self.num_clusters, seed=0)
    k_means.fit(self.data)
    self.assertEqual(expected_labels, k_means.labels_)
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)

def _initialise_prams(self, X):
    # Get initial clusters using KMeans
    kmeans = KMeans(k=self.k, max_iters=500)
    kmeans.fit(X)
    kmeans_preds = kmeans.predict(X)

    N, col_length = X.shape
    mixture_labels = np.unique(kmeans_preds)
    initial_mean = np.zeros((self.k, col_length))
    initial_cov = np.zeros((self.k, col_length, col_length))
    initial_pi = np.zeros(self.k)

    for index, mixture_label in enumerate(mixture_labels):
        mixture_indices = (kmeans_preds == mixture_label)
        Nk = X[mixture_indices].shape[0]
        # Initial pi
        initial_pi[index] = Nk / N
        # Initial mean
        initial_mean[index, :] = np.mean(X[mixture_indices], axis=0)
        # Initial covariance
        de_meaned = X[mixture_indices] - initial_mean[index, :]
        initial_cov[index] = np.dot(initial_pi[index] * de_meaned.T, de_meaned) / Nk

    # compare with a tolerance rather than exact float equality
    assert np.isclose(np.sum(initial_pi), 1)
    return initial_pi, initial_mean, initial_cov

def test02_non_fitted_model_raises_not_fitted_error_message(self):
    model = KMeans(k=2)
    try:
        model.predict(np.array([[1, 0], [0, 1]]))
        self.fail()
    except Exception as e:
        self.assertEqual(str(e), KMeans.NOT_FITTED_ERROR_MESSAGE)

def cluster_newsgroups():
    """ Cluster newsgroup categories. """

    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary
    categories = sorted(corpus.keys())

    N = 6
    print("\n{}-Most Common Words".format(N))
    for index, category in enumerate(categories):
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print("{:>24} {}".format(category, dictionary[nlargest]))
    print()

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)
    labels = km.labels_

    print("\nKMeans Label Assignment, K = {}".format(K))
    for category, label in zip(categories, labels):
        print(int(label), category)

    simMatrix(newsgroups).plot().show()

def main():
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for i, j in zip(lat, lon):
        k_means.points.append(Point(i, j))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    # NOTE(Jovan): First cluster
    for p in k_means._clusters[0].points:
        ax.scatter(p.x, p.y, c="#ff0000")
    # NOTE(Jovan): Second cluster
    for p in k_means._clusters[1].points:
        ax.scatter(p.x, p.y, c="#00ff00")

    # NOTE(Jovan): Plot cluster centers
    center1 = k_means._clusters[0].center
    center2 = k_means._clusters[1].center
    ax.scatter(center1.x, center1.y, marker="P", c="#ff0000")
    ax.scatter(center2.x, center2.y, marker="P", c="#00ff00")
    plt.show()

def spectral(X, sigma, k, centroids):
    """
    Ng spectral clustering algorithm.
    :param X: data points
    :param sigma: parameter
    :param k: parameter
    :return: accu, the clustering accuracy
    """
    (n, d) = X.shape
    L_sym, L = get_L(X, k, sigma)
    eig, eigvec = np.linalg.eig(L_sym)  # eigenvectors are stored column-wise
    # eig_index = np.argsort(eig)[1:d+1]
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]
    T = np.zeros(U.shape)
    for i in range(n):
        for j in range(d):
            T[i][j] = U[i][j] / np.linalg.norm(U[i])
    Y = T
    # visual(Y, k=k, sigma=sigma, save=1)

    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels
    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' % (sigma, k, accu))
    return accu

def __init__(self,
             n_cluster: int,
             data: np.ndarray,
             use_kmeans: bool = False,
             w: float = 0.9,
             c1: float = 0.5,
             c2: float = 0.3,
             flag: int = 1,
             weights: list = None):
    index = np.random.choice(list(range(len(data))), n_cluster)
    self.centroids = data[index].copy()
    if use_kmeans:
        kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
        kmeans.fit(data)
        self.centroids = kmeans.centroid.copy()
    self.best_position = self.centroids.copy()
    self.best_score = quantization_error(self.centroids, self._predict(data), data)
    self.flag = flag
    if self.flag % 2 == 1:
        self.best_sse = calc_sse(self.centroids, self._predict(data), data)
    else:
        self.best_sse = calc_sse2(self.centroids, self._predict(data), data, weights)
    self.velocity = np.zeros_like(self.centroids)
    self._w = w
    self._c1 = c1
    self._c2 = c2

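# `calc_sse` and `quantization_error` are helpers from the surrounding project
# and are not shown here. A minimal sketch of an SSE helper, assuming `labels`
# indexes rows of `centroids` (an assumption, not the project's actual code):
import numpy as np

def calc_sse(centroids: np.ndarray, labels: np.ndarray, data: np.ndarray) -> float:
    # Sum of squared distances between each point and its assigned centroid.
    distances = np.linalg.norm(data - centroids[labels], axis=1)
    return float(np.sum(distances ** 2))
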
def test_select_initial_centroids(self):
    expected_initial_centroids = [[2, 1], [-1, -2]]
    k_means = KMeans(num_clusters=self.num_clusters, seed=3)
    k_means.fit(self.data)
    initial_centroids = k_means._select_initial_centroids(self.data)
    self.assertEqual(expected_initial_centroids, initial_centroids)
    self.assertEqual(self.num_clusters, len(initial_centroids))

def main2():
    df = pd.read_csv('credit_card_data.csv')
    df = df.fillna(df.median())
    original_data = df.iloc[:, 1:].values
    data = copy.deepcopy(original_data)
    columns = list(df.columns)[1:]  # list of column names
    print(columns)
    # min_max_data(df, columns)

    normalizacija(data)  # normalize the loaded data

    pca = PCA()
    pca.fit(data)
    # decide how far to reduce the dimensionality
    plt.plot(range(1, 18), pca.explained_variance_ratio_.cumsum(), marker='x', linestyle='--')
    plt.xlabel('Components')  # features
    plt.ylabel('Variance')
    plt.show()

    components = 7  # chosen from the plot above
    pca = PCA(n_components=components)
    pca.fit(data)
    scores = pca.transform(data)
    # print(scores)  # has as many components as we requested

    # show that the first two components influence the plot the most
    plt.bar(range(pca.n_components_), pca.explained_variance_ratio_, color='black')
    plt.xlabel('PCA components')
    plt.ylabel('Variance %')  # percentage of variance each component explains
    plt.xticks(range(pca.n_components_))
    plt.show()

    # the elbow plot gives an optimal k = 5 for the first 500 rows
    # and k = 6 for the full dataset
    # optimal_k_plot(data)
    broj_klastera = 6
    k_means = MyKMeans(n_clusters=broj_klastera, max_iter=100)
    k_means.fit(scores, normalize=False)
    klaster_indeksi = k_means.klaster_indeksi
    print(klaster_indeksi)

    # group the original (unnormalized) rows by cluster index
    lista_klastera_sa_originalnim_podacima = []
    for i in range(broj_klastera):
        lista_klastera_sa_originalnim_podacima.append([])
    for i in range(len(original_data)):
        lista_klastera_sa_originalnim_podacima[klaster_indeksi[i]].append(original_data[i])

    # print cluster characteristics and the decision tree
    print_descriptions(lista_klastera_sa_originalnim_podacima, columns)
    # print_decision_tree(original_data, klaster_indeksi, columns)
    print_clusters_description()

    # plot the points
    plot_2_D(k_means)

def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None  # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None  # count from tagged data
    p_initial_tag = None  # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)

def main():
    filepath = "./data/self_test.csv"
    #filepath = "./data/self_test_petit.csv"
    #filepath = "./data/iris.csv"

    # load the dataset
    data, labels = load_dataset(filepath)

    # initialize the KMeans object
    kmeans = KMeans(n_clusters=3, max_iter=100, early_stopping=True, tol=1e-6, display=True)

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    score = kmeans.score(data, labels)
    print("Purity: {}".format(score))

    input("Press any key to exit...")

def test_whole(self):
    """ Tests the score method. """
    X, y, centers = generate_cluster_samples()
    n_samples = X.shape[0]
    n_features = X.shape[1]
    k = centers.shape[0]

    # run N_TRIALS, pick best model
    best_model = None
    for i in range(N_TRIALS):
        kmeans = KMeans(k, N_ITER)
        kmeans.fit(X)
        if best_model is None:
            best_model = kmeans
        elif kmeans.score(X) < best_model.score(X):
            best_model = kmeans

    # check sum squared errors
    sum_squared_errors = best_model.score(X)
    self.assertLess(sum_squared_errors / n_samples, EPS)

    # compare centers to expected centers
    smallest_distances = find_smallest_distances(best_model.cluster_centers, centers)
    for distance in smallest_distances:
        self.assertLess(distance, EPS)

def do_KMeans_clustering(N_cluster, X, device):
    """
    Use KMeans clustering to label the training data according to its
    proximity to a cluster.

    Input:
        N_cluster: number of clusters estimated by Gap Statistics
        X: training data for the input layer
    Output:
        cluster_label: label assigned to every point
        over_coef: used by the oversampling method to increase the number of
                   points in the less densely populated cluster regions
    """
    X = X.to(device)

    # Instantiate the kmeans object
    kmeans = KMeans(n_clusters=N_cluster, mode='euclidean', verbose=1)
    cluster_label = kmeans.fit_predict(X)

    # Calculate the size of each cluster (number of points assigned to it)
    cluster_size = torch.zeros(N_cluster, dtype=torch.int32).to(device)
    for cluster in range(N_cluster):
        cluster_size[cluster] = len(torch.where(cluster_label == cluster)[0])

    # Oversampling coefficient per cluster, capped at 10
    over_coef = torch.zeros(N_cluster, dtype=torch.int32).to(device)
    for cluster in range(N_cluster):
        over_coef[cluster] = torch.clone(max(cluster_size) / cluster_size[cluster]).to(device)
        if over_coef[cluster] > 10:
            over_coef[cluster] = 10

    return cluster_label.cpu(), over_coef.cpu()

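# Hypothetical follow-up (not part of the original code): the returned
# over_coef can be used to oversample the smaller clusters by repeating each
# sample according to its cluster's factor. `X_train`, `n_clusters` and
# `device` are placeholders assumed to exist.
import torch

labels, over_coef = do_KMeans_clustering(n_clusters, X_train, device)
repeats = over_coef[labels.long()]                      # per-sample repeat count
X_oversampled = torch.repeat_interleave(X_train.cpu(), repeats, dim=0)
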
def B1(pca=False):
    ''' Plot WC_SSD and SC over K. '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    wc_ssd_val = zeros((len(fnames), len(K)))
    sc_val = zeros((len(fnames), len(K)))
    for i, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for j, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[i, j], sc_val[i, j], _ = kmeans.get_evals()

    # Plot WC_SSD
    figure()
    for i, fname in enumerate(fnames):
        plot(K, wc_ssd_val[i], label=fname)
    legend()
    title('WC_SSD v.s. K')

    figure()
    for i, fname in enumerate(fnames):
        plot(K, sc_val[i], label=fname)
    legend()
    title('SC v.s. K')
    show()

def __init_parameters(self):
    N = self.X.shape[0]
    n_features = self.X.shape[1]

    kmeans = KMeans(n_clusters=self.n_components, n_init=5)
    kmeans.fit(self.X)

    # mu, means for each component
    self.means_ = kmeans.cluster_centers_
    # sigma, covariances for each component
    self.covariances_ = np.zeros([self.n_components, n_features, n_features])
    # pi, weights for each component
    self.weights_ = np.zeros(self.n_components)

    for k in range(self.n_components):
        logic = (kmeans.labels_ == k)
        Nk = logic.sum()
        # otherwise error
        if Nk > 1:
            Xk = self.X[logic]
            self.covariances_[k] = np.cov(Xk.T)
        self.weights_[k] = Nk / N

    # gamma(Znk)
    self.gamma = np.zeros([N, self.n_components])
    # log_likelihood
    self.lower_bound_ = -np.inf
    return self

def kmeans_image_compression():
    print("[+] K-Means Image Compression")
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255  # normalize RGB values to [0, 1]
    data = im.reshape(N * M, 3)
    # print(im)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)
    # print(centroids.shape)
    print('[+] RGB centroids computed in {} iterations'.format(i))

    new_im = transform_image(im, centroids)
    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('[+] Mean square error per pixel is {}\n'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)

def test_cluster_points_two_cluster(self):
    test_vector = self.create_test_data_vector()
    kmeans = KMeans(test_vector, 2)

    test_point0 = datapoint.DataPoint()
    test_point0.add_dimension(1.1)
    test_point0.add_dimension(2.1)
    test_point0.add_dimension(3.1)

    test_point1 = datapoint.DataPoint()
    test_point1.add_dimension(3.1)
    test_point1.add_dimension(1.1)
    test_point1.add_dimension(2.1)

    test_cluster = datapoint.DataVector()
    test_cluster.add_point(test_point0)
    test_cluster.add_point(test_point1)

    self.assertEqual(
        [1.0, 2.0, 3.0],
        kmeans.cluster_points(test_cluster)[0].data_points[0].coordinates)
    self.assertEqual(
        [2.0, 3.0, 1.0],
        kmeans.cluster_points(test_cluster)[0].data_points[1].coordinates)
    self.assertEqual(
        [3.0, 1.0, 2.0],
        kmeans.cluster_points(test_cluster)[1].data_points[0].coordinates)

def test_assign_points(self):
    """ Tests the assign_points method of the KMeans class. """
    X, y, centers = generate_cluster_samples()
    n_samples = X.shape[0]
    k = centers.shape[0]

    kmeans = KMeans(k, N_ITER)
    # Set cluster centers so that assignment is deterministic
    kmeans.cluster_centers = centers
    assignments, distances = kmeans.assign_points(X)

    # check assignment array shape
    self.assertEqual(assignments.ndim, 1)
    self.assertEqual(assignments.shape[0], n_samples)

    # check distances array shape
    self.assertEqual(distances.ndim, 1)
    self.assertEqual(distances.shape[0], n_samples)

    # check that assignments only include valid cluster indices (0 <= idx < k)
    self.assertTrue(
        np.all(np.logical_and(assignments < k, assignments >= 0)))

    # Check cluster assignments are correct
    self.assertTrue(np.all(assignments[:25] == 0))
    self.assertTrue(np.all(assignments[25:50] == 1))
    self.assertTrue(np.all(assignments[50:75] == 2))
    self.assertTrue(np.all(assignments[75:] == 3))

def test_initialize(self):
    """ Tests the initialize methods of the KMeans class. """
    k = 3
    n_samples = 100
    n_features = 10

    for i in range(N_TRIALS):
        X = np.random.randn(n_samples, n_features)
        kmeans = KMeans(k, N_ITER)
        kmeans.initialize_clusters(X)

        # ensure that the cluster_centers matrix has the right shape
        self.assertEqual(kmeans.cluster_centers.ndim, 2)
        self.assertEqual(kmeans.cluster_centers.shape[0], k)
        self.assertEqual(kmeans.cluster_centers.shape[1], n_features)

        # Check that every center is one of the points in X:
        # calculate the distances between every cluster center and every
        # point in X, find the closest matches, and check that those
        # distances are nearly 0.0.
        distances = find_smallest_distances(X, kmeans.cluster_centers)
        for d in distances:
            self.assertAlmostEqual(d, 0.0)

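# `find_smallest_distances` is a test helper that is not shown. A minimal
# sketch consistent with how the two tests above call it (the nearest-neighbour
# distance of each row of the second argument among the rows of the first) --
# an assumption, not the actual helper:
import numpy as np

def find_smallest_distances(points, queries):
    # For every row of `queries`, the Euclidean distance to its nearest row in `points`.
    diffs = queries[:, None, :] - points[None, :, :]
    return np.sqrt((diffs ** 2).sum(axis=2)).min(axis=1)
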
def test_kmeans(self):
    locations = [[1, 1], [1, 2], [2, 1], [1, 3], [3, 1], [2, 2],
                 [10, 10], [10, 20], [20, 10], [10, 30], [30, 10], [20, 20]]
    clusterer = KMeans(2)
    clusterer.train(locations)

def plot_elbow(interval, data, random_seed=None):
    inertia = []
    for n_clusters in interval:
        clf = KMeans(k=n_clusters, init='kmeans++', random_seed=random_seed)
        clf.fit(data)
        inertia.append(clf.inertia)
    plot_metrics(interval, inertia, 'Elbow method',
                 'Number of clusters (K)', 'Sum of Squared Error')

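# `plot_metrics` is assumed to be a small plotting helper of the surrounding
# project; a sketch consistent with the call above (the behaviour is an
# assumption, not the project's actual helper):
import matplotlib.pyplot as plt

def plot_metrics(x_values, y_values, title, xlabel, ylabel):
    # Plot one metric curve with labelled axes.
    plt.figure()
    plt.plot(x_values, y_values, marker='o')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
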
def test06_fit_two_clusters(self):
    np.random.seed(1)
    model = KMeans(k=2, init=init.forgy_initialization)
    data = np.array([[-1.0, 0.0], [-1.001, 0.0], [-0.999, 0.0],
                     [0.0, 1.0], [0.0, 0.999], [0.0, 1.001]])
    model.fit(data)
    self.assertEqual(model.predict(data), [1, 1, 1, 0, 0, 0])

def test_fit(self):
    expected_labels = [0, 0, 0, 1, 1, 1]
    expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
    expected_inertia = 2.6666667
    k_means = KMeans(num_clusters=self.num_clusters, seed=1)
    k_means.fit(self.data)
    self.assertEqual(expected_labels, k_means.labels_)
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
    self.assertAlmostEqual(expected_inertia, k_means.inertia_)

def squared_clustering_errors(inputs, k):
    """finds the total squared error from k-means clustering the inputs"""
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means()
    assignments = map(clusterer.classify, inputs)

    return sum(squared_distance(input, means[cluster])
               for input, cluster in zip(inputs, assignments))

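# A short usage sketch for the helper above (assuming `inputs` is an existing
# list of points and matplotlib is available), plotting total error against k
# to pick a cluster count by the elbow method:
import matplotlib.pyplot as plt

ks = range(1, 10)
errors = [squared_clustering_errors(inputs, k) for k in ks]  # `inputs` assumed to exist

plt.plot(ks, errors)
plt.xticks(ks)
plt.xlabel("k")
plt.ylabel("total squared error")
plt.show()
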
def task3(dataset):
    dimensions = len(dataset.training[0].data)
    print('k-means')
    for k in [9, 10, 20]:
        kmeans = KMeans(dimensions, k)
        kmeans.train(dataset.training)
        predictions = [kmeans(x) for x in dataset.testing]
        print('k=', k, ' ', sep='', end='')
        print_error(predictions, dataset.testing)

def process_articles(input_file, num_partitions=8):
    sc = SparkContext()
    try:
        input_rdd = sc.textFile(input_file)
        vectorized_docs = CorpusVectorizer(input_rdd).vectorize_corpus()
        centroids = KMeans(vectorized_docs).centroids
        print(centroids.take(4))
    except Exception as e:
        print("Unable to load file", file=sys.stderr)
        print(e, file=sys.stderr)
        sys.exit(0)

def ClusterQueryDoc(dataset, rankerPath, feature_count, path_train_dataset,
                    path_test_dataset, iterations, click_model, clusterData,
                    queryDataPath, from_var, to_var):
    C = Fake(dataset, path_train_dataset, rankerPath, feature_count)
    C.Save()
    bestRankersFile = 'QueryData/' + dataset + '.data'
    KM = KMeans(from_var, to_var, bestRankersFile, dataset)
    (queryToCluster, clusterToRanker) = KM.runScript()
    g = GroupRanker(path_train_dataset, path_test_dataset, feature_count,
                    iterations, click_model, dataset, clusterData, queryDataPath)
    g.groupRanker()

def _compute_k_means_clusters(data, similarity_calculator, similarity_diff_threshold):
    computed_clusters = {}
    k_means = KMeans(data.persons, similarity_calculator)
    for personID in data.originalPeople:
        friends_of_person = data.persons.getPerson(personID).getFriends()
        if len(friends_of_person) > 250:
            k = 12
        else:
            k = 6
        clusters = k_means.computeClusters(friends_of_person, k, similarity_diff_threshold)
        computed_clusters[personID] = clusters
    return computed_clusters

def calculate_em(X, n_clusters, diag=False, ridge=1e-10, verbose=False, max_iterations=100):
    """ Returns mu, sigma and tpi """
    n_samples, n_features = X.shape

    # Initialise the data using kmeans
    k_means = KMeans(k=n_clusters)
    k_means_labels, _ = k_means.fit(X.copy())
    k_means_cluster_centers = k_means.centers_

    # OK, so we've got the centers and the labels. Let's now compute the EM
    # algorithm
    tau = np.zeros((n_samples, n_clusters))
    mu = np.zeros((n_clusters, n_features))
    sigma = np.zeros((n_clusters, n_features, n_features))
    p = np.zeros((n_clusters, n_samples))

    # FIXME should be able to do the following using pure matrix arithmetic
    for i, element in enumerate(k_means_labels):
        tau[i, element] = 1

    for j in range(max_iterations):
        old_mu = mu.copy()
        for i in range(n_clusters):
            mu[i] = (tau[:, i].reshape((tau.shape[0], 1)) * X).sum(axis=0) / (tau[:, i]).sum()

        for i in range(n_clusters):
            a = 0
            for n in range(n_samples):
                b = (X[n, :] - mu[i]).reshape((2, 1))
                if diag:
                    a += tau[n, i] * np.dot(b.T, b)
                else:
                    a += tau[n, i] * np.dot(b, b.T)
            if diag:
                sigma[i, :] = a.mean() / tau[:, i].sum() * np.identity(mu.shape[1])
            else:
                sigma[i, :] = a / tau[:, i].sum()

        tpi = tau.sum(axis=1) / n_samples

        for i in range(n_clusters):
            p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])

        for i in range(n_clusters):
            tau.T[i, :] = tpi[i] * p[i, :] / (tpi * p).sum(axis=0)

        if ((old_mu - mu) ** 2).sum() < ridge:
            if verbose:
                print("break at iteration %d" % j)
            break

    return mu, sigma, tpi

class Reducer:

    def __init__(self):
        self.k = int(self.params.get("k", "10"))
        self.max_iterations = int(self.params.get("max_iterations", "100"))
        self.kmeans = KMeans(self.k, self.max_iterations)

    def __call__(self, key, values):
        # convert input to numpy arrays and feed the vectors to the KMeans instance
        for _vid, _vector_array in enumerate(values):
            _vector_array = numpy.array(_vector_array)
            self.kmeans.add_vector(_vid, _vector_array)

        self.kmeans.initialize()
        for _cluster in self.kmeans.run():
            for item in _cluster:
                yield item.cid, item

def apply_decluster(self):
    """ apply window method to the whole catalog and write mainshocks on file """
    # get instances of classes we'll need
    catalog = Catalog()
    kmeans = KMeans()

    # from the catalog we want, get earthquakes array on memory
    earthquake_array = catalog.get_earthquake_array('../catalogs/new_jma.txt')

    # decluster array, separating mainshocks and aftershocks
    declustered_array = kmeans.do_kmeans(earthquake_array)

    # record the mainshocks on a catalog
    catalog.record_mainshocks(declustered_array,
                              file_write='../results/mainshocks.txt',
                              file_read='../catalogs/jma.txt')

def main(args):
    df = pd.read_csv(args.data_csv)
    data = np.array(df[['X', 'Y']])

    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')

    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)

    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()

def apply_decluster_smaller(self):
    """ apply window method to a smaller catalog and write mainshocks on file """
    # get instances of classes we'll need
    catalog = Catalog()
    kmeans = KMeans()

    # obtain a smaller catalog, so we can run this function faster
    catalog.get_smaller_catalog(300)

    # from the catalog we want, get earthquakes array on memory
    earthquake_array = catalog.get_earthquake_array()

    # decluster array, separating mainshocks and aftershocks
    declustered_array = kmeans.do_kmeans(earthquake_array, 25)

    # record the mainshocks on a catalog
    catalog.record_mainshocks(declustered_array,
                              file_write='../results/mainshocks.txt',
                              file_read='../catalogs/reduced_jma.txt')

def kmeans():
    filename = 0
    threshold = 0
    numClusters = 4
    filename = 'data/4clusters.csv'

    kmeans = KMeans(filename, numClusters)
    clusters = kmeans.cluster()

    formatted = dict()
    for i, cluster in enumerate(clusters):
        formatted[i] = []
        for point in cluster:
            #f_cluster = dict()
            #f_cluster[point[0]] = point[1]
            #formatted[i].append(f_cluster)
            formatted[i].append(point)
    print(formatted)
    return {'clusters': formatted, 'k': len(formatted), 'get_url': app.get_url}

def train(self, X, Y, beta, nb_epochs=5, normalize=False, gradients=True, alpha=1e-3):
    # m = nb_examples
    # n = nb_features
    # k = nb_hidden (number of neurons in the hidden layer)
    (m, n), k = X.shape, self.nb_hidden

    # K-Means for the input layer:
    self.km = KMeans(nb_cluster=k)
    # select k centroids from the training set:
    self.km.train(X, nb_iters=100, init_from_train=True)
    # set the centroids as the weights of the first layer:
    self.weights[0] = self.km.get_centroids()

    # compute the activation of the RBF layer:
    # in a vectorized way:
    A = np.array([np.exp(-beta * np.sum(np.power(self.weights[0] - x, 2), axis=1))
                  for x in X]).reshape(m, k)
    # in a naive way:
    # A, betas = np.zeros((m, k)), np.ones((k, 1)) * beta
    # for i in range(m):
    #     A[i] = self.get_activation(X[i, :], betas).T

    # speeds up convergence and is necessary for the pseudo-inverse method
    if normalize:
        # divide each row by its sum
        A = A / np.sum(A, axis=1)[:, None]

    # for the bias
    A = add_column_with_ones(A)

    if gradients:
        errors = []
        while not reached_precision(errors, precision=1e-7):
            # the same process as used in linear regression, i.e.
            # use gradient descent to minimize the loss function
            self.weights[1] = self.gradient_descent(A, Y, beta, alpha)
            # calculate the errors with the trained weights:
            errors.append(np.sum(np.power(self.predict(X, beta, normalize=normalize) - Y, 2)) / m)
    else:
        # learn the weights using the pseudo-inverse:
        self.weights[1] = pinv(A.T.dot(A)).dot(A.T.dot(Y))
        # calculate the errors with the trained weights:
        errors = [0, np.sum(np.power(self.predict(X, beta, normalize=normalize) - Y, 2)) / m]

    return errors

def main():
    dataset = _load_csv_data(CSV_PATH, CSV_COLUMN_DELIMITER)
    k = 2
    max_iter = 10

    handler = KMeans(dataset, k=k)
    handler.kmeans()
    while k < max_iter:
        handler.reinitialize(k=k)
        handler.kmeans()
        k += 1

def compare_window_kmeans(num_entries):
    """
    Receives a number of entries and applies both the window method and the
    kmeans method to decluster a catalog with that many entries, then reports
    how much the two methods differ.
    Complexity: O(n^2)
    """
    # obtain a smaller catalog
    catalog = Catalog()
    catalog.get_smaller_catalog(num_entries)

    # get the earthquake array of that catalog
    quakes = catalog.get_earthquake_array()

    # get the declustered array of that catalog, according to the window method
    window = Window()
    window_quakes = window.decluster(quakes)

    # count the mainshocks found by the window method
    num_mainshocks = 0
    for i in range(len(window_quakes)):
        if window_quakes[i].is_aftershock == False:
            num_mainshocks += 1
    print(num_mainshocks)

    # apply declustering using the kmeans method to the catalog
    kmeans = KMeans()
    kmeans_quakes = kmeans.do_kmeans(quakes, num_mainshocks)

    # show where the two methods disagree
    for i in range(len(quakes)):
        if window_quakes[i].is_aftershock != kmeans_quakes[i].is_aftershock:
            print("found a difference!")

def main():
    print("K-Means algorithm illustrated through the iris dataset")
    print("The algorithm uses random initialization and iterates until no iris")
    print("switches clusters.")
    print("If matplotlib is installed, the resulting clusters are illustrated")
    print("in a graphical manner.")
    print()

    # import the data
    irii = import_csv()

    # create and initialize the algorithm
    kmeans = KMeans(3, irii, euclidean_similarity)

    # run the algorithm
    num_iterations = kmeans.run()
    print("K-Means ran in %d iterations" % (num_iterations))
    print()

    print("SSE Values for each cluster:")
    for cluster_num, sse in enumerate(kmeans.sses()):
        num_members = len(kmeans.clusters[cluster_num])
        print("Cluster %d (%d members): %f" % (cluster_num, num_members, sse))
    print()

    # create a plot
    try:
        print("Plotting sepal length vs sepal width.")
        print("Colors indicate the cluster, as identified by k-means across all")
        print("attributes. The symbol indicates the 'correct' group, as")
        print("determined by the name of the IRIS. The black + symbols indicate")
        print("the centroids.")
        create_plot(kmeans)
    except:
        print("There was a plotting error. Do you have matplotlib installed?")

def urf_games():
    my_kmeans = KMeans()
    results = []
    results.append({"Control: Wards placed/Wards destroyed": my_kmeans.calculate('control')})
    results.append({'Damage: Physical/Magic/True': my_kmeans.calculate('damage')})
    results.append({'Economy: Gold Earned/Gold Spent': my_kmeans.calculate('economy')})
    results.append({'Kills: Kills/Deaths/Assists': my_kmeans.calculate('kills')})
    results.append({'Multi Kills: Combo Kills/Killing Sprees/Largest Killing Spree': my_kmeans.calculate('multi_kills')})
    return render_template('results.html', data=results)

from kmeans import KMeans
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

path_to_file = "casino1.jpg"
img = mpimg.imread(path_to_file)
pixels = [pixel for row in img for pixel in row]

clusterer = KMeans(5)
clusterer.train(pixels)  # this might take a while

def recolor(pixel):
    cluster = clusterer.classify(pixel)  # index of the closest cluster
    return clusterer.means[cluster]

new_img = [[recolor(pixel) for pixel in row]  # recolor this row of pixels
           for row in img]

plt.imshow(new_img)
plt.axis('off')
plt.show()

def __init__(self, n_clusters, initCent, max_iter):
    self.data = np.array([])
    self.belongApp = np.array([])
    self.n_clusters = n_clusters
    self.clf = KMeans(n_clusters, initCent, max_iter)

class ClsfCRTL(object):

    def __init__(self, n_clusters, initCent, max_iter):
        self.data = np.array([])
        self.belongApp = np.array([])
        self.n_clusters = n_clusters
        self.clf = KMeans(n_clusters, initCent, max_iter)

    def genDataset(self, file_name):
        dataSet, belongApp = [], []
        f = open(file_name, "r")
        lines = f.readlines()
        for line in lines:
            line_elm = line.split("\t")
            dataSet.append([int(line_elm[0]), 0])
            belongApp.append(line_elm[1].rstrip("\n"))
        self.data = np.array(dataSet)
        self.belongApp = np.array(belongApp)
        f.close()

    def clsf(self):
        self.clf.fit(self.data)

    def show(self):
        cents = self.clf.centroids
        labels = self.clf.labels
        sse = self.clf.sse
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868',
                  '#00FF00', '#330000', '#333300', '#333333', '#CC0099', '#FFFF00',
                  '#FF99CC', '#CCCC66', '#003333', '#66FFFF']
        for i in range(self.n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = self.data[index, 0]
            x1 = self.data[index, 1]
            y_i = self.belongApp[index]
            for j in range(len(x0)):
                plt.scatter(x0[j], x1[j], marker='o', color=colors[i])
                # plt.text(x0[j], x1[j], str(y_i[j]), color=colors[i],
                #          fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=5)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([0, 1600, -2, 2])
        plt.show()

    def showBar(self):
        n = 1600
        X = np.arange(n)
        Y1 = (1 - X / float(n) * np.random.uniform(0.5, 1.0, n))
        rect = plt.bar(X, +Y1, facecolor='#524C90', edgecolor='white')
        for x, y in zip(X, Y1):
            plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
        plt.xlim(-0.5, 12.5)
        plt.ylim(-0.1, +1.25)
        plt.xlabel("xlabel")
        plt.ylabel("ylabel")
        plt.title("title")
        plt.legend((rect,), ("example",))
        plt.show()

    def genResFile(self, i):
        cents = self.clf.centroids
        sse = self.clf.sse
        f = open("Res" + "-" + str(i), "w")
        f.write(str(cents.shape[0]) + '\n')
        for cent in cents:
            f.write(str(cent[0]) + '\t' + str(cent[1]) + '\n')
        f.write(str(sse) + '\n')
        # test
        f.write("\n")
        for clu in self.clf.clusterAssment:
            f.write(str(clu[0]) + '\t' + str(clu[1]) + '\n')
        # test
        f.close()

iris_data = load_iris()  # load the Iris dataset
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])
plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- INITIALIZE AND APPLY THE K-MEANS ALGORITHM --- #
# TODO 2: K-means on the Iris dataset
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data, normalize=True)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the cluster centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- DETERMINING THE OPTIMAL K --- #

def doTrain(self, feats, clusters, maxIters=1024, epsilon=1e-4):
    # Initialise using kmeans...
    km = KMeans()
    kmAssignment = numpy.empty(feats.shape[0], dtype=numpy.float_)
    km.train(feats, clusters, assignOut=kmAssignment)

    # Create the assorted data structures needed...
    mix = numpy.ones(clusters, dtype=numpy.float_) / float(clusters)
    mean = numpy.empty((clusters, feats.shape[1]), dtype=numpy.float_)
    for c in xrange(clusters):
        mean[c, :] = km.getCentre(c)

    sd = numpy.zeros(clusters, dtype=numpy.float_)
    tempCount = numpy.zeros(clusters, dtype=numpy.int_)
    for f in xrange(feats.shape[0]):
        c = kmAssignment[f]
        dist = ((feats[f, :] - mean[c, :])**2).sum()
        tempCount[c] += 1
        sd += (dist - sd) / float(tempCount[c])
    sd = numpy.sqrt(sd / float(feats.shape[1]))

    wv = numpy.ones((feats.shape[0], clusters), dtype=numpy.float_)  # Weight vectors calculated in e-step.
    pwv = numpy.empty(clusters, dtype=numpy.float_)  # For convergence detection.
    norms = numpy.empty(clusters, dtype=numpy.float_)  # Normalising constants for the distributions, to save repeated calculation.
    sqrt2pi = math.sqrt(2.0 * math.pi)

    # The code...
    code = """
    for (int iter=0;iter<maxIters;iter++)
    {
     // e-step - for all features calculate the weight vector (also do convergence detection)...
     for (int c=0;c<Nmean[0];c++) {norms[c] = pow(sqrt2pi*sd[c], Nmean[1]);}

     bool done = true;
     for (int f=0;f<Nfeats[0];f++)
     {
      float sum = 0.0;
      for (int c=0;c<Nmean[0];c++)
      {
       float distSqr = 0.0;
       for (int i=0;i<Nmean[1];i++)
       {
        float diff = FEATS2(f,i) - MEAN2(c,i);
        distSqr += diff*diff;
       }

       pwv[c] = WV2(f,c);

       float core = -0.5*distSqr / (sd[c]*sd[c]);
       WV2(f,c) = mix[c]*exp(core); // Unnormalised.
       WV2(f,c) /= norms[c]; // Normalisation

       sum += WV2(f,c);
      }

      for (int c=0;c<Nmean[0];c++)
      {
       WV2(f,c) /= sum;
       done = done && (fabs(WV2(f,c)-pwv[c])<epsilon);
      }
     }

     if (done) break;

     // Zero out mix, mean and sd, ready for filling...
     for (int c=0;c<Nmean[0];c++)
     {
      mix[c] = 0.0;
      for (int i=0;i<Nmean[1];i++) MEAN2(c,i) = 0.0;
      sd[c] = 0.0;
     }

     // m-step - update the mixing vector, means and sd...
     // *Calculate mean and mixing vector incrementally...
     for (int f=0;f<Nfeats[0];f++)
     {
      for (int c=0;c<Nmean[0];c++)
      {
       mix[c] += WV2(f,c);
       if (WV2(f,c)>1e-6) // Must not update if value is too low due to division in update - NaN avoidance.
       {
        for (int i=0;i<Nmean[1];i++)
        {
         MEAN2(c,i) += WV2(f,c) * (FEATS2(f,i) - MEAN2(c,i)) / mix[c];
        }
       }
      }
     }

     // prevent the mix of any given component getting too low - will cause the algorithm to NaN...
     for (int c=0;c<Nmean[0];c++)
     {
      if (mix[c]<1e-6) mix[c] = 1e-6;
     }

     // *Calculate the sd simply, initial calculation is sum of squared differences...
     for (int f=0;f<Nfeats[0];f++)
     {
      for (int c=0;c<Nmean[0];c++)
      {
       float distSqr = 0.0;
       for (int i=0;i<Nmean[1];i++)
       {
        float delta = FEATS2(f,i) - MEAN2(c,i);
        distSqr += delta*delta;
       }

       sd[c] += WV2(f,c) * distSqr;
      }
     }

     // *Final adjustments for the new state...
     float mixSum = 0.0;
     for (int c=0;c<Nmean[0];c++)
     {
      sd[c] = sqrt(sd[c]/(mix[c]*float(Nfeats[1])));
      mixSum += mix[c];
     }
     for (int c=0;c<Nmean[0];c++) mix[c] /= mixSum;
    }
    """

    # Weave it...
    weave.inline(code, ['feats', 'maxIters', 'epsilon', 'mix', 'mean', 'sd',
                        'wv', 'pwv', 'norms', 'sqrt2pi'])

    # Store result...
    self.mix = mix
    self.mean = mean
    self.sd = sd

def fit(self, X):
    self.X = X
    self.N = X.shape[0]
    self.ndim = X.shape[1]
    np.random.seed(self.random_seed)
    matX = np.asmatrix(X)

    # initialization schemes
    if self.init_method == 'random':
        if self.init_means is not None:
            mu = self.init_means
        else:
            mu = X[np.random.choice(range(0, len(X)), self.num_gaussians), :]  # sample from the data

        if self.init_cov is not None:
            sigma = self.init_cov
        else:
            sigma = list()
            for k in range(self.num_gaussians):
                sigma.append(np.identity(self.ndim, dtype=np.float64))
                sigma[k] += np.random.rand(self.ndim, self.ndim)  # purely synthetic
                sigma[k] = np.dot(sigma[k], sigma[k].T)  # making it positive semi-definite and symmetric
                sigma[k] /= sigma[k].sum()
                # lowerbound = k * self.N / self.num_gaussians  # sample from data
                # upperbound = lowerbound + 20
                # sigma[k] = np.cov(X[lowerbound:upperbound, :].T)

        if self.init_weights is not None:
            lmbda = self.init_weights
        else:
            lmbda = np.random.rand(self.num_gaussians)
            lmbda /= lmbda.sum()

    elif self.init_method == 'kmeans':
        # use means of kmeans as initial means, and calculate cov from the clusters
        model = KMeans(K=self.num_gaussians, max_iter=5)
        model.fit(X)
        labels = model.pred(X)
        mu = np.zeros((self.num_gaussians, self.ndim))
        sigma = [np.zeros((self.ndim, self.ndim))] * self.num_gaussians
        for k in range(self.num_gaussians):
            cluster = X[labels == k]
            mu[k] = cluster.mean(axis=0)
            sigma[k] = np.cov(cluster.T)

        if self.init_weights is not None:
            lmbda = self.init_weights
        else:
            lmbda = np.random.rand(self.num_gaussians)
            lmbda /= lmbda.sum()

    ######## BEGIN ACTUAL ALGORITHM ###################
    for iter in range(self.max_iter):
        phat = np.zeros((self.N, self.num_gaussians))
        N = np.zeros(self.num_gaussians)

        # E step
        for k in range(0, self.num_gaussians):
            normal_var = normal(mean=mu[k], cov=sigma[k])
            phat[:, k] = lmbda[k] * normal_var.pdf(X)
        phat /= phat.sum(axis=1)[:, None]
        # faster to do it all with numpy than use loops
        # for n in range(0, self.N):  # loop over each data point
        #     for k in range(0, self.num_gaussians):
        #         normalx = normal(mean=mu[k], cov=sigma[k]).pdf(X[n, :])
        #         phat[n, k] = lmbda[k] * normalx
        #     phat[n, :] /= phat[n, :].sum()

        # M step
        for k in range(self.num_gaussians):
            N[k] = phat[:, k].sum()
            mu[k] = np.dot(phat[:, k], X) / N[k]
            intermed = np.multiply(phat[:, k], (matX - mu[k]).T).T
            sigma[k] = np.dot(intermed.T, (matX - mu[k])) / N[k]
            lmbda[k] = N[k] / self.N

        pass  # end of this iteration

    self.mu = mu
    self.sigma = sigma
    self.lmbda = lmbda

dimensions = 3

for tests in range(0, 4):
    precisions = []
    times = []
    for executions in range(0, 100):
        print(str(executions) + "%")
        start_time = time.time()
        particles = []
        k_means = KMeans()

        # Particles Initialization
        for i in range(population_size):
            p = Particle()
            num_clusters = random.randint(2, 7)
            plist = [num_clusters, ]
            plist.extend([random.uniform(0, 10) for i in range(0, k_means.dimens * 7)])
            p.current_position = array(plist)
            p.best_position = p.current_position
            p.fitness = 0.0
            p.velocity = 0.0
            particles.append(p)

import cPickle
import matplotlib.pyplot as plt
import numpy as np

from kmeans import KMeans, biKMeans

if __name__ == "__main__":
    # load the data
    X, y = cPickle.load(open('data.pkl', 'r'))

    # plot the result after 1, 2, 3, ... iterations in turn
    for max_iter in range(6):
        # set the parameters
        n_clusters = 10
        initCent = X[50:60]  # initialize the centroids to X[50:60]

        # train the model
        clf = KMeans(n_clusters, initCent, max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse

        # plot the clustering result, using one colour per cluster
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']
        for i in range(n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = X[index, 0]
            x1 = X[index, 1]
            y_i = y[index]
            for j in range(len(x0)):
                plt.text(x0[j], x1[j], str(int(y_i[j])), color=colors[i],
                         fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=12)

import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import *
import scipy.io as sio

from kmeans import KMeans

plt.close('all')

X = sio.loadmat('ex7data2.mat')['X']
classifier = KMeans(X)

initial_centroids = np.asarray([[3, 3], [6, 2], [8, 5]])
idx = classifier.find_closest_centroids(initial_centroids)

print("Closest centroids for the first 3 examples:")
print(np.str(idx[0:3]))
print("(the closest centroids should be 0, 2, 1 respectively)")

centroids = classifier.compute_centroids(idx)
print("Centroids computed after initial finding of closest centroids: \n")
print(np.str(centroids))
print('(the centroids should be')
print(' [ 2.428301 3.157924 ]')
print(' [ 5.813503 2.633656 ]')
print(' [ 7.119387 3.616684 ]\n')

centroids, idx = classifier.run(plot_progress=True)
plt.show()
print("K-Means Done.")

    s1[i] = (x1, y1)

    r2, theta2 = np.random.normal(5, 0.25), np.random.uniform(0, 2*np.pi)
    x2, y2 = r2 * np.cos(theta2), r2 * np.sin(theta2)
    s2[i] = (x2, y2, r2, theta2)

    plt.scatter(x1, y1)
    plt.scatter(x2, y2)
    data.append((x1, y1))
    data.append((x2, y2))

plt.show()

# TODO 5: K-means on this data
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the cluster centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])
plt.show()

# TODO 7: DBSCAN on this data
dbscan = DBScan(epsilon=1.2, min_points=3)
dbscan.fit(data)

from em import calculate_em, _calculate_normal
from em import calculate_log_likelihood
from kmeans import KMeans

n_clusters = 4
X = utils.load_data('EMGaussienne.data')
Xtest = utils.load_data('EMGaussienne.test')
max_iterations = 150
ridge = 1e-6
verbose = True

n_samples, n_features = X.shape

# Initialise the data using kmeans
k_means = KMeans(k=n_clusters)
k_means_labels, _ = k_means.fit(X.copy())
k_means_cluster_centers = k_means.centers_

mu, sigma, tpi = calculate_em(X, n_clusters)
print('Log likelihood %d' % calculate_log_likelihood(X, mu, sigma, tpi))
print('Log likelihood %d' % calculate_log_likelihood(Xtest, mu, sigma, tpi))

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])
em_labels = p.argmax(axis=0)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):

def preConfigureModel(self):
    configureStart = time.time()  # the pre-configuration starting time

    # get the previous trading day
    yesterday = self.dateOfTrade
    while True:
        yesterday -= datetime.timedelta(days=1)
        datasetPath = 'dataset/' + yesterday.strftime('%d-%h-%G')
        if isdir(datasetPath):
            break
        else:
            continue

    try:
        remove('data/all_features.pkl')
    except:
        pass

    print('Pre-Configuration stage, on date :', yesterday)

    # the last day's best features and last day's points are saved
    featureObject = GetFeatures(datasetPath + '/corpora')

    extractionStart = time.time()
    featureObject.extractFeatures(parameters.ourFeatureType)  # 2-word combination feature-extraction method
    extractionEnd = time.time()
    print('\nFeature Extraction time : %.2f minutes' % ((extractionEnd - extractionStart) / 60))

    selectionStart = time.time()
    featureObject.selectFeatures(parameters.ourSelectionType, parameters.initialNumberOfFeatures)  # BNS feature-selection method, number of features
    selectionEnd = time.time()
    print('Feature Selection time : %.2f minutes' % ((selectionEnd - selectionStart) / 60))

    numberOfVectors = featureObject.representFeatures()
    print('Document vectors formed .. ')

    copy('data/best_features.pkl', 'data/all_best_features.pkl')

    # ------------------------------------ K-means Running ---------------------------------------
    print('\nRunning K-means ..')
    kmeansStart = time.time()
    kmeansObject = KMeans()
    kmeansObject.getDataPoints()
    kmeansObject.getInitialCenters()

    iterationNumber = 1
    notConverged = True
    while notConverged == True and iterationNumber < parameters.maximumIterations:
        timeNow = time.time()
        if iterationNumber % 20 == 0:
            print('..Iteration Number : %3d Time Elapsed till now : %.2f minutes'
                  % (iterationNumber, (timeNow - kmeansStart) / 60.0))
        else:
            pass
        kmeansObject.assignToCluster()
        notConverged = kmeansObject.recalculateCentroids()
        iterationNumber += 1

    kmeansObject.saveClusters()
    kmeansEnd = time.time()
    print('Kmeans running time : %.2f minutes' % ((kmeansEnd - kmeansStart) / 60))
    # -------------------------------------------------------------------------------------------------------

    # cluster info
    fileReader = open('data/cluster_info.pkl', 'r')
    clusterInfo = pickle.load(fileReader)
    fileReader.close()

    # for projected clustering
    print('\nPreparing the initial fading clusters ..')
    projectedClusteringStart = time.time()
    self.projectedClusteringObject = ProjectedClustering()
    self.projectedClusteringObject.prepareFadingClusters(clusterInfo)
    projectedClusteringEnd = time.time()
    print('Fading clusters preparation time : %.2f minutes' % ((projectedClusteringEnd - projectedClusteringStart) / 60))

    # take the last 10 files of 'yesterday' and store them in the 'last' folder
    lastFileNames = []
    fileReader = open(datasetPath + '/log_file.txt', 'r')
    for line in fileReader:
        lastFileNames.append(line.split(' ')[0])
    fileReader.close()

    fileWriter = open('last/log.txt', 'w')  # write the names of the files
    lastFileNames = lastFileNames[-10:]
    for fileName in lastFileNames:
        try:
            copy(datasetPath + '/corpora/' + fileName, 'last/' + fileName)
        except:
            pass
        fileWriter.write(fileName + '\n')
    fileWriter.close()

    # get the ANN ready
    self.annObject.loadAnnWeights()
    print('\nANN locked and loaded ..')

    configureEnd = time.time()
    print("Total time taken to pre-configure : %.2f minutes" % ((configureEnd - configureStart) / 60))
