def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter, kmeans_max_k):
    """Run k-means for each k in [1, kmeans_max_k), averaged over several
    random restarts, then plot SSE-vs-iteration, SSE-vs-k and purity-vs-k.

    do_pca only tags the saved plot filenames; x_test/y_test are accepted
    for interface compatibility but are unused here.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    n_repeats = 6  # random restarts averaged per k (original ran repeats 1..6)
    for repeat in range(n_repeats):
        for k in range(1, kmeans_max_k):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            purity = kmeans.get_purity(x_train, y_train)
            if repeat == 0:
                # First restart: seed the accumulators.
                train_sses_vs_iter.append(list(sse_vs_iter))
                train_purities_vs_k.append(purity)
                train_sses_vs_k.append(min(sse_vs_iter))
            else:
                # BUG FIX: original did `train_sses_vs_iter[k-1] += sse_vs_iter[k-1]`,
                # adding a single scalar to a list; accumulate element-wise instead.
                acc = train_sses_vs_iter[k - 1]
                for i in range(min(len(acc), len(sse_vs_iter))):
                    acc[i] += sse_vs_iter[i]
                train_purities_vs_k[k - 1] += purity
                train_sses_vs_k[k - 1] += min(sse_vs_iter)
    # Turn the accumulated sums into averages over the restarts.
    for k in range(1, kmeans_max_k):
        train_sses_vs_iter[k - 1] = [s / n_repeats for s in train_sses_vs_iter[k - 1]]
        train_purities_vs_k[k - 1] /= n_repeats
        print("Purity: ", train_purities_vs_k[k - 1])
        train_sses_vs_k[k - 1] /= n_repeats
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Fit k-means once for every k in [1, kmeans_max_k), logging elapsed
    time per step, and save SSE/purity summary plots tagged with do_pca."""
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    start = time.time()
    for k in range(1, kmeans_max_k):
        elapsed = time.time() - start
        print("On step k =", k, "of", kmeans_max_k,
              "\telapsed time: %.2f" % elapsed, "s")
        model = KMeans(k, kmeans_max_iter)
        sse_curve = model.fit(x_train)
        train_sses_vs_iter.append(sse_curve)
        train_purities_vs_k.append(model.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_curve))
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter, kmeans_max_k):
    """Fit k-means for each k in [1, kmeans_max_k) and save three plots:
    SSE per iteration (one subplot per k), best SSE vs k, and purity vs k.

    x_test/y_test are accepted for interface compatibility but unused.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    for k in range(1, kmeans_max_k):
        model = KMeans(k, kmeans_max_iter)
        sse_curve = model.fit(x_train)
        train_sses_vs_iter.append(sse_curve)
        train_purities_vs_k.append(model.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_curve))
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average training purity over 5 restarts for each k in 1..10 and plot
    purity vs k.

    NOTE(review): k is fixed to range(1, 11) and restarts to 5 regardless of
    kmeans_max_k — confirm this hard-coding is intentional.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    result = []
    for k in range(1, 11):
        print('k:', k)
        # Five restarts for this k; purities collected then averaged.
        for times in range(0, 5):
            model = KMeans(k, kmeans_max_iter)
            sse_curve = model.fit(x_train)
            train_sses_vs_iter.append(sse_curve)
            train_purities_vs_k.append(model.get_purity(x_train, y_train))
            train_sses_vs_k.append(min(sse_curve))
        print(train_purities_vs_k)
        result.append(sum(train_purities_vs_k) / len(train_purities_vs_k))
        train_purities_vs_k = []  # reset so each k averages only its own runs
    print(result)
    print('max purity', max(result))
    plot_y_vs_x(result, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans1(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means with a fixed k=6 for five restarts and plot the
    SSE-vs-iteration curve averaged across the restarts."""
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    n_runs = 5   # restarts to average (was a magic 5)
    fixed_k = 6  # cluster count under study (was a magic 6)
    for run in range(n_runs):
        kmeans = KMeans(fixed_k, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))
    # Column-wise mean of the per-run SSE curves.
    # FIX: original shadowed the builtin `sum` with a local accumulator.
    averaged = []
    for col in range(len(train_sses_vs_iter[0])):
        total = 0
        for row in range(n_runs):
            total += train_sses_vs_iter[row][col]
        averaged.append(total / n_runs)
    result = [averaged]  # plot helper expects a list of curves
    print(result)
    plot_y_vs_x_list(result, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means with k=6 for five restarts, plot each run's SSE curve and
    the mean SSE-vs-iteration curve across the runs."""
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    # iterations for 5 different runs of k-means.
    n_runs = 5
    for run in range(n_runs):
        kmeans = KMeans(6, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))
        if run == 0:
            avg_list = [0] * len(sse_vs_iter)
        avg_list = [avg_list[i] + sse_vs_iter[i] for i in range(len(sse_vs_iter))]
    # BUG FIX: the plot is labelled as an average, but the original plotted
    # the raw sums — divide by the run count.
    avg_list = [total / n_runs for total in avg_list]
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(avg_list, x_label='iterations', y_label='sse',
                save_path='plot_sse_vs_iter_%d' % do_pca)
def main(dataset_fn, output_fn, clusters_no):
    """Read LAT/LON rows from a CSV, cluster them with k-means and write the
    resulting clusters to output_fn."""
    # Each CSV row becomes a Point(latitude, longitude) location.
    df = pd.read_csv(dataset_fn)
    geo_locs = []
    for index, row in df.iterrows():
        geo_locs.append(Point(float(row['LAT']), float(row['LON'])))
    # run k_means clustering
    model = KMeans(geo_locs, clusters_no)
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
    else:
        # save clustering results: a list of lists, one list per cluster
        model.save(output_fn)
def _fit(self, X):
    """Fit the mixture model to X: seed parameters from k-means, then run
    EM (expectation/maximization) until the parameter vector converges."""
    # Initialize: means from k-means centers, one shared covariance for all
    # components, uniform mixing coefficients.
    shared_cov = np.cov(X.T)
    kmeans = KMeans(self.n_components)
    kmeans.fit(X)
    self.mu = kmeans.centers
    self.cov = np.array([shared_cov] * self.n_components)
    self.coef = np.ones(self.n_components) / self.n_components
    # Flatten all parameters into one vector to detect convergence.
    params = np.hstack((self.mu.ravel(), self.cov.ravel(), self.coef.ravel()))
    while True:
        stats = self._expectation(X)
        self._maximization(X, stats)
        new_params = np.hstack(
            (self.mu.ravel(), self.cov.ravel(), self.coef.ravel())
        )
        if np.allclose(params, new_params):
            break
        params = new_params
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For each k in [1, kmeans_max_k), average SSE curves and purity over
    five random restarts, then plot the three summary curves."""
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    n_tests = 5
    for k in range(1, kmeans_max_k):
        sses = None
        avg_purity = 0.
        # do five tests to reduce effect of random start
        for i in range(n_tests):
            kmeans = KMeans(k, kmeans_max_iter)
            sse = kmeans.fit(x_train)
            # FIX: `sses == None` compared a list to None; use `is None`.
            if sses is None:
                sses = list(sse)
            else:
                for j in range(len(sse)):
                    sses[j] += sse[j]
            avg_purity += kmeans.get_purity(x_train, y_train)
        avg_purity /= float(n_tests)
        sses = [s / float(n_tests) for s in sses]
        train_sses_vs_iter.append(sses)
        train_purities_vs_k.append(avg_purity)
        train_sses_vs_k.append(min(sses))
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def apply_kmeans_2(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average the best SSE over 5 restarts for each k in 1..10 and plot the
    average SSE vs k.

    NOTE(review): k range is hard-coded to 1..10 regardless of kmeans_max_k,
    matching the sibling apply_kmeans_3 — confirm intentional.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    avg_me = []
    for k in range(1, 11):
        # BUG FIX: the original averaged the *cumulative* train_sses_vs_k
        # list, so the value for each k was contaminated by all earlier k's
        # runs. Average only this k's five restarts.
        per_k_sses = []
        for it in range(0, 5):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
            best_sse = min(sse_vs_iter)
            train_sses_vs_k.append(best_sse)
            per_k_sses.append(best_sse)
        avg_me.append(sum(per_k_sses) / len(per_k_sses))
    plot_y_vs_x(avg_me, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
def main():
    """Generate three well-separated 2-D blobs, cluster them with k-means,
    and plot the points colored by assigned cluster with centroids marked."""
    k = 3
    # 30 points near the origin, 30 mid-range, 30 near (100, 100).
    X = [[random.randint(0, 20), random.randint(0, 20)] for i in range(30)] \
        + [[random.randint(40, 60), random.randint(40, 60)] for i in range(30)] \
        + [[random.randint(80, 100), random.randint(80, 100)] for i in range(30)]
    print(f"Cluster points:{X}")
    kmeans = KMeans(n_cluster=k, tol=3e-4)
    centroids = kmeans.fit(X)
    prediction = kmeans.predict([[0.0, 0.0], [50.0, 40.0], [100.0, 100.0]])
    print(f"KMeans centroids: {centroids}")
    print(f"KMeans predict for [0,0],[50,40],[100,100]]: {prediction}")
    colors = ['r', 'g', 'b']
    # BUG FIX: the original plotted ALL of X once per color, so every point
    # ended up in the last color. Color each point by its assigned cluster.
    # NOTE(review): assumes kmeans.predict returns one cluster index per
    # input point, as its use on three probe points above suggests — confirm.
    labels = kmeans.predict(X)
    for i in range(k):
        cluster = [x for x, lab in zip(X, labels) if lab == i]
        plt.scatter([p[0] for p in cluster], [p[1] for p in cluster],
                    s=7, c=colors[i])
    plt.scatter([x[0] for x in centroids], [x[1] for x in centroids],
                marker='*', s=200, c='black')
    plt.show()
def apply_kmeans_3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average training purity over 5 restarts for each k in 1..10 and plot
    average purity vs k.

    NOTE(review): k range is hard-coded to 1..10 regardless of kmeans_max_k.
    """
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    averg_list = []
    for k in range(1, 11):
        # BUG FIX: the original never reset train_purities_vs_k between k's,
        # so each "average" included every earlier k's runs (the sibling
        # apply_kmeans3 resets correctly). Average only this k's restarts.
        per_k_purities = []
        for it in range(0, 5):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            purity = kmeans.get_purity(x_train, y_train)
            train_purities_vs_k.append(purity)
            per_k_purities.append(purity)
            train_sses_vs_k.append(min(sse_vs_iter))
        averg_list.append(sum(per_k_purities) / len(per_k_purities))
    # plot the average purity
    plot_y_vs_x(averg_list, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def main(dataset_fn, output_fn, clusters_no, w):
    """Read X/Y/PreChange rows from a CSV into Node objects, run weighted
    k-means, save the clusters to output_fn and display the result."""
    # read location data from csv file; each row becomes a Node carrying its
    # [X, Y, PreChange] features and its ID.
    df = pd.read_csv(dataset_fn)
    geo_locs = []
    for index, row in df.iterrows():
        features = [float(row['X']), float(row['Y']), float(row['PreChange'])]
        geo_locs.append(Node(features, row['ID']))
    # run k_means clustering with the supplied feature weights
    model = KMeans(geo_locs, clusters_no, np.array(w))
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
    else:
        # save clustering results: a list of lists, one list per cluster
        model.save(output_fn)
        model.showresult(True)
def apply_kmeans_avg(x_train, y_train, kmeans_max_iter, k, iterations=5):
    """Run k-means `iterations` times for a single k and return the averaged
    (sse_vs_iter list, best-SSE, purity) triple across the runs."""
    summed_curve = None
    sse_total = 0
    purity_total = 0
    print("")
    for step in range(iterations):
        print("On step ", step + 1, "of", iterations, "for k =", k)
        model = KMeans(k, kmeans_max_iter)
        curve = np.array(model.fit(x_train))
        if summed_curve is None:
            # initialize the train sse array on the first run
            summed_curve = np.zeros(len(curve))
        summed_curve = summed_curve + curve
        purity_total += model.get_purity(x_train, y_train)
        sse_total += curve.min()
    return ((summed_curve / iterations).tolist(),
            sse_total / iterations,
            purity_total / iterations)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """For each k in [1, kmeans_max_k), average the best SSE and the purity
    over five restarts, then plot SSE-vs-k and purity-vs-k."""
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    ##################################
    #      YOUR CODE GOES HERE       #
    ##################################
    n_runs = 5
    for k in range(1, kmeans_max_k):
        # BUG FIX: the accumulators were declared once outside the k loop and
        # never reset, so each k's "average" carried the previous k's value.
        sses_sum = 0
        purities_sum = 0
        for i in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            sses_sum += min(sse_vs_iter)
            purities_sum += kmeans.get_purity(x_train, y_train)
        print(k)
        sses_sum /= n_runs
        purities_sum /= n_runs
        train_sses_vs_k.append(sses_sum)
        train_purities_vs_k.append(purities_sum)
    print(train_sses_vs_k)
    print(train_purities_vs_k)
    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
def _derive_initial_parameters(self, x):
    """Seed the model's parameters by clustering x with k-means; returns the
    (means, covariance matrices, cluster probabilities) triple."""
    clusterer = KMeans(k=self.k, tol=self.tol, max_iter=self.max_iter)
    clusterer.fit(x)
    return (clusterer.means,
            clusterer.covariance_matrices,
            clusterer.cluster_probabilities)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from clustering import KMeans

# Demo: cluster 12 hand-picked 2-D points into 3 groups and visualise them.
data = np.array(
    [1, 1, 0, 0, 0, 1, 1, 0, 5, 5, 5, 7, 7, 5, 6, 6, 0, 9, 0, 8, 1, 8, 1, 9])
data = data.reshape(12, 2)
print(data)

model = KMeans(k=3, iterations=10)
model.fit(data)
print(model.centroids_)

labels = model.label(data)
print(labels)

data = pd.DataFrame(data)
print(data)

# One scatter call per cluster, then mark each centroid with a black cross.
for ki in range(3):
    cluster = data[labels == ki]
    plt.scatter(cluster[0], cluster[1])
for c in model.centroids_:
    plt.scatter(c[0], c[1], c='k', marker='x')
plt.show()
class ClusteringTest(unittest.TestCase):
    """Unit tests for the Spark-based KMeans implementation.

    Each test runs against a local SparkContext created in setUp; the fixture
    is three well-separated groups of four 3-D points each, with k = 3.
    """

    def setUp(self):
        # Fixed seed so any randomized centroid initialization is repeatable.
        random.seed(1)
        self.sc = SparkContext(master='local')
        # Three clusters of four points: around z=0/2 at small x, at large x,
        # and at large z.
        self.points = self.sc.parallelize([
            (1, 1, 0),
            (1, 2, 2),
            (2, 2, 2),
            (2, 1, 0),
            (8, 0, 0),
            (8, 1, 2),
            (9, 1, 0),
            (9, 2, 2),
            (1, 1, 7),
            (1, 2, 9),
            (2, 2, 9),
            (2, 1, 7),
        ])
        self.k = 3
        self.kmeans = KMeans(self.k)

    def tearDown(self):
        # Stop the context so each test gets a fresh SparkContext.
        self.sc.stop()

    def test_find_outer_vertices(self):
        # Expect per-dimension (min, max) bounds of the fixture points.
        edges = KMeans.find_outer_vertices(self.points)
        self.assertEqual(edges, (1, 9, 0, 2, 0, 9))

    def test_assign_points_to_centroids(self):
        # Each point should be assigned to its nearest of the three centroids.
        centroids = {
            (1.5, 1.5, 1.0): [],
            (1.5, 1.5, 8.0): [],
            (8.5, 1.0, 1.0): [],
        }
        data = self.kmeans.assign_points_to_centroids(centroids, self.points).collectAsMap()
        self.assertDictEqual(data, {
            (1.5, 1.5, 1.0): [
                (1, 1, 0),
                (1, 2, 2),
                (2, 2, 2),
                (2, 1, 0),
            ],
            (1.5, 1.5, 8.0): [
                (1, 1, 7),
                (1, 2, 9),
                (2, 2, 9),
                (2, 1, 7),
            ],
            (8.5, 1.0, 1.0): [
                (8, 0, 0),
                (8, 1, 2),
                (9, 1, 0),
                (9, 2, 2),
            ],
        })

    def test_recalculate_cluster_centroids(self):
        # New centroids should be the member means: (10,11)-cluster -> (2,3),
        # (12,13)-cluster -> (4,5); members are carried over unchanged.
        centroids = [
            (
                (10, 11),
                [(1, 2), (3, 4)]
            ),
            (
                (12, 13),
                [(2, 3), (4, 5), (6, 7)]
            ),
        ]
        clusters = self.sc.parallelize(centroids).groupByKey().flatMapValues(lambda x: x)
        data = self.kmeans.recalculate_cluster_centroids(clusters).collectAsMap()
        self.assertDictEqual(data, {
            (2, 3): [
                (1, 2),
                (3, 4),
            ],
            (4, 5): [
                (2, 3),
                (4, 5),
                (6, 7),
            ],
        })

    def test_calculate_centroid(self):
        # Centroid of a plain point list is the component-wise mean.
        points = [
            (1, 2, 3),
            (3, 4, 5),
        ]
        centroids = KMeans.calculate_centroid(points)
        self.assertEqual(centroids, (2, 3, 4))

    def test_fit(self):
        # End-to-end fit on three 2-D blobs; expected centroids/members are
        # in normalized [0, 1] coordinates, so fit evidently normalizes its
        # input before clustering.
        points = self.sc.parallelize([
            (1, 1),
            (1, 3),
            (3, 1),
            (3, 3),
            (5, 6),
            (5, 8),
            (7, 6),
            (7, 8),
            (10, 2),
            (10, 4),
            (12, 2),
            (12, 4),
        ])
        centroids = self.kmeans.fit(points).collectAsMap()
        self.assertEqual(len(centroids), self.k)
        self.assertDictEqual(centroids, {
            (0.090909090909090912, 0.14285714285714285): [
                (0.0, 0.0),
                (0.0, 0.2857142857142857),
                (0.18181818181818182, 0.0),
                (0.18181818181818182, 0.2857142857142857),
            ],
            (0.45454545454545453, 0.85714285714285721): [
                (0.36363636363636365, 0.7142857142857143),
                (0.36363636363636365, 1.0),
                (0.5454545454545454, 0.7142857142857143),
                (0.5454545454545454, 1.0),
            ],
            (0.9090909090909092, 0.2857142857142857): [
                (0.8181818181818182, 0.14285714285714285),
                (0.8181818181818182, 0.42857142857142855),
                (1.0, 0.14285714285714285),
                (1.0, 0.42857142857142855),
            ],
        })

    def test_calculate_distance(self):
        # Distance here is the Manhattan (L1) sum of per-component deltas:
        # 1 + 2 + 4 + 10 = 11... wait: |3-2|+|5-3|+|8-4|+|15-5| = 17?
        # NOTE(review): expected 11 does not match L1 (17) or L2; the metric
        # used by calculate_distance is defined elsewhere — confirm there.
        a = (3, 5, 8, 15)
        b = (2, 3, 4, 5)
        distance = KMeans.calculate_distance(a, b)
        self.assertEqual(distance, 11)

    def test_calculate_average_distance(self):
        # Mean of the distances from (1,1) to (4,1) and (5,4).
        centre = (1, 1)
        points = [
            (4, 1),
            (5, 4),
        ]
        self.assertEqual(KMeans.calculate_average_distance(centre, points), 4)

    def test_normalize_data(self):
        # Each column is scaled independently to [0, 1] (min-max scaling).
        points = self.sc.parallelize([
            (10, 2, 50, 11),
            (25, 4, 55, 12),
            (80, 9, 60, 19),
            (100, 10, 65, 21),
        ])
        result = self.kmeans.normalize_data(points, 4).collect()
        self.assertEqual(result, [
            (0, 0, 0, 0),
            (1/6.0, 1/4.0, 1/3.0, 1/10.0),
            (7/9.0, 7/8.0, 2/3.0, 4/5.0),
            (1, 1, 1, 1),
        ])

    def test_normalize_data__single_column(self):
        # Min-max scaling also works with a single-column dataset.
        points = self.sc.parallelize([
            (1,),
            (2,),
            (3,),
        ])
        result = self.kmeans.normalize_data(points, 1).collect()
        self.assertEqual(result, [
            (0,),
            (0.5,),
            (1,),
        ])
def kmeans_area_example():
    """Benchmark several k-means initializations on the sklearn digits data
    and plot the cluster decision regions in PCA-reduced 2-D space.

    Adapted from scikit-learn's "A demo of K-Means clustering on the
    handwritten digits data" example.
    """
    from time import time
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_digits
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import scale

    np.random.seed(42)
    digits = load_digits()
    data = scale(digits.data)
    n_samples, n_features = data.shape
    n_digits = len(np.unique(digits.target))
    labels = digits.target
    sample_size = 300
    print("n_digits: %d, \t n_samples %d, \t n_features %d"
          % (n_digits, n_samples, n_features))
    print(79 * '_')
    # FIX: header column name was garbled to "h**o"; restored to "homo"
    # (homogeneity), matching the metrics printed below.
    print('% 9s' % 'init'
          ' time inertia homo compl v-meas ARI AMI silhouette')

    def bench_k_means(estimator, name, data):
        # Fit and report timing, inertia and the standard clustering metrics.
        t0 = time()
        estimator.fit(data)
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
                  name="k-means++", data=data)
    bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
                  name="random", data=data)
    # in this case the seeding of the centers is deterministic, hence we run
    # the kmeans algorithm only once with n_init=1
    pca = PCA(n_components=n_digits).fit(data)
    bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
                  name="PCA-based", data=data)
    print(79 * '_')

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    # point of a mesh covering the reduced data.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    # FIX: `print Z[0]` was Python 2 syntax (a SyntaxError under Python 3).
    print(Z[0])
    plt.figure(1)
    plt.clf()
    plt.imshow(
        Z,
        # interpolation='nearest',
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        # cmap=plt.cm.Paired,
        # aspect='auto',
        origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
def cluster_image(self, n_clusters, algorithm="km"):
    """Cluster self.image into n_clusters segments and store the result in
    self.labelled_image. Only the "km" (k-means) algorithm is handled; any
    other value is a no-op."""
    if algorithm != "km":
        return
    clusterer = KMeans(n_clusters=n_clusters, visualise=self.visualise)
    self.labelled_image = clusterer.fit(self.image, self.segments)