def test_basic(self, Xl_blobs_easy):
    """Fit ``DKKMeans`` and ``SKKMeans`` on the same data and compare results.

    Checks, in order: shared estimator attributes (excluding the fitted,
    order-dependent ones), inertia within 0.01, cluster centers after
    aligning their arbitrary order, relabelled labels, a truthy
    ``n_iter_``, and finally ``transform`` / ``predict`` output once both
    estimators carry the aligned centers.
    """
    data, _ = Xl_blobs_easy

    # make it super easy to cluster
    shared_kwargs = dict(n_clusters=3, random_state=0)
    dask_km = DKKMeans(**shared_kwargs)
    sk_km = SKKMeans(**shared_kwargs)
    dask_km.fit(data)
    sk_km.fit(data)

    fitted_attrs = ["n_iter_", "inertia_", "cluster_centers_", "labels_"]
    assert_estimator_equal(dask_km, sk_km, exclude=fitted_attrs)
    assert abs(dask_km.inertia_ - sk_km.inertia_) < 0.01

    # Cluster order is arbitrary, so sort both sets of centers the same way
    # (by the first column of the per-column argsort) before comparing.
    dask_order = np.argsort(dask_km.cluster_centers_, 0)[:, 0]
    sk_order = np.argsort(sk_km.cluster_centers_, 0)[:, 0]
    dask_centers = dask_km.cluster_centers_[dask_order]
    sk_centers = sk_km.cluster_centers_[sk_order]
    np.testing.assert_allclose(dask_centers, sk_centers, rtol=1e-3)

    # Map the reference labels into the other estimator's numbering.
    relabelled = replace(sk_km.labels_, [0, 1, 2], dask_order[sk_order])
    relabelled = relabelled.astype(sk_km.labels_.dtype)
    assert_eq(dask_km.labels_.compute(), relabelled)
    assert dask_km.n_iter_

    # this is hacky: overwrite the fitted centers with the aligned ones so
    # that transform/predict are comparable column-for-column.
    sk_km.cluster_centers_ = sk_centers
    dask_km.cluster_centers_ = dask_centers
    assert_eq(dask_km.transform(data), sk_km.transform(data), rtol=1e-3)

    dask_pred = dask_km.predict(data)
    sk_pred = sk_km.predict(data)
    assert_eq(dask_pred.compute(), sk_pred)
def test_k_means(n_clusters, n_samples, n_features):
    """Compare the local ``KMeans`` against ``SKKMeans`` from identical starts.

    Both models are given the same random initial centers and the same
    random dataset; after fitting, their column-wise sorted centers must
    have the same shape and a summed squared difference below
    ``ACCEPTABLE_ERROR``.

    Args:
        n_clusters: number of clusters to fit.
        n_samples: number of rows in the generated dataset.
        n_features: number of columns in the generated dataset.
    """
    # Initialize the models
    cluster_centers = np.random.rand(n_clusters, n_features)
    model = KMeans(n_clusters, n_features)
    # Keyword arguments keep the same meaning as the original positional
    # call (init, n_init, max_iter). NOTE(review): presumably SKKMeans is
    # scikit-learn's KMeans, where these are keyword-only in recent
    # releases -- confirm against the import.
    sk_model = SKKMeans(n_clusters, init=cluster_centers, n_init=N_INIT,
                        max_iter=N_ITER)

    # Random dataset, shifted in slabs so the groups are separated.
    X = np.random.rand(n_samples, n_features)
    if n_clusters == 2:
        X[n_samples // 2:, :] += 0.5
    else:
        first_cut = n_samples // 3
        second_cut = 2 * n_samples // 3
        X[first_cut:second_cut, :] += 0.5
        # The original shifted the middle slab back again (a no-op pair);
        # the last slab is the one that must move in the other direction.
        X[second_cut:, :] -= 0.5

    # Fit the models
    model.fit(X, N_ITER, N_INIT, cluster_centers)
    sk_model.fit(X)

    # get cluster-centers and sort them (each column independently)
    centers = np.sort(model.cluster_centers, axis=0)
    sk_centers = np.sort(sk_model.cluster_centers_, axis=0)

    # Compare between the centers
    assert centers.shape == sk_centers.shape
    # Sum of squared row norms == sum of all squared element differences.
    diff = np.sum((centers - sk_centers) ** 2)
    assert diff < ACCEPTABLE_ERROR
def test_random_init(self, Xl_blobs_easy):
    """With ``init="random"`` and the same integer seed, inertias must agree.

    ``DKKMeans`` is fitted on the fixture array and ``SKKMeans`` on its
    computed counterpart; the inertias must differ by less than 1e-4 and
    the ``init`` attribute must still read ``"random"``.
    """
    lazy_X, _ = Xl_blobs_easy
    concrete_X = lazy_X.compute()
    seed = 0

    dask_km = DKKMeans(3, init="random", random_state=seed)
    sk_km = SKKMeans(3, init="random", random_state=seed, n_init=1)
    dask_km.fit(lazy_X)
    sk_km.fit(concrete_X)

    inertia_gap = abs(dask_km.inertia_ - sk_km.inertia_)
    assert inertia_gap < 1e-4
    assert dask_km.init == "random"
def test_kmeanspp_init(self, Xl_blobs_easy):
    """With ``init="k-means++"`` both estimators must reach the same inertia.

    A single ``np.random.RandomState(0)`` instance is handed to both
    estimators; ``DKKMeans`` is fitted first, then ``SKKMeans`` on the
    computed array. The inertias must differ by less than 1e-4 and the
    ``init`` attribute must still read ``"k-means++"``.
    """
    lazy_X, _ = Xl_blobs_easy
    concrete_X = lazy_X.compute()

    # One RandomState object shared by both estimators (fit order matters).
    shared_rng = np.random.RandomState(0)
    dask_km = DKKMeans(3, init="k-means++", random_state=shared_rng)
    sk_km = SKKMeans(3, init="k-means++", random_state=shared_rng)
    dask_km.fit(lazy_X)
    sk_km.fit(concrete_X)

    inertia_gap = abs(dask_km.inertia_ - sk_km.inertia_)
    assert inertia_gap < 1e-4
    assert dask_km.init == "k-means++"
def test_fit_given_init(self, X_blobs):
    """Fitting from explicitly supplied initial centers gives equal inertia.

    The initial centers come from ``k_means_._k_init`` on the computed
    array; the same centers and the same ``RandomState`` object are passed
    to both estimators.
    """
    concrete_X = X_blobs.compute()
    rng = np.random.RandomState(0)

    # Seed centers computed once and reused by both estimators.
    squared_norms = k_means_.row_norms(concrete_X, squared=True)
    start_centers = k_means_._k_init(concrete_X, 3, squared_norms, rng)

    dask_km = DKKMeans(3, init=start_centers, random_state=rng)
    sk_km = SKKMeans(3, init=start_centers, random_state=rng, n_init=1)
    dask_km.fit(X_blobs)
    sk_km.fit(concrete_X)

    assert_eq(dask_km.inertia_, sk_km.inertia_)
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction',
                          init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                          precompute_distances='auto', seed=None, n_jobs=1,
                          algorithm='auto', n_samples=None):
    """Fit k-means on selected columns and return labelled data plus a model.

    Args:
        table: input data; indexed with ``input_cols``, copied, and given a
            new ``prediction_col`` column (presumably a pandas DataFrame --
            confirm with callers).
        input_cols: columns used as clustering features.
        n_clusters, init, n_init, max_iter, tol, precompute_distances,
        n_jobs, algorithm: forwarded unchanged to ``SKKMeans``.
        prediction_col: name of the output column holding cluster labels.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: passed to the samples plot; defaults to the number of
            input rows.

    Returns:
        ``{'out_table': ..., 'model': ...}`` where the model dict holds the
        fitted estimator, ``input_cols`` and the rendered report.
    """
    features = table[input_cols]
    sample_count = len(features) if n_samples is None else n_samples

    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(sample_count, 0, 'n_samples'))

    estimator = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                         max_iter=max_iter, tol=tol,
                         precompute_distances=precompute_distances,
                         verbose=0, random_state=seed, copy_x=True,
                         n_jobs=n_jobs, algorithm=algorithm)
    estimator.fit(features)

    # Parameters echoed in the report.
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init,
              'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed,
              'n_jobs':n_jobs, 'algorithm':algorithm}

    centers = estimator.cluster_centers_
    assignments = estimator.labels_

    # Two-component PCA of the features, used by the PCA plot helper.
    pca_model = PCA(n_components=2).fit(features)
    projected = pca_model.transform(features)

    fig_centers = _kmeans_centers_plot(input_cols, centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, sample_count, centers)
    fig_pca = _kmeans_pca_plot(assignments, centers, pca_model, projected)

    rb = BrtcReprBuilder()
    report_md = strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_, fig_cluster_centers=fig_centers,
               fig_pca=fig_pca, fig_samples=fig_samples,
               params=dict2MD(params)))
    rb.addMD(report_md)

    model = _model_dict('kmeans')
    model['model'] = estimator
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    labelled = table.copy()
    labelled[prediction_col] = assignments

    return {'out_table':labelled, 'model':model}
def test_fit_given_init(self):
    """Fitting from explicitly supplied initial centers gives equal inertia.

    Builds a 1000 x 4 blob dataset, wraps it in a chunked dask array, takes
    initial centers from ``_k_init`` and checks that ``DKKMeans`` (on the
    dask array) and ``SKKMeans`` (on the computed array) report the same
    inertia.
    """
    blobs, _ = sklearn.datasets.make_blobs(n_samples=1000, n_features=4,
                                           random_state=1)
    lazy_X = da.from_array(blobs, chunks=500)
    concrete_X = lazy_X.compute()

    # Seed centers computed once and shared by both estimators.
    squared_norms = sklearn.utils.extmath.row_norms(concrete_X, squared=True)
    rng = np.random.RandomState(0)
    start_centers = _k_init(concrete_X, 3, squared_norms, rng)

    dask_km = DKKMeans(3, init=start_centers, random_state=0)
    sk_km = SKKMeans(3, init=start_centers, random_state=0, n_init=1)
    dask_km.fit(lazy_X)
    sk_km.fit(concrete_X)

    assert_eq(dask_km.inertia_, sk_km.inertia_)
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction',
                          init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                          precompute_distances='auto', seed=None, n_jobs=1,
                          algorithm='auto', n_samples=None):
    """Fit k-means on selected columns and return labelled data plus a model.

    Args:
        table: input data; passed to ``check_col_type``, copied, and given
            a new ``prediction_col`` column (presumably a pandas DataFrame
            -- confirm with callers).
        input_cols: columns used as clustering features.
        n_clusters, init, n_init, max_iter, tol, precompute_distances,
        n_jobs, algorithm: forwarded unchanged to ``SKKMeans``.
        prediction_col: name of the output column holding cluster labels.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to the
            samples plot helper.
        n_samples: passed to the samples plot; defaults to the number of
            feature rows.

    Returns:
        ``{'out_table': ..., 'model': ...}`` where the model dict holds the
        fitted estimator, ``input_cols`` and the rendered report.
    """
    feature_names, features = check_col_type(table, input_cols)
    sample_count = len(features) if n_samples is None else n_samples

    estimator = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                         max_iter=max_iter, tol=tol,
                         precompute_distances=precompute_distances,
                         verbose=0, random_state=seed, copy_x=True,
                         n_jobs=n_jobs, algorithm=algorithm)
    estimator.fit(features)

    # Parameters echoed in the report (the requested n_clusters, not the
    # count derived from the fitted centers below).
    params = {'input_cols':feature_names, 'n_clusters':n_clusters, 'init':init,
              'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed,
              'n_jobs':n_jobs, 'algorithm':algorithm, 'n_samples':sample_count}

    centers = estimator.cluster_centers_
    assignments = estimator.labels_

    # One colour per fitted center, evenly spaced over the colormap.
    center_count = len(centers)
    palette = cm.nipy_spectral(np.arange(center_count).astype(float) / center_count)

    # PCA with at most two components (one when there is a single feature).
    component_count = min(2, len(feature_names))
    pca_model = PCA(n_components=component_count).fit(features)
    projected = pca_model.transform(features)

    fig_centers = _kmeans_centers_plot(feature_names, centers, palette)
    fig_samples = _kmeans_samples_plot(table, input_cols, sample_count,
                                       centers, seed, palette)
    fig_pca = _kmeans_pca_plot(assignments, centers, pca_model, projected,
                               palette)

    rb = BrtcReprBuilder()
    report_md = strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_, sse_=estimator.inertia_,
               fig_cluster_centers=fig_centers, fig_pca=fig_pca,
               fig_samples=fig_samples, params=dict2MD(params)))
    rb.addMD(report_md)

    model = _model_dict('kmeans')
    model['model'] = estimator
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    labelled = table.copy()
    labelled[prediction_col] = assignments

    return {'out_table':labelled, 'model':model}
def test_dtypes(self):
    """Fitted attributes and ``transform`` output dtypes match ``SKKMeans``.

    Runs over every combination of the original array and its ``"f4"``
    cast as (fit input, second transform input).
    """
    base = da.random.uniform(size=(100, 2), chunks=(50, 2))
    as_f4 = base.astype("f4")

    combos = [(base, base), (as_f4, as_f4), (base, as_f4), (as_f4, base)]
    for fit_input, other_input in combos:
        dask_km = DKKMeans()
        sk_km = SKKMeans()
        dask_km.fit(fit_input)
        sk_km.fit(fit_input)

        assert dask_km.cluster_centers_.dtype == sk_km.cluster_centers_.dtype
        assert dask_km.labels_.dtype == sk_km.labels_.dtype

        # transform is checked on the fit input and on the paired input.
        assert dask_km.transform(fit_input).dtype == sk_km.transform(fit_input).dtype
        assert dask_km.transform(other_input).dtype == sk_km.transform(other_input).dtype
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10),
                                     prediction_col='prediction', init='k-means++',
                                     n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1,
                                     algorithm='auto', n_samples=None):
    """Fit k-means for each candidate k and keep the best silhouette score.

    For every ``k`` in ``n_clusters_list`` a model is fitted, its mean
    silhouette score recorded, and a two-panel figure rendered (per-cluster
    silhouette profile on the left, PCA scatter on the right). The model
    with the highest score is used to label the table.

    Args:
        table: input data; passed to ``check_col_type``, copied, and given
            a new ``prediction_col`` column (presumably a pandas DataFrame
            -- confirm with callers).
        input_cols: columns used as clustering features.
        n_clusters_list: indexable sequence of candidate cluster counts.
        prediction_col: name of the output column holding cluster labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs,
        algorithm: forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to the
            samples plot helper.
        n_samples: passed to the samples plot; defaults to ``len(table)``.

    Returns:
        ``{'out_table': ..., 'model': ...}`` where the model dict holds
        ``best_k``, ``best_centers``, ``best_model``, ``input_cols`` and
        the rendered report.
    """
    feature_names, features = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(table)
    inputarr = features

    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    # With a single PCA component the scatter reuses it on both axes.
    single_component = pca2.shape[1] == 1
    y_col = 0 if single_component else 1
    if single_component:
        scatter_ylabel = "Feature space for the 1st feature"
    else:
        scatter_ylabel = "Feature space for the 2nd feature"

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter,
                           tol=tol, precompute_distances=precompute_distances,
                           verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in enumerate(colors):
            members = predict == i
            si = samples[members]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][members], pca2[:, y_col][members], color=color)

        # Mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, y_col], marker='x',
                    edgecolors=1, s=200, color=colors)
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel(scatter_ylabel)

        plt.tight_layout()
        images.append(plt2MD(plt))
        # plt.clf() only empties the figure; pyplot keeps it registered.
        # Close it so one figure per candidate k does not pile up.
        plt.close(fig)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_sse = best_model.inertia_

    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers,
                                       seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)

    # Silhouette score against candidate k, drawn on the current figure.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhoutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse,
               fig_pca=fig_pca, fig_centers=fig_centers, fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10),
                                     prediction_col='prediction', init='k-means++',
                                     n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1,
                                     algorithm='auto', n_samples=None):
    """Fit k-means for each candidate k and keep the best silhouette score.

    For every ``k`` in ``n_clusters_list`` a model is fitted, its mean
    silhouette score recorded, and a two-panel figure rendered (per-cluster
    silhouette profile on the left, two-component PCA scatter on the
    right). The model with the highest score is used to label the table.

    Args:
        table: input data; indexed with ``input_cols``, copied, and given a
            new ``prediction_col`` column (presumably a pandas DataFrame --
            confirm with callers).
        input_cols: columns used as clustering features.
        n_clusters_list: indexable sequence of candidate cluster counts.
        prediction_col: name of the output column holding cluster labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs,
        algorithm: forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: passed to the samples plot; defaults to ``len(table)``.

    Returns:
        ``{'out_table': ..., 'model': ...}`` where the model dict holds
        ``best_k``, ``best_centers``, ``best_model``, ``input_cols`` and
        the rendered report.
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter,
                           tol=tol, precompute_distances=precompute_distances,
                           verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        # Per-sample scores are only needed for this k's figure, so they
        # are not accumulated across iterations.
        samples = silhouette_samples(inputarr, predict)
        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in enumerate(colors):
            members = predict == i
            si = samples[members]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][members], pca2[:, 1][members], color=color)

        # Mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x',
                    edgecolors=1, s=200, color=colors)

        images.append(plt2MD(plt))
        # plt.clf() only empties the figure; pyplot keeps it registered.
        # Close it so one figure per candidate k does not pile up.
        plt.close(fig)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score against candidate k, drawn on the current figure.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca,
               fig_centers=fig_centers, fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
if n_colors == 'bw': # Black and white option n_colors = 2 bw = True assert 0 < int(n_colors) < 256 # Upload the image and convert it to numpy array img = Image.open(img_path) img = np.array(img) w, h, d = img.shape X = img.reshape((w * h, d)) print(f"Succefly uploaded image from: \"{img_path}\"") # Initialize K-Means model and cauterize the data model = SKKMeans(int(n_colors), "random", 1, 100) model.fit(X) labels = model.labels_ print("Succefly created and fit K-Means model") # Cluster centers if bw: centers = np.array([[0, 0, 0], [255, 255, 255]]) else: centers = model.cluster_centers_ # Build new image new_img = [] for label in labels: new_img.append(centers[int(label)])