# Clustering evaluation example: fit a KMeans model on synthetic blobs and
# save an intercluster distance map.
from functools import partial

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import InterclusterDistance

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Pre-configured blob generator; callers only need to supply ``centers``.
make_blobs = partial(
    sk_make_blobs,
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    shuffle=SHUFFLE,
)


if __name__ == '__main__':
    # Make a dataset with 12 blob centers
    X, y = make_blobs(centers=12)

    # Instantiate the clustering model (9 clusters) and the visualizer
    visualizer = InterclusterDistance(KMeans(9))

    # Fit the training data to the visualizer
    visualizer.fit(X)

    # Render the figure to disk. ``show`` replaces the deprecated ``poof``
    # alias and matches the call style used elsewhere in this file.
    visualizer.show(outpath="images/icdm.png")
def cluster_metrics(i_patches, a_patches, g_patches, city_names, K, save_path,
                    g_indices):
    """Score k-means clusterings for each k in ``K`` and save diagnostic plots.

    For every ``k`` a clustering is obtained from ``get_kmeans144_result`` and
    scored with inertia (sum of squared distances to the nearest cluster
    centre), silhouette, Calinski-Harabasz and Davies-Bouldin. Each score
    series is passed to ``plot_figure``; an intercluster distance map is
    written per model, followed by one t-SNE scatter plot.

    Args:
        i_patches: Forwarded unchanged to ``dict_cluster``.
        a_patches: Input to ``get_kmeans144_result``; also forwarded to
            ``dict_cluster``.
        g_patches: Forwarded unchanged to ``dict_cluster``.
        city_names: Forwarded unchanged to ``dict_cluster``.
        K: Sized iterable of cluster counts to evaluate. Must be non-empty.
        save_path: Prefix used to build every output file name.
        g_indices: Values used as the ``hue`` of the t-SNE scatter plot
            (converted to a float32 array).

    Raises:
        ValueError: If ``K`` is empty.
    """
    # Fail early with a clear message instead of part-way through plotting.
    if len(K) == 0:
        raise ValueError("K must contain at least one cluster count")

    sum_of_squared_distances = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    davies_bouldin_scores = []
    k_mean_list = []

    for k in K:
        # The first returned value (the model) is not needed here.
        _, k_means, A = get_kmeans144_result(a_patches, k)
        k_mean_list.append(k_means)
        sum_of_squared_distances.append(k_means.inertia_)

        labels = k_means.labels_
        silhouette_scores.append(
            metrics.silhouette_score(A, labels, metric='euclidean'))
        calinski_harabasz_scores.append(
            metrics.calinski_harabasz_score(A, labels))
        davies_bouldin_scores.append(
            metrics.davies_bouldin_score(A, labels))

        mydict = dict_cluster(i_patches, a_patches, g_patches, city_names,
                              k_means)
        save_path_k = '{}_{}'.format(save_path, k)
        # Return value is not used; the call is kept because it receives a
        # per-k output prefix (presumably it writes results -- TODO confirm).
        gt_metric(mydict, save_path_k)

    plot_figure(K, sum_of_squared_distances, save_path,
                'sum_of_squared_distances')
    plot_figure(K, silhouette_scores, save_path, 'silhouette_scores')
    plot_figure(K, calinski_harabasz_scores, save_path,
                'calinski_harabasz_scores')
    plot_figure(K, davies_bouldin_scores, save_path, 'davies_bouldin_score')

    # Inertia and Davies-Bouldin are "lower is better"; silhouette and
    # Calinski-Harabasz are "higher is better".
    ssd_best_index = sum_of_squared_distances.index(
        min(sum_of_squared_distances))
    sil_best_index = silhouette_scores.index(max(silhouette_scores))
    ch_best_index = calinski_harabasz_scores.index(
        max(calinski_harabasz_scores))
    db_best_index = davies_bouldin_scores.index(min(davies_bouldin_scores))
    all_indices = [
        ssd_best_index, sil_best_index, ch_best_index, db_best_index
    ]
    # NOTE(review): best_k is computed but not used to restrict the plots
    # below -- every fitted model is visualised. Inertia typically shrinks as
    # k grows, so its "best" index tends to be the largest k.
    best_k = np.array(K)[np.unique(all_indices)]

    # Visualize output clusters of K means in 2D. ``A`` is the value returned
    # for the last k in the scoring loop.
    for ind, k_means in enumerate(k_mean_list):
        visualizer = InterclusterDistance(k_means)
        visualizer.fit(A)  # Fit the data to the visualizer
        # File name carries the position in K, not the value of k.
        visualizer.show(
            outpath='{}_{}_InterclusterDistance.png'.format(save_path, ind))
        # Second render without an output path (was the deprecated ``poof``).
        visualizer.show()

    # Visualize through TSNE
    A_embedded = TSNE().fit_transform(A)
    plt.figure()
    # NOTE(review): the palette is fixed at two colours, which assumes
    # g_indices holds at most two distinct values -- confirm with callers.
    palette = sns.color_palette("bright", 2)
    y = np.asarray(g_indices).astype(np.float32)
    # x/y must be passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    sns.scatterplot(x=A_embedded[:, 0],
                    y=A_embedded[:, 1],
                    hue=y,
                    legend='full',
                    palette=palette)
    plt.savefig('{}_tsne.png'.format(save_path))