import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import clusters
import plot

# Fit a k-means pipeline (fixed cluster count) on the project data set.
num_clusters = 4
data = clusters.get_data()
pipe = clusters.get_pipeline_kmeans(num_clusters)
pipe.fit(data)

# Only the third element of the returned triple is needed here.
_, _, data_pca = clusters.get_pca_data_kmeans(num_clusters)

# NOTE: `data_with_clusters` is an alias of `data`, not a copy, so the
# 'predicted_cluster' column added below also appears on `data`.
# The prediction is evaluated before the column is attached.
data_with_clusters = data
cluster_labels = pipe.predict(data)
data_with_clusters['predicted_cluster'] = cluster_labels

# Per-cluster summary statistics; the grouping is built once and reused
# for every aggregation.
by_cluster = data_with_clusters.groupby('predicted_cluster', as_index=False)
mean_vals = by_cluster.mean()
max_vals = by_cluster.max()
min_vals = by_cluster.min()
std_dev_vals = by_cluster.std()

# Debug aid: dump the per-cluster minimums as a markdown table.
# print(min_vals.to_markdown())

# Set the plot style and open an 8x8 figure for the plotting that follows.
plt.style.use("fivethirtyeight")
plt.figure(figsize=(8, 8))
color = cm.nipy_spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([-1, -0.8, -.6, -.4, -.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) plt.show() data = get_data() plot_average_silhouette(data, 8) plot_average_silhouette(data, 8, usePCA=False)