def do_kmeans(df, k):
    """Cluster ``df`` with k-means, plot the clusters and a silhouette diagram.

    :param df: DataFrame; its first two columns (labelled Age / Income on the
        plot) are used as the scatter axes.
    :param k: number of clusters to fit.

    Side effects: shows a matplotlib scatter plot, a yellowbrick silhouette
    plot, and prints the WCSS / silhouette summary. Returns ``None``.
    """
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=10,
                     max_iter=1000, random_state=40)
    k_means.fit(df)
    wcss = k_means.inertia_
    sil = silhouette_score(df, k_means.labels_)

    plt.style.use('default')
    # Scale each marker by its per-sample silhouette value so weakly
    # assigned points show up as smaller dots.
    sample_silhouette_values = silhouette_samples(df, k_means.labels_)
    sizes = 200 * sample_silhouette_values

    plt.figure(figsize=(16, 10))
    plt.grid(True)
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], s=sizes, c=k_means.labels_)
    # Mark the cluster centroids with large black crosses.
    plt.scatter(k_means.cluster_centers_[:, 0], k_means.cluster_centers_[:, 1],
                marker='x', s=300, c="black")
    plt.title("K-Means (K={}, WCSS={:.2f}, Sil={:.2f})".format(k, wcss, sil),
              fontsize=20)
    plt.xlabel('Age', fontsize=22)
    plt.ylabel('Income', fontsize=22)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.show()

    # Yellowbrick silhouette diagram for the fitted model.
    visualizer = SilhouetteVisualizer(k_means)
    visualizer.fit(df)
    visualizer.poof()
    # Removed dead assignment `fig = visualizer.ax.get_figure()` — the
    # figure handle was never used.
    print("K={}, WCSS={:.2f}, Sil={:.2f}".format(k, wcss, sil))
def showSilhouette():
    """Draw a silhouette plot for MiniBatchKMeans on a synthetic 8-blob set."""
    # Synthetic dataset with eight blob centres.
    X, y = make_blobs(centers=8)

    # Wrap the clustering model in the visualizer, fit, and render.
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)
    visualizer.poof()
def Silhouette_plot(x, from_k, to_k):
    """Render a silhouette plot for each k in [from_k, to_k] inclusive.

    :param x: feature matrix to cluster.
    :param from_k: smallest cluster count to try.
    :param to_k: largest cluster count to try.
    :return: list of ``[rounded_silhouette_score, k]`` pairs, one per k.
    """
    sil_score = []
    for k in range(from_k, to_k + 1):
        # Instantiate a fresh model per k, fit it through the visualizer,
        # and display the resulting silhouette diagram.
        m = KMeans(n_clusters=k)
        viz = SilhouetteVisualizer(m)
        viz.fit(x)
        viz.poof()
        # Keep the score alongside its k so the caller can pick the best.
        sil_score.append([viz.silhouette_score_.round(3), k])
    return sil_score
def silhouette(matrix, k):
    """
    This function is also not explicitly used since it shows the decided 'k'
    is good or not.

    :param matrix: tf-idf matrix
    :param k: decided k (from elbow matrix)
    :return: show graph with all cluster's internal similarities and
        uniqueness with other clusters.
    """
    # Local renamed from `silhouette` so it no longer shadows this function.
    model_kmeans = KMeans(n_clusters=k, max_iter=200)
    visualizer = SilhouetteVisualizer(model_kmeans)
    visualizer.fit(matrix)
    visualizer.poof()
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs
from yellowbrick.cluster import SilhouetteVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Pre-bind the dataset parameters so callers only pick the blob centers.
make_blobs = partial(
    sk_make_blobs,
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    shuffle=SHUFFLE,
)

if __name__ == '__main__':
    # Build an 8-blob dataset.
    X, y = make_blobs(centers=8)

    # Fit the clustering model through the visualizer and save the figure.
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)
    visualizer.poof(outpath="images/silhouette.png")
# print(sample_silhouette_value) return sils ss = sil_score(x, 2, 5) print(f'score={ss}') print(f'optinum number of clusters ={max(ss)[1]}') # #Visualize Silhouette #Instantiate the clustering model and visualizer model = KMeans(n_clusters=3) visualizer = SilhouetteVisualizer(model) #fit the training data to visualizer visualizer.fit(x) #Draw/show/poof the data visualizer.poof() print(visualizer.silhouette_score_) #near 1 is good def Silhouette_plot(x, from_k, to_k): sil_score = [] for k in range(from_k, to_k + 1): #Instatiate the clustering model and visualizer m = KMeans(n_clusters=k) visualizer = SilhouetteVisualizer(m) visualizer.fit(x) #Draw/show/poof the data visualizer.poof() sil_score.append([visualizer.silhouette_score_.round(3), k]) return sil_score
axis=1)) / df_normalized.shape[0]) # Plot the elbow plt.plot(K, distortions, 'bx-') plt.xlabel('k') plt.ylabel('Distortion') plt.title('The Elbow Method showing the optimal k') plt.show() # Compute Silhoette Graph for different number of clusters to select # optimal number of clusters from sklearn.cluster import KMeans from yellowbrick.cluster import SilhouetteVisualizer for n_clusters in range(2, 9): model = SilhouetteVisualizer(KMeans(n_clusters)) model.fit(df_normalized) model.poof() # Utlize TSNE to visualize data from sklearn.cluster import KMeans from sklearn.manifold import TSNE import pylab as pl num_of_clusters = 4 kmeans = KMeans(n_clusters=num_of_clusters) kmeans.fit(df_normalized) X = TSNE(n_components=2).fit_transform(df_normalized) for i in range(0, X.shape[0]): if kmeans.labels_[i] == 0: c1 = pl.scatter(X[i, 0], X[i, 1], c='red') elif kmeans.labels_[i] == 1:
plt.title('Gap Values by Cluster Count')
plt.savefig("Gap Values.png")
plt.show()

# =============================================================================
# =============================================================================
# Using the silhouette to find the optimal number of clusters
for n_clusters in range(4, 10):
    model = KMeans(n_clusters, init='k-means++')
    cluster_labels = model.fit_predict(X)

    # Fit the training data to the visualizer, save and display the figure.
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)
    visualizer.show(outpath="BoW_Silhouette %d" % n_clusters)
    visualizer.poof()

    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

# =============================================================================
# =============================================================================
# Clustering Using K-Means
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

# reduce the features to 2D
reduced_features = pca.fit_transform(X)
# reduce the cluster centers to 2D
plt.figure(dpi=150) plt.xlabel("Number of clusters") plt.ylabel("SSE") plt.plot(range(2, 30), SSE) plt.savefig("cluster_plot_tfidf") km_tfidf = MiniBatchKMeans(n_clusters=16, random_state=4444) nmf_tfidf_clusters2 = km_tfidf.fit_predict(nmf_tfidf_data) #Silhouette Plot visualiser_tfidf = SilhouetteVisualizer(MiniBatchKMeans(n_clusters=16), random_state=4444) visualiser_tfidf.fit(nmf_tfidf_data) visualiser_tfidf.poof() #TSNE Plot model_2 = TSNE(n_components=2, random_state=0, verbose=0) low_data_2 = model_2.fit_transform(nmf_tfidf_data) colors = ([ 'crimson', 'b', 'mediumseagreen', 'cyan', 'm', 'y', 'k', 'orange', 'springgreen', 'deepskyblue', 'yellow', 'teal', 'navy', 'plum', 'darkslategray', 'lightcoral', 'papayawhip' ]) plt.figure(dpi=150) for i, c, label in zip(range(16), colors, list(range(16))):
def silhouette_method(matrix, k):
    """Fit a k-means model on *matrix* and display its silhouette diagram.

    :param matrix: feature matrix to cluster.
    :param k: number of clusters.
    """
    km = KMeans(n_clusters=k, max_iter=200)
    viz = SilhouetteVisualizer(km)
    viz.fit(matrix)
    viz.poof()
# Load the data from the files in the corpus for cat in categories: for name in os.listdir(os.path.join(path, cat)): files.append(os.path.join(path, cat, name)) target.append(cat) with open(os.path.join(path, cat, name), 'r') as f: data.append(f.read()) # Return the data bunch for use similar to the newsgroups example return Bunch( categories=categories, files=files, data=data, target=target, ) corpus = load_corpus('hobbies') tfidf = TfidfVectorizer(stop_words='english') docs = tfidf.fit_transform(corpus.data) # Instantiate the clustering model and visualizer visualizer = SilhouetteVisualizer(KMeans(n_clusters=6)) visualizer.fit(docs) visualizer.poof() # Instantiate the clustering model and visualizer visualizer = KElbowVisualizer(KMeans(), metric='silhouette', k=[4,10]) visualizer.fit(docs) visualizer.poof()