def silhouette(self):
    """Render a silhouette-score elbow chart over the standardized continuous data."""
    estimator = KMeans(random_state=1234556)
    elbow = KElbowVisualizer(estimator, metric='silhouette', timings=False)
    elbow.fit(self.data_continuous_std)
    elbow.show()
def makespaces(s2, k, alpha, beta, legend, title):
    """Cluster moment-space points twice and render two 3-D scatter plots.

    Pass 1 drops Beta and plots (Skew², Kurtosis, Alpha), saved as
    "<title>EDF.png"; pass 2 drops Alpha and plots (Skew², Kurtosis, Beta),
    saved as "<title>EPSB.png". Each pass picks its own k via the elbow
    method over k in [1, 8).

    NOTE(review): the `legend` parameter is never used — confirm whether it
    was meant to toggle ax.legend().
    """
    kk = pd.DataFrame({
        'Skew²': s2,
        'Kurtosis': k,
        'Alpha': alpha,
        'Beta': beta
    })
    K = 8
    # --- Pass 1: cluster without Beta, plot Alpha on the z-axis ---
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    kIdx = visualizer.fit(
        kk.drop(columns="Beta"))  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = kIdx.elbow_value_  # fit() returns the visualizer; read the elbow k
    model = KMeans(n_clusters=kIdx).fit(kk.drop(columns="Beta"))
    fig = plt.figure()
    ax = Axes3D(fig)
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, kIdx)]  # one color per cluster
    for i in range(0, kIdx):
        ind = (model.labels_ == i)  # boolean mask selecting cluster i's rows
        ax.scatter(kk["Skew²"][ind],
                   kk["Kurtosis"][ind],
                   kk["Alpha"][ind],
                   s=30,
                   c=clr[i],
                   label='Cluster %d' % i)
    ax.set_xlabel("Skew²")
    ax.set_ylabel("Kurtosis")
    ax.set_zlabel(r"$\alpha$")
    ax.legend()
    plt.title(title + ": EDF-K-means")
    plt.savefig(title + "EDF.png")
    plt.show()
    # --- Pass 2: cluster without Alpha, plot Beta on the z-axis ---
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    kIdx = visualizer.fit(
        kk.drop(columns="Alpha"))  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = kIdx.elbow_value_
    model = KMeans(n_clusters=kIdx).fit(kk.drop(columns="Alpha"))
    fig = plt.figure()
    ax = Axes3D(fig)
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, kIdx)]
    for i in range(0, kIdx):
        ind = (model.labels_ == i)
        ax.scatter(kk["Skew²"][ind],
                   kk["Kurtosis"][ind],
                   kk["Beta"][ind],
                   s=30,
                   c=clr[i],
                   label='Cluster %d' % i)
    ax.set_xlabel("Skew²")
    ax.set_ylabel("Kurtosis")
    ax.set_zlabel(r"$\beta$")
    ax.legend()
    plt.title(title + ": EPSB-K-means")
    plt.savefig(title + "EPSB.png")
    plt.show()
def run_kmeans_2(trainX):
    """Elbow-analyze K-Means on trainX, then validate at the known cluster count.

    trainX: training feature matrix used both for the elbow sweep and the
    final fit.

    NOTE(review): `dataset`, `X_test`, and `y_test` are read from the
    enclosing module scope — confirm the caller defines them.
    """
    # find k
    cluster_counts = {
        'wine': 3,
        'wage': 2,
    }
    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    # BUG FIX: the original fit the global `X_train` while the `trainX`
    # parameter went unused; fit the data that was actually passed in.
    visualizer.fit(trainX)
    visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/km_sl_' + dataset + '.png')
    # validation: refit at the dataset's known cluster count and score the
    # predicted labels against ground truth on the held-out split.
    model.set_params(n_clusters=cluster_counts[dataset])
    model.fit(trainX)
    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], model.predict(X_test))
    print(cluster_validation_df)
def plot_elbow(estimator, k_values, dataset, version, metric='distortion'):
    """Fit an elbow chart for `estimator` on a dataset split and save the PNG.

    NOTE(review): `data` and `PLOTS_FOLDER` are read from module scope.
    """
    elbow = KElbowVisualizer(estimator, k=k_values, metric=metric)
    elbow.fit(data.DATA[dataset][version]['x_train'])
    out_name = (f'{PLOTS_FOLDER}/{dataset}/{version}/'
                f'{dataset}_{version}_{estimator.__class__.__name__}_elbow_{metric}.png')
    elbow.show(out_name)
    plt.clf()
def elbowCheck():
    """Fit an elbow chart on the module-level `data` and save it as a PNG."""
    matrix = data.values.astype(float)
    elbow = KElbowVisualizer(KMeans(), k=(1, 12))
    elbow.fit(matrix)
    elbow.show(outpath="static/images/kmeans.png")
def find_optimal_clusters(X):
    """Display an elbow chart over k in [2, 21) to help pick a cluster count."""
    elbow = KElbowVisualizer(KMeans(), k=(2, 21))
    elbow.fit(X)   # fit the data to the visualizer
    elbow.show()   # finalize and render the figure
def makeK(d, ilist, title):
    """Elbow-select k, cluster (Variance, Skewness, Kurtosis) rows, and plot.

    d: sequence of rows whose first five columns are variance, skewness,
       kurtosis, alpha, beta (alpha/beta are carried into the returned
       frame but excluded from clustering).
    ilist: labels used as the index of the returned DataFrame.
    title: plot-title prefix; figure saved as "<title>clustersnoises.png".

    Returns a DataFrame of the five moments plus a "Cluster" label column.
    """
    d = np.array(d)
    kk = pd.DataFrame({'Variance': d[:, 0], 'Skewness': d[:, 1], 'Kurtosis': d[:, 2]})
    K = 20
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    kIdx = visualizer.fit(kk)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = kIdx.elbow_value_  # fit() returns the visualizer; read the elbow k
    model = KMeans(n_clusters=kIdx).fit(kk)
    # scatter plot
    fig = plt.figure()
    ax = Axes3D(fig)  # .add_subplot(111))
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, kIdx)]  # one color per cluster
    for i in range(0, kIdx):
        ind = (model.labels_ == i)  # boolean mask for cluster i
        # Axis order is (Kurtosis, Skewness, Variance) — columns 2, 1, 0.
        ax.scatter(d[ind, 2], d[ind, 1], d[ind, 0], s=30, c=clr[i], label='Cluster %d' % i)
    ax.set_xlabel("Kurtosis")
    ax.set_ylabel("Skew")
    ax.set_zlabel("Variance")
    plt.title(title + ': KMeans clustering with K=%d' % kIdx)
    plt.legend()
    plt.savefig(title + "clustersnoises.png")
    plt.show()
    d = pd.DataFrame({'Variance': d[:, 0], 'Skewness': d[:, 1], 'Kurtosis': d[:, 2],
                      'Alpha': d[:, 3], 'Beta': d[:, 4], "Cluster": model.labels_},
                     index=ilist)
    return d
def find_clusters(ClusterTeams):
    """
    Finds the optimal number of clusters using KMeans.

    Renders one elbow chart per scoring metric (Calinski-Harabasz first,
    then silhouette) over k in [2, 10).

    :param ClusterTeams: DataFrame. The data to find the number of clusters of.
    """
    from yellowbrick.cluster import KElbowVisualizer
    from sklearn.cluster import KMeans

    for score_metric in ('calinski_harabasz', 'silhouette'):
        estimator = KMeans(random_state=52594)
        elbow = KElbowVisualizer(estimator,
                                 k=(2, 10),
                                 metric=score_metric,
                                 timings=False)
        elbow.fit(ClusterTeams)
        elbow.show()
def kelbow_optimization(df):
    """Plot the K-Means elbow chart for df and save it as a PDF.

    Returns df unchanged so the call can be slotted into a pipeline.
    """
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, 10))
    visualizer.fit(df)
    # BUG FIX: the original called both poof() and show(); poof() is the
    # deprecated alias of show(), so the figure was finalized twice. One
    # show(outpath=...) call renders and saves the chart.
    visualizer.show(outpath="Elbow Kmeans Cluster.pdf")
    return df
def plot_elbow(run_ids):
    """Render and save a K-Means elbow curve for each run's latent vectors.

    run_ids: keys into the module-level `latent_vectors` mapping.

    BUG FIX: the original wrote `latent_vectors = latent_vectors[run_id]`,
    which makes `latent_vectors` function-local and raises
    UnboundLocalError on first use; bind the per-run array to a new name so
    the outer `latent_vectors` mapping stays readable.
    """
    for run_id in run_ids:
        vectors = latent_vectors[run_id]
        model = KMeans()
        visualizer = KElbowVisualizer(model, k=(1, 20))
        visualizer.fit(vectors)
        visualizer.show(exp_config.ANALYSIS_PATH + "elbow_curve.jpg")
def find_optimal_kelbow(self, fitted_vectorizer):
    """Run the elbow method with MiniBatchKMeans and save the chart as a PDF.

    The candidate k range is taken from self.params["k_range"].
    """
    estimator = MiniBatchKMeans(random_state=42)
    elbow = KElbowVisualizer(estimator, k=self.params["k_range"])
    elbow.fit(fitted_vectorizer)                      # fit the data to the visualizer
    elbow.show(outpath="kelbow_minibatchkmeans.pdf")  # finalize and render the figure
def kmeans_groups(min, max, df, metric='calinski_harabasz'):
    """Render an elbow chart for df over k in [min, max).

    NOTE(review): the first two parameters shadow the builtins min/max;
    names are kept unchanged for caller compatibility.
    """
    elbow = KElbowVisualizer(KMeans(), k=(min, max), metric=metric,
                             timings=False)
    elbow.fit(df)
    elbow.show()
def elbow_point(df):
    """Return the elbow k for df using the Calinski-Harabasz criterion."""
    elbow = KElbowVisualizer(KMeans(), k=(2, 10),
                             metric='calinski_harabasz', timings=False)
    elbow.fit(df)
    elbow.show()
    plt.close()  # drop the rendered figure before returning
    return elbow.elbow_value_
def elbow(X):
    """Min-max scale X and render a distortion elbow for k in [1, 10)."""
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(X)
    elbow_viz = KElbowVisualizer(KMeans(), k=(1, 10), timings=False)
    elbow_viz.fit(scaled)   # fit the data to the visualizer
    elbow_viz.show()        # finalize and render the figure
def elbow_chart(self, df, model_type):
    """Draw an elbow chart for df and return the detected elbow value.

    model_type: 'AGNES' selects agglomerative clustering; anything else
    falls back to K-Means.
    """
    estimator = AgglomerativeClustering() if model_type == 'AGNES' else KMeans()
    elbow = KElbowVisualizer(estimator, k=(1, 12))
    elbow.fit(df)
    elbow.show()
    return elbow.elbow_value_
def elbow_algorithm(the_data):
    """Run the K-Means elbow method on the_data and return the chosen k."""
    print("[ ELBOW ALGORITHM ... ]")
    elbow = KElbowVisualizer(KMeans(), k=(4, 14))
    elbow.fit(the_data)   # fit the data to the visualizer
    elbow.show()          # finalize and render the figure
    best_k = elbow.elbow_value_
    print(" Estimated number of clusters: " + str(best_k))
    return best_k
def elbow(self, audiofeatures):
    """Plot a distortion elbow over stacked per-key audio feature vectors.

    audiofeatures: mapping of key -> feature sequence; the per-key arrays
    are stacked row-wise into one matrix before fitting.
    """
    # BUG FIX: np.vstack was given a bare generator expression, which
    # modern NumPy rejects; materialize the rows as a list first.
    X = np.vstack([np.array(audiofeatures[key]) for key in audiofeatures])
    # Instantiate the clustering model and visualizer
    model = skc.KMeans()
    visualizer = KElbowVisualizer(model, k=(4, 25), timings=False)
    # BUG FIX: removed the stray read of visualizer.elbow_value_ that
    # happened *before* fit() — the attribute only exists after fitting.
    visualizer.fit(X)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
def elbow(f, g, krange):
    """Elbow analysis on a tab-delimited table, saving the chart as a PDF.

    f: path to the table (first column = index, first row = header).
    g: 'c' transposes the table so columns are clustered instead of rows.
    krange: (min_k, max_k) inclusive bounds for the candidate k values.
    """
    table = pd.read_table(f, index_col=0, header=0)
    if g == 'c':
        table = table.T
    lo, hi = krange[0], krange[1]
    elbow_viz = KElbowVisualizer(KMeans(random_state=1), k=(lo, hi + 1))
    elbow_viz.fit(table)
    out = os.path.basename(f).replace('.txt', '_elbow.pdf')
    elbow_viz.show(outpath=out, clear_figure=True)
def print_yellowbrickkelbow(model, som_weights_to_nodes, destination, up_to=26):
    """Run yellowbrick's KElbow over the SOM node weights and save the chart.

    model: object providing RUN_time and month_names_joined for the filename.
    som_weights_to_nodes: matrix of SOM node weight vectors to cluster.
    destination: output directory for the PNG.
    up_to: exclusive upper bound on candidate k.

    Returns the detected elbow k, or None when no elbow was located.
    """
    kelbow_visualizer = KElbowVisualizer(KMeans(), k=(2, up_to))
    kelbow_visualizer.fit(som_weights_to_nodes)
    kelbow_visualizer.show(
        f'{destination}/{model.RUN_time}_{utils.time_now()}-{model.month_names_joined}_yellowbrickelbowfor-k2.png'
    )
    # IDIOM FIX: compare against None with `is not`, not `!=`; also make
    # the no-elbow return explicit.
    if kelbow_visualizer.elbow_value_ is not None:
        return kelbow_visualizer.elbow_value_
    return None
def elbowplot(x, k, metric, title, fignam, elbow=True):
    """Save an elbow chart for x with the given metric, title and filename.

    elbow: when False, skip automatic elbow detection (locate_elbow).
    """
    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=k,
                                  metric=metric,
                                  timings=False,
                                  title=title,
                                  locate_elbow=elbow)
    visualizer.fit(x)
    visualizer.show(outpath=fignam)
def run_k_means_elbow(params, x_data):
    """Plot a distortion elbow for K-Means over x_data and save it as a PNG.

    params: dict with 'elbow_graph' (figure title and filename stem) and an
    optional 'path' output directory prefix.
    """
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 40), metric='distortion')
    plt.figure()
    visualizer.fit(x_data)
    visualizer.set_title(params['elbow_graph'])
    # BUG FIX: was a bare `except:` that swallowed every exception
    # (including KeyboardInterrupt/SystemExit); only a missing 'path' key
    # should trigger the fallback filename.
    try:
        path = params['path'] + params['elbow_graph'] + '.png'
    except KeyError:
        path = params['elbow_graph'] + '.png'
    visualizer.show(outpath=path)
def defineClusters(self, df):
    """Plot an elbow-method chart to gauge a suitable cluster count for df.

    (Translated from the original Portuguese docstring.)
    """
    from sklearn.cluster import KMeans
    from yellowbrick.cluster import KElbowVisualizer

    estimator = KMeans()
    elbow = KElbowVisualizer(estimator, k=(4, 16))
    elbow.fit(df)   # fit the data to the visualizer
    elbow.show()    # finalize and render the figure
def makespaces(s2, k, alpha, beta, legend, title, ilist):
    """Cluster moment-space points twice, plotting and exporting each pass.

    Pass 1 clusters without Beta, scatters (Skew², Kurtosis, Alpha) colored
    by Entity, saves "masoq.png", and writes "clusteringalpha.csv" indexed
    by cluster label. Pass 2 clusters without Alpha on the relabeled frame,
    scatters (Skew², Kurtosis, Beta), saves "masoq2.png", and writes
    "clusteringbeta.csv".

    NOTE(review): `legend` is never used. The second pass fits on the frame
    rebuilt with index=model.labels_ and sorted — confirm that reordering
    is intended before the second clustering.
    """
    kk = pd.DataFrame({'Skew²': s2, 'Kurtosis': k, 'Alpha': alpha, 'Beta': beta, "Entity": ilist})
    K = 8
    # --- Pass 1: elbow-select k without the Beta column ---
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    kIdx = visualizer.fit(kk.drop(columns=["Beta", "Entity"]))  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = kIdx.elbow_value_  # fit() returns the visualizer; read the elbow k
    model = KMeans(n_clusters=kIdx).fit(kk.drop(columns=["Beta", "Entity"]))
    print(len(model.labels_))
    fig = plt.figure(figsize=(20, 15))
    ax = Axes3D(fig)
    cmap = plt.get_cmap('gnuplot')
    ilist2 = list(set(ilist))  # unique entities; one color/legend entry each
    clr = [cmap(i) for i in np.linspace(0, 1, len(ilist2))]
    for i in range(0, len(ilist2)):
        ind = (kk["Entity"] == ilist2[i])  # rows belonging to this entity
        ax.scatter(kk["Skew²"][ind], kk["Kurtosis"][ind], kk["Alpha"][ind], s=30, c=clr[i], label=ilist2[i])
    ax.set_xlabel("Skew²")
    ax.set_ylabel("Kurtosis")
    ax.set_zlabel(r"$\alpha$")
    ax.legend()
    plt.title(title + ": EDF-K-means")
    plt.savefig("masoq.png")
    plt.show()
    # Rebuild the frame indexed by cluster label and export it.
    kk = pd.DataFrame({'Skew²': s2, 'Kurtosis': k, 'Alpha': alpha, 'Beta': beta, "Entity": ilist}, index=model.labels_)
    kk.sort_index(inplace=True)
    kk.to_csv("clusteringalpha.csv")
    # --- Pass 2: elbow-select k without the Alpha column ---
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    kIdx = visualizer.fit(kk.drop(columns=["Alpha", "Entity"]))  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = kIdx.elbow_value_
    model = KMeans(n_clusters=kIdx).fit(kk.drop(columns=["Alpha", "Entity"]))
    fig = plt.figure(figsize=(20, 15))
    ax = Axes3D(fig)
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, len(ilist2))]
    for i in range(0, len(ilist2)):
        ind = (kk["Entity"] == ilist2[i])
        # NOTE(review): label uses ilist[i] (raw, possibly duplicated list)
        # while pass 1 used ilist2[i] — likely a bug; confirm intent.
        ax.scatter(kk["Skew²"][ind], kk["Kurtosis"][ind], kk["Beta"][ind], s=30, c=clr[i], label=ilist[i])
    ax.set_xlabel("Skew²")
    ax.set_ylabel("Kurtosis")
    ax.set_zlabel(r"$\beta$")
    ax.legend()
    plt.title(title + ": EPSB-K-means")
    plt.savefig("masoq2.png")
    plt.show()
    kk = pd.DataFrame({'Skew²': s2, 'Kurtosis': k, 'Alpha': alpha, 'Beta': beta, "Entity": ilist},
                      index=model.labels_)
    kk.sort_index(inplace=True)
    kk.to_csv("clusteringbeta.csv")
def elbow_test():
    """Run an elbow analysis on the training split of every file under `path`.

    NOTE(review): `path` and `pp` come from module scope.
    """
    for filename in os.listdir(path):
        print("elbow test for: " + filename)
        data_file = path + filename.replace('\\', '/')
        X, y = pp.pre_process(data_file)
        # Hold out 20% so the elbow is computed on training data only.
        X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
        elbow = KElbowVisualizer(KMeans(), k=(4, 20))
        elbow.fit(X_t)   # fit the data to the visualizer
        print(elbow.elbow_value_)
        elbow.show()
def kmeans_elbow(df, metric='distortion', save_fig=False):
    '''
    Input: scaled data
    Run the elbow method and returns the optimal number of clusters
    Other metrics available: 'silhouette' and 'calinski_harabasz'
    Metric:
    - **distortion**: mean sum of squared distances to centers
    - **silhouette**: mean ratio of intra-cluster and nearest-cluster distance
    - **calinski_harabasz**: ratio of within to between cluster dispersion

    save_fig: when True, also write the elbow chart to 'kmeans_elbow.png'.
    (BUG FIX: this parameter was previously accepted but silently ignored.)
    '''
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 12), metric=metric)
    visualizer.fit(df)  # Fit the data to the visualizer
    # outpath=None is show()'s default no-op, so behavior is unchanged
    # when save_fig is False.
    visualizer.show(outpath='kmeans_elbow.png' if save_fig else None)
    k = visualizer.elbow_value_  # optimal number of clusters
    return (k)
def clustering(d: pd.DataFrame, n_clusters=None, scale='linear', show_img=False):
    """Cluster min-max-scaled rows of d with K-Means.

    n_clusters: cluster count; when None, chosen by the distortion elbow.
    scale: x-axis scale for the optional scatter plot.
    show_img: when True, display the elbow chart and the cluster scatter
    (assumes d has 'comp1'/'comp2' columns for plotting — confirm).

    Returns (cluster centers in original units, per-row labels).
    """
    scaler = MinMaxScaler()
    kdata = scaler.fit_transform(d.values)
    km = KMeans(max_iter=400)
    visualizer = KElbowVisualizer(km, k=(1, 20), metric='distortion', timings=False)
    visualizer.fit(kdata)
    if show_img:
        visualizer.show()
    # Effective cluster count: caller's choice or the detected elbow.
    n = n_clusters if n_clusters is not None else visualizer.elbow_value_
    kmeans = KMeans(n_clusters=n, max_iter=400)
    kmeans.fit(kdata)
    l, c = np.unique(kmeans.labels_, return_counts=True)
    print("Total: ", len(d), dict(zip(l, c)))
    # Map centers back to the original (unscaled) units.
    centers = scaler.inverse_transform(kmeans.cluster_centers_)
    if show_img:
        cmap = plt.cm.get_cmap('hsv', n + 1)
        df_copy = d.copy()
        df_copy["cluster"] = kmeans.labels_
        for i in l:
            plt.scatter(df_copy.loc[df_copy['cluster'] == i, 'comp1'],
                        df_copy.loc[df_copy['cluster'] == i, 'comp2'],
                        label="Cluster %s" % i, s=20, color=cmap(i))
        # BUG FIX: the title previously interpolated `n_clusters`, which
        # prints "None" whenever the elbow chose the count; use the
        # effective `n` instead.
        plt.title(f"K-Means ({n} clusters)")
        plt.scatter(centers[:, 0], centers[:, 1], s=200, marker='*', c='k')
        plt.tick_params(axis='both', which='major', labelsize=10)
        plt.xscale(scale)
        plt.legend()
        plt.show()
    return centers, kmeans.labels_
def getEstimation2(self, k=(2, 11), metric='calinski_harabasz'):
    """Estimate a cluster count via the Calinski-Harabasz elbow and save the figure.

    k: candidate cluster range; metric: yellowbrick elbow metric.
    Returns the fitted KElbowVisualizer so callers can read elbow_value_.
    """
    start = time.perf_counter()
    print('Getting estimation and metrics with Calinski-Harabasz method')
    visualizer = KElbowVisualizer(KMeans(random_state=0), k=k, metric=metric)
    visualizer.fit(self.AllShuffledStackReshaped)
    basefilename = os.path.basename(self.datFile).split('.')[0]
    # Save the chart next to the source data so it can be retrieved outside
    # the front-end app.
    calinskiFigPath = os.path.join(self.path,
                                   basefilename + '_calinskiEstimation.jpg')
    visualizer.show(outpath=calinskiFigPath)
    stop = time.perf_counter()
    print(f'Finished estimation in {stop - start:0.4f} seconds')
    return visualizer
def perform_elbow_method(x_scaled):
    """
    Perform the elbow method to help us decide the number of clusters
    :param x_scaled: Values of our data after normalization
    :return: A plot of the elbow method
    """
    # Enlarge tick/title/label fonts for the rendered chart.
    for rc_key, rc_size in (('xtick.labelsize', 12), ('ytick.labelsize', 12),
                            ('axes.titlesize', 18), ('axes.labelsize', 14)):
        mpl.rcParams[rc_key] = rc_size
    elbow = KElbowVisualizer(KMeans(), k=(1, 12))
    elbow.fit(x_scaled)  # fit the data to the visualizer
    elbow.set_title("The Elbow Method")
    elbow.show()
def Elbow(sample, feature):
    """Demo: elbow chart on a synthetic blob dataset.

    sample: number of points; feature: dimensionality. The blobs use 8
    centers, so the located elbow should land near k=8.
    """
    # Generate synthetic dataset with 8 random clusters
    X, _ = make_blobs(n_samples=sample, n_features=feature, centers=8,
                      random_state=42)
    elbow = KElbowVisualizer(KMeans(), k=(4, 40),
                             metric='calinski_harabasz', timings=False,
                             locate_elbow=True)
    elbow.fit(X)   # fit the data to the visualizer
    elbow.show()   # finalize and render the figure
def graph_kmeans(df, message):
    """Fit K-Means at the elbow-selected k and probe cluster separability.

    Trains a logistic-regression probe mapping features to cluster labels
    and prints its accuracy alongside the centers.

    NOTE(review): `kmeans_range` and `x` are read from enclosing scope —
    confirm they are defined at module level.
    """
    base_model = KMeans()
    elbow = KElbowVisualizer(
        base_model,
        k=(min(kmeans_range), max(kmeans_range)),
        title=message + " distortion score elbow method for k means clustering")
    elbow.fit(df)
    elbow.show()
    best_k = elbow.elbow_value_
    fitted = KMeans(n_clusters=best_k)
    fitted.fit(df)
    labels = fitted.labels_
    probe = LogisticRegression().fit(x, labels)
    print(message + "| clusters: ", best_k, "| accuracy: ",
          probe.score(x, labels), "| clusters at: ", fitted.cluster_centers_)
    return fitted