def crank_feats(fargs):
    """Cluster stations on eigen-features and score the clusters by silhouette.

    Args:
        fargs: a 6-tuple ``(rss, ccs, lv, installed_in, dfile, nfeatures)``:
            ``rss`` values used in the ``_rs_<rs>`` file names (presumably
            random seeds -- confirm against ``produce_kmeans_climates``),
            ``ccs`` cluster counts, ``lv`` the columns of the "old" metric,
            ``installed_in`` the argument handed to ``noaa_init``, ``dfile``
            a CSV with a ``station`` column, and ``nfeatures`` the number of
            ``e0 .. e<n-1>`` eigen-feature columns to use.

    For every ``rs`` the matching ``climates/eigen<n>_rs_<rs>.csv`` file is
    read back, two silhouette columns per cluster count are added
    (``sil_eigen_<cc>`` on the eigen-features, ``sil_old_<cc>`` on ``lv``),
    and the result is written to ``climates/eigen<n>_sil_rs_<rs>.csv``.
    """
    rss, ccs, lv, installed_in, dfile, nfeatures = fargs
    noaa_init(installed_in)

    wat = pd.read_csv(dfile).set_index('station')
    es = ['e' + str(idx) for idx in range(nfeatures)]
    prefix = 'eigen' + str(nfeatures)

    # Flatten both feature sets up front; only the flattened data is used
    # below, the returned means and standard deviations are discarded.
    flatnew, _, _ = flatten(wat[es])
    flatold, _, _ = flatten(wat[lv])

    # note: this method flattens wat internally
    produce_kmeans_climates(wat, es, ccs, rss, prefix)

    for rs in rss:
        source_csv = noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv')
        kf = pd.read_csv(source_csv)
        for cc in ccs:
            # This silhouette computation is memory hungry, presumably because
            # each worker builds an entire new metric matrix.
            assignment = kf['vtx' + str(cc)].values
            kf['sil_eigen_' + str(cc)] = silhouette_samples(flatnew, assignment)
            # Silhouette of the same labels on the old metric, for comparison.
            kf['sil_old_' + str(cc)] = silhouette_samples(flatold, assignment)
        target_csv = noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv')
        kf.to_csv(target_csv, index=False)
def bestRep(dat, labels, outName):
    """Return, per cluster, the name of the sample with the best silhouette.

    Args:
        dat: data passed straight to ``metrics.silhouette_samples``.
        labels: cluster label of every sample (any array-like).
        outName: name of every sample, aligned with ``labels``.

    Returns:
        A list with one entry per distinct label (in ``np.unique`` order):
        the ``outName`` entry of that cluster's highest-silhouette sample.
    """
    # Coerce to arrays so boolean masking and positional indexing work for
    # plain lists and for pandas objects with a non-default index alike.
    labels = np.asarray(labels)
    outName = np.asarray(outName)

    silSamp = metrics.silhouette_samples(dat, labels)
    bestExample = []
    for num in np.unique(labels):
        clusterMask = labels == num
        winner = np.argmax(silSamp[clusterMask])
        bestExample.append(outName[clusterMask][winner])
    return bestExample
def cluster_driver(a_driver):
    """Cluster one driver's trip statistics with DBSCAN and score every trip.

    Args:
        a_driver: mapping whose ``'DStats'`` entry holds the per-trip feature
            matrix.

    Returns:
        The per-sample Mahalanobis silhouette values rescaled from
        ``[-1, 1]`` to ``[0, 1]`` via ``(s + 1) / 2``.

    The features are standardised first. The silhouette functions raise
    ``ValueError`` when DBSCAN yields fewer than two distinct labels; that is
    left to the caller, as before.
    """
    X = StandardScaler().fit_transform(a_driver['DStats'])
    labels = DBSCAN(eps=0.45).fit(X).labels_

    print("###############################################################################")
    # The silhouette score is the mean of the per-sample values, so the
    # expensive Mahalanobis distances are only computed once.
    sample_scores = metrics.silhouette_samples(X, labels, metric="mahalanobis")
    print("Silhouette Coefficient: %0.3f" % sample_scores.mean())
    return (sample_scores + 1) / 2
def test_silhouette_samples(self):
    """The frame's ``metrics.silhouette_samples`` accessor must match the
    module-level ``metrics.silhouette_samples`` on the same data and labels,
    and must come back as a ``ModelSeries`` aligned with the frame's index.
    """
    frame = self.df
    actual = frame.metrics.silhouette_samples()
    reference = metrics.silhouette_samples(self.data, self.pred)

    self.assertTrue(isinstance(actual, pdml.ModelSeries))
    self.assert_index_equal(actual.index, frame.index)
    self.assert_numpy_array_almost_equal(actual.values, reference)
def silhouette_analysis(clustering, labels=None):
    """Compute per-sample and mean silhouette from a precomputed distance matrix.

    Args:
        clustering: mapping with a ``'distance_df'`` entry (pairwise
            distances) and, when ``labels`` is not given, a ``'labels'`` entry.
        labels: optional labels overriding ``clustering['labels']``.

    Returns:
        ``(sample_scores, score)`` where ``score`` is the mean of
        ``sample_scores``.
    """
    distances = clustering['distance_df']
    chosen_labels = clustering['labels'] if labels is None else labels
    per_sample = silhouette_samples(distances, labels=chosen_labels,
                                    metric='precomputed')
    return per_sample, np.mean(per_sample)
def cluster(algorithm, data, topics, make_silhouette=False):
    """Fit a clustering algorithm, print quality metrics, optionally plot.

    Args:
        algorithm: estimator exposing ``fit_predict`` and ``labels_``.
        data: samples to cluster.
        topics: ground-truth labels used for the supervised metrics.
        make_silhouette: when true, save a silhouette bar plot to
            ``cluster.png``.

    Prints homogeneity, completeness, V-measure, adjusted Rand index and the
    mean silhouette, followed by the number of clusters and of fitted samples.
    Returns nothing.
    """
    print(str(algorithm))
    clusters = numpy.asarray(algorithm.fit_predict(data))
    labels = algorithm.labels_

    print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
    print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
    # The silhouette score is the mean of the per-sample values; compute the
    # pairwise distances once and reuse them for the plot.
    silhouettes = metrics.silhouette_samples(data, labels)
    print('Silhouette test: %0.3f' % silhouettes.mean())
    print(' ***************** ')

    unique_clusters = numpy.unique(clusters)
    num_clusters = len(unique_clusters)
    print('num clusters: %d' % num_clusters)
    print('num fitted: %d' % len(clusters))

    if not make_silhouette:
        return

    # Sort by cluster, and by descending silhouette inside each cluster.
    order = numpy.lexsort((-silhouettes, clusters))
    ordered = clusters[order]

    # Rows occupied by each cluster. The original compared against
    # num_clusters instead of the loop variable, which left every index array
    # empty and made the tick computation raise.
    indices = [numpy.flatnonzero(ordered == label) for label in unique_clusters]
    ytick = [(ind.max() + ind.min()) / 2.0 for ind in indices]
    ytickLabels = ["%d" % label for label in unique_clusters]

    # Map labels to colours through the actual label values, so labels such
    # as -1 (noise) do not index the colour table from the end.
    cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
    colour_of = dict(zip(unique_clusters.tolist(), cmap))
    clr = [colour_of[label] for label in ordered.tolist()]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])  # first cluster at the top
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
def visualize_silhouette_score(X,y_km):
    """Show a horizontal silhouette bar plot, one band per cluster.

    Args:
        X: samples that were clustered.
        y_km: cluster label of every sample (must support ``y_km == c``
            element-wise, e.g. a NumPy array).

    Each cluster's Euclidean silhouette values are drawn sorted ascending in
    its own band; a dashed red line marks the overall mean. Tick labels are
    the cluster labels shifted by one.
    """
    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = metrics.silhouette_samples(X, y_km, metric='euclidean')

    band_start = 0
    yticks = []
    for position, label in enumerate(cluster_labels):
        band = np.sort(silhouette_vals[y_km == label])
        band_end = band_start + len(band)
        plt.barh(range(band_start, band_end), band, height=1.0,
                 edgecolor='none', color=cm.jet(position / n_clusters))
        # Tick in the middle of the band.
        yticks.append((band_start + band_end) / 2)
        band_start = band_end

    plt.axvline(np.mean(silhouette_vals), color="red", linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
def silhouette_original_clusterings(dataset='CB1', neuropil='Antennal_lobe', clusterer_or_k=60):
    """Return the silhouette index of every member of the original clustering.

    The returned pandas dataframe has the columns
    ``(cluster_id, member_id, silhouette)``. Distances come from
    ``dicedist_metric`` on the expression matrix and are passed to
    ``silhouette_samples`` as a precomputed matrix.
    """
    print('Reading expression matrix')
    expression = ExpressionDataset.dataset(dset=dataset, neuropil=neuropil)
    Xdf = expression.Xdf(index_type='string')

    # Flatten the cluster_id -> members mapping into one row per member,
    # ordered like the expression matrix.
    print('Finding cluster assignments')
    clusters_df, _ = get_original_clustering(dataset=dataset,
                                             neuropil=neuropil,
                                             clusterer_or_k=clusterer_or_k)
    per_cluster = [
        pd.DataFrame({'cluster_id': cluster_id, 'member_id': members})
        for cluster_id, members in zip(clusters_df.cluster_id,
                                       clusters_df.original_voxels_in_cluster)
    ]
    members_df = pd.concat(per_cluster).set_index('member_id').loc[Xdf.index]

    # Distance matrix - this must be parameterised
    print('Computing distance')
    import mkl
    mkl.set_num_threads(6)
    D = dicedist_metric(Xdf)

    # Here we could go for the faster implementation in third_party, if needed
    print('Computing silhouette index')
    members_df['silhouette'] = silhouette_samples(D.values,
                                                  members_df.cluster_id.values,
                                                  metric='precomputed')

    flat = members_df.reset_index()
    flat = flat.rename(columns={'index': 'member_id'})
    return flat[['cluster_id', 'member_id', 'silhouette']]
def cluster_driver(a_driver):
    """Cluster one driver's trip statistics with DBSCAN and score every trip.

    Args:
        a_driver: mapping whose ``'DStats'`` entry holds the per-trip feature
            matrix.

    Returns:
        The per-sample silhouette values (default metric) rescaled from
        ``[-1, 1]`` to ``[0, 1]`` via ``(s + 1) / 2``.

    The features are standardised first. ``silhouette_samples`` raises
    ``ValueError`` when DBSCAN yields fewer than two distinct labels; that is
    left to the caller, as before.
    """
    X = StandardScaler().fit_transform(a_driver['DStats'])
    labels = DBSCAN(eps=0.6, min_samples=5).fit(X).labels_
    print("###############################################################################")
    return (metrics.silhouette_samples(X, labels) + 1) / 2
def fit(self, X, y=None, **kwargs):
    """
    Fits the model and generates the silhouette visualization.

    Fits the wrapped estimator on ``X`` (``y`` and ``kwargs`` are forwarded),
    stores ``n_samples_``, ``n_clusters_``, ``silhouette_samples_`` and
    ``silhouette_score_``, draws the figure from the predicted labels and
    returns ``self``.
    """
    # TODO: decide to use this method or the score method to draw.
    # NOTE: Probably this would be better in score, but the standard score
    # is a little different and I'm not sure how it's used.

    # Fit the wrapped estimator
    self.estimator.fit(X, y, **kwargs)

    # Get the properties of the dataset
    self.n_samples_ = X.shape[0]
    self.n_clusters_ = self.estimator.n_clusters

    # Compute the scores of the cluster. The overall score is the mean of the
    # per-sample values, so the pairwise distances are only computed once.
    labels = self.estimator.predict(X)
    self.silhouette_samples_ = silhouette_samples(X, labels)
    self.silhouette_score_ = self.silhouette_samples_.mean()

    # Draw the silhouette figure
    self.draw(labels)

    # Return the visualizer itself so calls can be chained
    return self
def run_clutering(n_sites, order_dict, sim_mat):
    """Spectrally cluster sites into six groups and dump scores and labels.

    Args:
        n_sites: unused; kept for interface compatibility.
        order_dict: maps each site id to its row index in ``sim_mat``.
        sim_mat: precomputed affinity matrix between sites.

    Writes two files in the working directory:
    ``clustering_sil6`` (mean silhouette, then ``<site> <silhouette>`` lines)
    and ``clustering_labels6`` (``<site> <label>`` lines).
    """
    n_clusters = 6
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack',
                                          affinity='precomputed')
    labels = spectral.fit_predict(sim_mat)

    # NOTE(review): the silhouette is computed with sim_mat as a feature
    # matrix, not as a precomputed distance -- confirm this is intended.
    silhouette_avg = metrics.silhouette_score(sim_mat, labels)
    sample_silhouette_values = metrics.silhouette_samples(sim_mat, labels)

    # Context managers guarantee both files are flushed and closed.
    with open('clustering_sil' + str(n_clusters), 'w') as output_file:
        # NOTE(review): no newline after the header, so the first site line
        # continues on the same line; format kept unchanged for consumers.
        output_file.write(" ".join(["aver silhouette_score:", str(silhouette_avg)]))
        for siteid in order_dict:
            score = sample_silhouette_values[order_dict[siteid]]
            output_file.write(' '.join([str(siteid), str(score)]) + '\n')

    with open('clustering_labels' + str(n_clusters), 'w') as output_file1:
        for siteid in order_dict:
            label = labels[order_dict[siteid]]
            output_file1.write(' '.join([str(siteid), str(label)]) + '\n')
def calculateNumberOfIdealClusters(maxAmount, corpus):
    """Pick the cluster count with the highest mean silhouette.

    Args:
        maxAmount: exclusive upper bound; counts ``2 .. maxAmount - 1`` are
            tried with Ward agglomerative clustering.
        corpus: feature matrix to cluster.

    Returns:
        The best cluster count, or 2 when no candidate scores above 0
        (including when the candidate range is empty).
    """
    print("Initializing silhouette analysis")
    silhouette_high = 0
    silhouette_high_n_clusters = 2

    for n_clusters in range(2, maxAmount):
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward",
                                          affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters. (The per-sample values were computed and thrown
        # away here before; that dead quadratic call has been dropped.)
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f"
              % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

    print("Highest score = %f for n_clusters = %d"
          % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
def get_silhouette(df):
    """Score how well genotype calls separate in (allele balance, copy number).

    Args:
        df: per-sample rows for one variant with at least the columns
            ``var_id, sample, svtype, AF, GT, CN, AB``. Rows whose ``AB`` is
            ``"."`` are dropped.

    Returns:
        A frame with those columns plus ``sil_gt_avg`` (mean silhouette,
        repeated on every row) and ``sil_gt`` (per-sample silhouette). When
        only one genotype is present both are set to 1.

    Distances are city-block distances on the standardised ``AB`` and ``CN``;
    labels are the genotypes coded ``0/0 -> 1, 0/1 -> 2, 1/1 -> 3``.
    """
    out_cols = ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB',
                'sil_gt_avg', 'sil_gt']

    df = df[(df.AB != ".")].copy()
    # Replace the columns outright so the numeric dtype actually sticks;
    # assigning through .loc[:, col] may keep the original object dtype.
    df['AB'] = pd.to_numeric(df['AB'])
    df['CN'] = pd.to_numeric(df['CN'])

    tp = df.iloc[0, :].loc['svtype']
    mn_CN, mn_AB = df[['CN', 'AB']].mean(skipna=True)
    sd_CN, sd_AB = df[['CN', 'AB']].std(skipna=True)

    # A single genotype cannot be scored; report a perfect silhouette.
    if df['GT'].unique().size == 1:
        df['sil_gt_avg'] = 1
        df['sil_gt'] = 1
        return df[out_cols]

    # Standardise the two dimensions; a near-constant one is left untouched.
    if sd_AB > 0.01:
        df['AB1'] = (df['AB'] - mn_AB) / sd_AB
    else:
        df['AB1'] = df['AB']
    # NOTE(review): DEL/DUP/MEI are standardised even when sd_CN is ~0,
    # which divides by (nearly) zero -- confirm this is intended.
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN > 0.01:
        df['CN1'] = (df['CN'] - mn_CN) / sd_CN
    else:
        df['CN1'] = df['CN']

    gt_code = {'0/0': 1, '0/1': 2, '1/1': 3}
    df['gtn'] = df['GT'].map(gt_code)

    dist_2d_sq = spatial.distance.squareform(
        spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))

    # The silhouette score is the mean of the per-sample values, so one
    # computation serves both columns.
    sil_samples = metrics.silhouette_samples(dist_2d_sq, df['gtn'].values,
                                             metric='precomputed')
    df['sil_gt_avg'] = sil_samples.mean()
    df['sil_gt'] = sil_samples
    return df[out_cols]
def find_clusters(df, k_vals=(4, 9, 16, 25), how='hierarchical'):
    '''Find clusters, and if method is k-means run silhouette analysis
    to determine the value of k.

    Args:
        df (data frame): A data frame with normalised expression data.
        k_vals (list or range): The range over which to test k.
        how ('hierarchical' or 'kmeans'): Clustering method.

    Returns:
        The 1-based cluster number of every row: the array returned by
        ``fcluster`` for 'hierarchical', a list for 'kmeans'.

    Raises:
        ValueError: if ``how`` is not a known method, or ``k_vals`` is empty
            for 'kmeans'.
    '''
    ## Don't run the silhouette analysis for hierarchical clustering,
    ## just calculate the clusters using estimate of k.
    if how == 'hierarchical':
        k = int(np.sqrt((len(df) / 2.0)))
        hc = hac.linkage(df, method='average')
        return hac.fcluster(hc, t=k, criterion='maxclust')

    if how != 'kmeans':
        raise ValueError(
            "how must be 'hierarchical' or 'kmeans', got %r" % (how,))

    ## If method is k-means, run silhouette analysis.
    ## Start below any reachable score so the first k is always accepted.
    best_combined_score = float('-inf')
    optimal_clusters = None

    ## Try values of k from range and keep track of optimal k according
    ## to silhouette score.
    for k in k_vals:
        km = KMeans(n_clusters=k, random_state=10)
        clusters = km.fit_predict(df)
        silhouette_avg = silhouette_score(df, clusters)
        sample_silhouette_values = silhouette_samples(df, clusters)

        above_mean = 0
        silhouette_sizes = []
        for i in range(k):
            ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
            silhouette_sizes.append(ith_cluster_silhouette_values.shape[0])
            if max(ith_cluster_silhouette_values) > silhouette_avg:
                above_mean += 1

        ## This combined score should pick the best value of k: it averages
        ## the mean silhouette, the share of clusters reaching above the
        ## mean, and a penalty for uneven cluster sizes.
        above_mean_score = float(above_mean) / k
        size_spread = np.std(silhouette_sizes)
        std_score = 1.0 / size_spread if size_spread > 1.0 else 1.0
        combined_score = (silhouette_avg + above_mean_score + std_score) / 3

        if combined_score > best_combined_score:
            best_combined_score = combined_score
            optimal_clusters = clusters

    if optimal_clusters is None:
        raise ValueError("k_vals must contain at least one value of k")

    ## KMeans labels start at 0; shift to 1-based like fcluster's.
    ## NOTE(review): assumes the +1 shift applied to the k-means branch only
    ## in the original layout -- confirm.
    return [cluster + 1 for cluster in optimal_clusters]
def test_gmm():
    """Print pyclust's silhouette results next to the reference functions.

    Uses the module-level ``X`` and ``ypred``. Prints, in order: the first
    element of pyclust's score, its first ten per-sample scores, then the
    ``silhouette_score`` and the first ten ``silhouette_samples`` values for
    the same data. Nothing is asserted; the output is for visual comparison.
    """
    validator = pyclust.validate.Silhouette()
    overall = validator.score(X, ypred, sample_size=None)
    print(overall[0])
    print(validator.sample_scores[:10])

    reference_score = silhouette_score(X, ypred, sample_size=None)
    print(reference_score)
    reference_samples = silhouette_samples(X, ypred)
    print(reference_samples[:10])
def compute_sil_score_vector(filelist):
    """Return per-sample silhouette vectors keyed by number of clusters.

    Each file is loaded with ``get_labels_features``; the key is the number
    of distinct labels in that file and the value is the silhouette of every
    sample. Files with the same number of clusters overwrite each other, so
    the last one wins.
    """
    silscore = {}
    for path in filelist:
        labels, features = get_labels_features(path)
        num_clusters = np.unique(labels).shape[0]
        silscore[num_clusters] = sklm.silhouette_samples(features, labels)
    return silscore
def silhouette_samples(clusters, word2vec_model):
    """Compute the silhouette of every word vector across word clusters.

    Args:
        clusters: indexable collection whose ``i``-th entry ends with the
            words of cluster ``i`` (only the last element is used).
        word2vec_model: model handed to ``get_words_matrix``.

    Returns:
        ``(labels, samples_score)``: the cluster index of every vector row
        and its silhouette value, in the same order.
    """
    rows = []
    row_labels = []
    for cluster_idx in range(len(clusters)):
        _, vectors = get_words_matrix(clusters[cluster_idx][-1], word2vec_model)
        n_vectors = len(vectors)
        rows.extend(list(vectors[row]) for row in range(n_vectors))
        row_labels.extend([cluster_idx] * n_vectors)

    matrix = np.array(rows)
    labels = np.array(row_labels)
    return labels, metrics.silhouette_samples(matrix, labels)
def grind_kmeans(fargs):
    """Run the k-means climates and attach silhouette scores to each result.

    Args:
        fargs: a 7-tuple ``(rss, ccs, mus, lv, installed_in, mdat, prefix)``:
            ``rss`` values used in the ``_rs_<rs>`` file names, ``ccs``
            cluster counts, ``mus`` a precomputed distance matrix, ``lv`` the
            columns clustered on, ``installed_in`` the argument handed to
            ``noaa_init``, ``mdat`` the data, and ``prefix`` the file-name
            prefix.

    For every ``rs`` the file ``climates/<prefix>_rs_<rs>.csv`` is read back,
    a ``sil<cc>`` column is added per cluster count, and the result is
    written to ``climates/<prefix>_sil_rs_<rs>.csv``.
    """
    rss, ccs, mus, lv, installed_in, mdat, prefix = fargs
    noaa_init(installed_in)
    produce_kmeans_climates(mdat, lv, ccs, rss, prefix)

    for rs in rss:
        stem = 'climates/' + prefix
        kf = pd.read_csv(noaafile(stem + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            # This silhouette computation is memory hungry, presumably because
            # each worker builds an entire new metric matrix.
            assignment = kf['vtx' + str(cc)].values
            kf['sil' + str(cc)] = silhouette_samples(mus, assignment,
                                                     metric='precomputed')
        kf.to_csv(noaafile(stem + '_sil_rs_' + str(rs) + '.csv'), index=False)
def plotter_3d(pit, day, clusters, files):
    """Load one clustering result from disk, print its silhouette, plot it.

    Args:
        pit: pit name (directory component).
        day: day name (directory component).
        clusters: number of clusters (part of the file name).
        files: number of files (part of the file name).

    The reduced data and the labels are unpickled from the fixed data
    directory. For fewer than 400 labels the per-sample silhouettes and their
    mean are printed; any failure there is reported and ignored so the plot
    is still drawn.
    """
    stem = ("/usr/local/bee/beemon/beeW/Chris/" + pit + "/" + day
            + "/clusterdata_" + str(clusters) + "_" + str(files))

    # NOTE: pickle executes code on load; only use on trusted files.
    # Context managers close the handles the original left open.
    with open(stem + "_reduced.p", 'rb') as handle:
        data = pickle.load(handle, encoding='bytes')
    with open(stem + ".pkl", 'rb') as handle:
        labels = pickle.load(handle, encoding='bytes')

    try:
        if len(labels[0]) < 400:
            silhouettes = silhouette_samples(data, labels[0])
            print(silhouettes)
            print(np.mean(silhouettes))
    except Exception:
        # Deliberately best-effort: scoring must not prevent plotting.
        print("Silhouette scoring cannot be done.")

    num1 = int(clusters)
    path1 = ("Pit:" + pit + " Day:" + day + " Clusters:" + str(clusters)
             + " Files:" + str(files))
    graph_3d(data, num1, path1, labels)
    plt.close()
def Silhouette(D, labels, k):
    """Draw a silhouette plot for a clustering given a distance matrix.

    Taken from SKlearn's plot kmeans example.

    Args:
        D: precomputed distance matrix.
        labels: cluster label of every sample; clusters ``0 .. k-1`` are drawn.
        k: number of clusters.
    """
    labels = np.asarray(labels)  # needed for the element-wise comparison below

    plt.ion()
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Leave a gap of 10 rows between the bands of consecutive clusters.
    ax1.set_ylim([0, len(D) + (k + 1) * 10])

    sample_silhouette_values = metrics.silhouette_samples(D, labels,
                                                          metric='precomputed')
    # The silhouette score is the mean of the per-sample values.
    silhouette_avg = sample_silhouette_values.mean()

    # cm.spectral was renamed to nipy_spectral and later removed; prefer the
    # new name and fall back for very old matplotlib.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    y_lower = 10
    for i in range(k):
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = colormap(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                          ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the band with its cluster number, at its middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Format the title as text; passing a tuple rendered its repr.
    plt.suptitle("Silhouette analysis with n_clusters = %s and average = %s"
                 % (k, silhouette_avg),
                 fontsize=14, fontweight='bold')
    plt.show()
def get_silhouette_scores(X, km, nc):
    """Return the mean and per-sample silhouette of a fitted clusterer.

    Args:
        X: samples that were clustered.
        km: fitted estimator exposing ``labels_``.
        nc: number of clusters; unused, kept for interface compatibility.

    Returns:
        ``(silhouette_avg, sample_silhouette_values)``.
    """
    cluster_labels = km.labels_

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # The silhouette score is the average over all samples; taking the mean
    # here avoids computing the pairwise distances a second time.
    silhouette_avg = sample_silhouette_values.mean()
    return silhouette_avg, sample_silhouette_values
def identify_accurate_number_of_clusters(self, model, compounds, max_range=3):
    """Choose the KMeans cluster count with the best mean silhouette.

    Args:
        model: feature matrix to cluster.
        compounds: unused; kept for interface compatibility.
        max_range: exclusive upper bound; counts ``2 .. max_range - 1`` are
            tried.

    Returns:
        ``(labels, n_clusters, sample_silhouettes)`` for the best count. The
        labels are those of the fit that was actually scored; the original
        re-fitted an unseeded KMeans, so the returned clustering could differ
        from the one its score was based on.

    Raises:
        ValueError: if ``max_range`` leaves no cluster count to try.
    """
    if max_range <= 2:
        raise ValueError("max_range must be greater than 2")

    best_score = None
    best_num = None
    best_assignment = None
    for n_cluster in range(2, max_range):
        assigned_cluster = cluster.KMeans(n_clusters=n_cluster,
                                          n_init=20).fit_predict(model)
        silhouette_avg = silhouette_score(model, assigned_cluster)
        # Strict comparison keeps the smallest count on ties, as before.
        if best_score is None or silhouette_avg > best_score:
            best_score = silhouette_avg
            best_num = n_cluster
            best_assignment = assigned_cluster

    final_sample_sil_vals = silhouette_samples(model, best_assignment)
    return best_assignment, best_num, final_sample_sil_vals
def kmeans(cluster_data):
    """Run randomly-initialised KMeans and report its quality.

    Args:
        cluster_data: feature matrix to cluster.

    Returns:
        ``(labels, score, sample_score, centers, sum_of_squares)``: the
        cluster label of every sample, the mean silhouette, the per-sample
        silhouettes, the cluster centres and the model's inertia.

    The number of clusters comes from the module-level ``n_clust``.
    """
    km = KMeans(n_clust, init='random')
    km.fit(cluster_data)

    labels = km.labels_
    # The silhouette score is the mean of the per-sample values, so the
    # pairwise distances are only computed once.
    sample_score = metrics.silhouette_samples(cluster_data, labels)
    score = sample_score.mean()

    return (labels, score, sample_score, km.cluster_centers_, km.inertia_)
def fit_plot_save(k, smoothExprs, day, probeID, geneSymbol, organ, strain, path):
    """
    Fit k-means, plot and save results

    Arguments
    =========
    k - no. of clusters
    smoothExprs - gene expression rows = genes, columns = day
    day - day
    probeID - probeID
    geneSymbol - geneSymbol
    organ, strain - concatenated to form the output file stem
    path - nested mapping; ``path['Clust'][...]`` gives the output folders

    Returns
    =========
    None - results are plotted and saved
    """
    model = KMeans(n_clusters=k)
    model.fit(smoothExprs)
    clustCentre = model.cluster_centers_
    assignments = model.labels_
    stem = organ + strain
    folders = path['Clust']

    # Plot results
    plot_silhouette(silhouette_samples(smoothExprs, assignments), assignments)
    clust.multi_plot(smoothExprs, clustCentre, day, assignments)

    # Hierarchical clustering of the centres (Ward + Euclidean)
    header = ["Cluster%i" % label for label in np.unique(assignments)]
    hclust = hc.linkage(clustCentre, method='ward', metric='euclidean')
    plt.figure()
    plt.title("Hclust() Ward + Euclidean")
    hc.dendrogram(hclust, color_threshold=0.0, labels=header)

    # Save model
    io.save_pickle(os.path.join(folders['Model'], stem + ".pickle"), model)

    # Save Gene/Probe List
    geneList = clust.get_gene_list(assignments, geneSymbol)
    probeList = clust.get_gene_list(assignments, probeID)
    io.write_list_to_csv(os.path.join(folders['GeneList'], stem + ".csv"),
                         header, geneList)
    io.write_list_to_csv(os.path.join(folders['ProbeList'], stem + ".csv"),
                         header, probeList)

    # Save Cluster "centres": first column is the cluster name, the rest are
    # the centre values per day.
    dataMatrix = np.hstack((np.array(header)[:, None], clustCentre))
    header = list(itertools.chain.from_iterable([["Cluster"], list(day)]))
    io.write_to_csv(os.path.join(folders['Centres'], stem + ".csv"),
                    header, dataMatrix)

    # Save Alternate plot
    hFig = clust.multi_plot(smoothExprs, clustCentre, day, assignments)
    io.save_pdf(os.path.join(folders['Plot'], stem + "2.pdf"), hFig)
def _internal(cluster_list, affinity_matrix, dist_matrix, idx, n_jobs, n, queue_y):
    """Process every ``n_jobs``-th cluster count, starting at ``idx``.

    For each handled position ``i`` a spectral clustering with
    ``cluster_list[i]`` clusters is fitted on the precomputed affinity
    matrix, its labels are saved to ``res_spectral_<k>_clust.csv`` and the
    mean silhouette on the precomputed distance matrix is stored in
    ``queue_y[i]``. ``sample_names`` is read from module scope.
    """
    for position in range(idx, n, n_jobs):
        n_clusters = cluster_list[position]
        sp = SpectralClustering(n_clusters=n_clusters,
                                affinity='precomputed',
                                norm_laplacian=True,
                                n_init=1000)
        sp.fit(affinity_matrix)

        out_name = "res_spectral_{:03d}_clust.csv".format(n_clusters)
        save_results_clusters(out_name, sample_names, sp.labels_)

        silhouette_list = silhouette_samples(dist_matrix, sp.labels_,
                                             metric="precomputed")
        queue_y[position] = np.mean(silhouette_list)
def main():
    """Cluster three synthetic blobs into two k-means clusters and show the
    silhouette plot.

    The data has three true centres but the model is fitted with two
    clusters. Each cluster's silhouette values are drawn as one band of
    horizontal bars, sorted ascending, with a dashed red line at the overall
    mean.
    """
    X, y = make_blobs(n_samples=150, n_features=2, centers=3,
                      cluster_std=0.5, shuffle=True, random_state=0)

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                tol=1e-04, random_state=0)
    y_km = km.fit_predict(X)

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

    lower = 0
    yticks = []
    for rank, label in enumerate(cluster_labels):
        members = np.sort(silhouette_vals[y_km == label])
        upper = lower + len(members)
        plt.barh(range(lower, upper), members, height=1.0,
                 edgecolor='none', color=cm.jet(rank / n_clusters))
        yticks.append((lower + upper) / 2)  # centre of the band
        lower = upper

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color='red', linestyle='--')
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette Coefficient')
    plt.show()
def eval_silhouette(self, verbose=True):
    """Evaluate each estimator via silhouette score.

    For every ``k`` in ``self.n_clusters`` the labels of
    ``self.estimators[k]`` are scored on ``self.X`` and a ``Munch`` with
    ``silhouette_avg`` and ``sample_silhouette_values`` is stored in
    ``self.silhouette_results[k]``. With ``verbose`` the average is printed.
    """
    for k in self.n_clusters:
        cluster_labels = self.estimators[k].labels_

        # Compute the silhouette scores for each sample once; the
        # silhouette score is their average, which gives a perspective into
        # the density and separation of the formed clusters.
        sample_values = silhouette_samples(self.X, cluster_labels)

        results = munch.Munch()
        results.silhouette_avg = sample_values.mean()
        if verbose:
            print("For n_clusters = {k} The average silhouette_score is : {avg_sil}".format(k=k, avg_sil=results.silhouette_avg))
        results.sample_silhouette_values = sample_values

        self.silhouette_results[k] = results
def silhouette_clusters(data, clusters):
    """Compute the mean silhouette score of every cluster.

    :param data: n*d where n is the number of observations and d the
        dimensions of each observation
    :param clusters: an array of length n with the cluster of each observation
    :return: array of shape ``(1, n_clusters)``; column ``k`` is the mean
        Euclidean silhouette of the ``k``-th distinct cluster value in sorted
        order
    """
    clusters = np.asarray(clusters)
    silhouette_samples_score = metrics.silhouette_samples(data, clusters,
                                                          metric='euclidean')
    values_possible = np.unique(clusters)

    silhouette_mean_clusters = np.zeros((1, len(values_possible)))
    for k, value in enumerate(values_possible):
        # Index the single row explicitly: [k] alone addressed rows and
        # raised IndexError for every cluster after the first.
        silhouette_mean_clusters[0, k] = np.mean(
            silhouette_samples_score[clusters == value])
    return silhouette_mean_clusters
def optimum_clusters(range_n_clusters, y):
    """Pick the cluster count whose KMeans clusters are most evenly sized.

    Args:
        range_n_clusters: candidate numbers of clusters.
        y: feature matrix to cluster.

    Returns:
        The candidate with the smallest sum of squared deviations of the
        cluster sizes from their mean (first one on ties).

    Raises:
        ValueError: if ``range_n_clusters`` is empty (raised by ``min``).
    """
    logger.info('Start deciding Optimum Number of Clusters')
    dic = {}
    for n_clusters in range_n_clusters:
        cluster = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = cluster.fit_predict(y)

        # Only the cluster sizes matter here, so count labels directly
        # instead of going through the quadratic silhouette computation.
        sizes = np.bincount(cluster_labels, minlength=n_clusters)

        # Spread of the sizes around their mean. The original indexed the
        # size list with the sizes themselves, which raised or picked the
        # wrong entries.
        dic[n_clusters] = float(np.sum((sizes - sizes.mean()) ** 2))

    min_d = min(dic, key=dic.get)
    logger.info('Optimum number of Clusters Decided')
    return min_d
def silhouette_analysis(self):
    """Build silhouette and scatter figures for KMeans with 2 to 9 clusters.

    Runs ``self.pc_analysis()`` first when ``self.pca_reduced`` is not yet
    available. For every cluster count one figure is created: the left axes
    show the per-cluster silhouette bands with the mean as a dashed red
    line, the right axes show the first two columns of ``self.pca_reduced``
    coloured by cluster with the numbered cluster centres on top. The
    figures are created but not shown or saved here.
    """
    # A filled NumPy array has no unambiguous truth value, so `not array`
    # raises; treat that case as "data already present".
    try:
        needs_reduction = not self.pca_reduced
    except ValueError:
        needs_reduction = False
    if needs_reduction:
        self.pc_analysis()

    # cm.spectral was renamed to nipy_spectral and later removed; prefer the
    # new name and fall back for very old matplotlib.
    colormap = cm.nipy_spectral if hasattr(cm, 'nipy_spectral') else cm.spectral

    range_n_clusters = range(2, 10)
    for n_clusters in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        ax1.set_xlim([-0.1, 1])
        # Leave a gap of 10 rows between the bands of consecutive clusters.
        ax1.set_ylim([0, len(self.pca_reduced) + (n_clusters + 1) * 10])

        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(self.pca_reduced)
        silhouette_avg = silhouette_score(self.pca_reduced, cluster_labels)
        print("For n_clusters =", n_clusters,
              "the average silhouette_score is :", silhouette_avg)
        sample_silhouette_values = silhouette_samples(self.pca_reduced,
                                                      cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = colormap(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                              ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # Label the band with its cluster number, at its middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        colors = colormap(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(self.pca_reduced[:, 0], self.pca_reduced[:, 1],
                    marker='.', s=30, lw=0, alpha=0.7, c=colors)

        # White discs with the cluster number drawn on top of each centre.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white",
                    alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for \nKMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')
def k_medoids_over_instances(self, dataset, cols, k, distance_metric, max_iters, n_inits=5, p=1):
    """Cluster the rows of ``dataset`` with k-medoids and annotate them.

    Args:
        dataset: data frame to cluster; it is modified in place.
        cols: columns of ``dataset`` used for clustering.
        k: number of clusters.
        distance_metric: ``'default'`` delegates to ``pyclust.KMedoids``;
            any other value is passed to
            ``self.compute_distance_matrix_instances`` and the medoids are
            searched here.
        max_iters: maximum number of update rounds per initialisation.
        n_inits: number of random initialisations (``n_trials`` for pyclust).
        p: stored on ``self.p`` in the non-default branch (presumably a
            distance parameter read by the distance computation -- confirm).

    Returns:
        ``dataset`` with two added columns: ``'cluster'`` (assignment) and
        ``'silhouette'`` (per-row silhouette on ``dataset[cols]``).

    NOTE(review): relies on ``DataFrame.ix`` and ``DataFrame.as_matrix``,
    which recent pandas releases no longer provide.
    """
    # If we set it to default we use the pyclust package...
    temp_dataset = dataset[cols]
    if distance_metric == 'default':
        km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
        km.fit(temp_dataset.as_matrix())
        cluster_assignment = km.labels_
    else:
        self.p = p
        cluster_assignment = []
        # -1 is the lowest possible silhouette, so any run beats it.
        best_silhouette = -1

        # Compute all distances
        D = self.compute_distance_matrix_instances(temp_dataset, distance_metric)

        for it in range(0, n_inits):
            # First select k random points as centers:
            centers = random.sample(range(0, len(dataset.index)), k)
            prev_centers = []
            points_to_cluster = []  # never read afterwards

            n_iter = 0
            # Iterate until the centers stop changing or max_iters is hit.
            while (n_iter < max_iters) and not (centers == prev_centers):
                n_iter += 1
                prev_centers = centers
                # Assign points to clusters.
                # (each row gets the column label of its closest center)
                points_to_centroid = D[centers].idxmin(axis=1)

                new_centers = []
                for i in range(0, k):
                    # And find the new center that minimized the sum of the differences.
                    best_center = D.ix[points_to_centroid == centers[i], points_to_centroid == centers[i]].sum().idxmin(axis=1)
                    new_centers.append(best_center)
                centers = new_centers

            # Convert centroids to cluster numbers:
            # (the cluster number is the position of the center in `centers`)
            points_to_centroid = D[centers].idxmin(axis=1)
            current_cluster_assignment = []
            for i in range(0, len(dataset.index)):
                current_cluster_assignment.append(
                    centers.index(points_to_centroid.ix[i, :]))

            # Keep the initialisation with the best mean silhouette.
            silhouette_avg = silhouette_score(
                temp_dataset, np.array(current_cluster_assignment))
            if silhouette_avg > best_silhouette:
                cluster_assignment = current_cluster_assignment
                best_silhouette = silhouette_avg

    # And add the clusters and silhouette scores to the dataset.
    dataset['cluster'] = cluster_assignment
    # The mean computed on the next line is not used afterwards.
    silhouette_avg = silhouette_score(temp_dataset, np.array(cluster_assignment))
    silhouette_per_inst = silhouette_samples(temp_dataset, np.array(cluster_assignment))
    dataset['silhouette'] = silhouette_per_inst
    return dataset
def kmeans_base_clustering(corr: Union[np.ndarray, pd.DataFrame],
                           names_features: list = None,
                           max_num_clusters: int = 10,
                           **kwargs: Any) -> (pd.DataFrame, dict, pd.Series):
    """
    Perform base clustering with Kmeans.

    Arguments
    ---------
    corr: numpy.array or pd.DataFrame
        Correlation matrix.
    names_features : list of str
        List of names for features.
    max_num_clusters: int
        Maximum number of clusters; every count from 2 up to this value is
        tried.
    **kwargs
        Arbitrary keyword arguments for sklearn.cluster.KMeans().

    Returns
    -------
    pd.DataFrame
        Clustered correlation matrix (rows and columns sorted by label).
    dictionary
        List of clusters and their content.
    pd.Series
        Silhouette scores.

    Raises
    ------
    ValueError
        If max_num_clusters is smaller than 2.

    Notes
    -----
    Function adapted from "Machine Learning for Asset Managers",
    Marcos López de Prado (2020).

    To learn more about sklearn.cluster.KMeans():
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
    """
    if max_num_clusters < 2:
        raise ValueError("max_num_clusters must be at least 2.")

    # Initializations
    corr = pd.DataFrame(data=corr, index=names_features, columns=names_features)

    # Define the observations matrix
    Xobs = (((1 - corr.fillna(0)) / 2.)**.5).values

    # Modify it to get an Euclidean distance matrix: entry (i, j) is the
    # distance between rows i and j of Xobs, computed one row at a time.
    X = np.zeros(shape=Xobs.shape)
    for i in range(Xobs.shape[0]):
        X[i, :] = np.sqrt(((Xobs - Xobs[i, :])**2).sum(axis=1))
    X = pd.DataFrame(data=X, index=names_features, columns=names_features)

    # Loop to generate different numbers of clusters
    best_quality = np.nan
    silh_score = None
    kmeans = None
    for n_clusters in range(2, max_num_clusters + 1):
        # Define model and fit
        kmeans_current = cluster.KMeans(n_clusters=n_clusters, **kwargs).fit(X)
        # Compute silhouette score
        silh_current = silhouette_samples(X, kmeans_current.labels_)
        # Compute clustering quality q (t-statistic of silhouette score)
        quality_current = silh_current.mean() / silh_current.std()
        # Keep best quality scores and clustering; a NaN best (nothing kept
        # yet, or an undefined quality) is always replaced.
        if np.isnan(best_quality) or (quality_current > best_quality):
            best_quality = quality_current
            silh_score = silh_current
            kmeans = kmeans_current

    # Extract index according to sorted labels
    new_idx = np.argsort(kmeans.labels_)

    # Reorder rows and columns
    clustered_corr = corr.iloc[new_idx]
    clustered_corr = clustered_corr.iloc[:, new_idx]

    # Form clusters. The labels are in the ORIGINAL feature order, so member
    # names must be looked up in the original columns, not the reordered ones.
    clusters = {
        label: corr.columns[np.where(kmeans.labels_ == label)[0]].tolist()
        for label in np.unique(kmeans.labels_)
    }

    # Define a series with the silhouette score
    silh_score = pd.Series(silh_score, index=X.index)

    return clustered_corr, clusters, silh_score
def _plot_cluster_diagnostics(n_clust_values, save_labels, save_quality):
    """Plot the quality curve and the stacked cluster-composition bars."""
    # Plot qualities
    plt.xticks(ticks=n_clust_values)
    plt.plot(n_clust_values, [save_quality[k] for k in n_clust_values])
    # Make it cute
    plt.title("Normalized Silhouette Score")
    plt.xlabel("Number of clusters")
    plt.ylabel("Score")

    # Make bars containing the clusters composition: one row per tested
    # number of clusters, one column per possible label.
    n_clust_max = max(n_clust_values)
    bars = np.zeros(shape=(len(n_clust_values), n_clust_max))
    for row, k in enumerate(n_clust_values):
        labels, counts = np.unique(save_labels[k], return_counts=True)
        for label, count in zip(labels, counts):
            if 0 <= label < n_clust_max:
                bars[row, label] = count

    # Plot clusters compositions with bar plot
    plt.figure(figsize=(10, 5))
    # An ndarray is required here: "+=" on a list would extend it with the
    # bar heights instead of adding them element-wise.
    bottoms = np.zeros(len(n_clust_values))
    for j in range(n_clust_max):
        plt.bar(n_clust_values, bars[:, j], width=0.8, bottom=bottoms)
        bottoms = bottoms + bars[:, j]
    # Make it cute
    plt.xticks(ticks=n_clust_values)
    plt.title("Composition of clusters")
    plt.xlabel("Number of clusters")
    plt.ylabel("Composition")


def cluster_observation_matrix(X: pd.DataFrame,
                               n_clust_range: range,
                               model: cluster,
                               verbose: bool = True,
                               **kwargs: Any) -> (dict, dict):
    """
    Apply clustering for an arbitrary model as long as the model has an
    argument 'n_clusters'.

    Parameters
    ----------
    X : pd.DataFrame
        The Observation matrix on which the clustering is based.
    n_clust_range: range
        Range of integer values for the number of clusters to be tested.
        Every value must be >= 2.
    model : sklearn.cluster
        The clustering model to be used from sklearn. It is called as
        ``model(n_clusters=k, **kwargs)`` and must expose ``labels_``
        after ``fit``.
    verbose : bool
        Verbose option. When True, the quality curve and the composition
        of the clusters are plotted with matplotlib.
    **kwargs :
        Arguments for the clustering model.

    Returns
    -------
    dict
        Labels corresponding to the different clusters, keyed by the
        number of clusters.
    dict
        Quality (mean / standard deviation of the silhouette samples)
        corresponding to the different clusters, keyed the same way.

    Raises
    ------
    AssertionError
        If ``n_clust_range`` is empty or contains a value below 2.

    Notes
    -----
    To learn more about sklearn.cluster:
    https://scikit-learn.org/stable/modules/classes.html?highlight=cluster#module-sklearn.cluster
    """
    # Checks
    n_clust_values = list(n_clust_range)
    if not n_clust_values or min(n_clust_values) < 2:
        raise AssertionError(
            "Argument n_clust_range must have values starting at values >= 2.")

    # Initialization
    save_labels = {}
    save_quality = {}

    # Looping
    for k in n_clust_values:
        # Build clusters
        fitted_model = model(n_clusters=k, **kwargs).fit(X)
        save_labels[k] = fitted_model.labels_.tolist()

        # Compute scores
        silh = silhouette_samples(X, fitted_model.labels_)
        save_quality[k] = silh.mean() / silh.std()

    # The bar data only feeds the plots, so it is built only when plotting.
    if verbose:
        _plot_cluster_diagnostics(n_clust_values, save_labels, save_quality)

    # Return labels
    return save_labels, save_quality
def sample_silhouette_score(self, x, cluster_clients):
    """Compute per-sample silhouette values and keep them on the instance.

    The result of ``silhouette_samples(x, cluster_clients)`` is stored in
    ``self.sample_silh_score``; nothing is returned.
    """
    per_sample_scores = silhouette_samples(x, cluster_clients)
    self.sample_silh_score = per_sample_scores
def _silhouette_summary(vectors, labels, metric):
    """Return (max, min, mean) of the per-sample silhouette values."""
    silhouette_s = metrics.silhouette_samples(vectors, labels, metric=metric)
    return (silhouette_s.max(), silhouette_s.min(),
            float(silhouette_s.mean()))


def _fit_nltk_kmeans(vectors, num_clusters, rng):
    """Cluster with NLTK k-means (cosine); return (labels, inertia)."""
    kclusterer = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        rng=rng,
        avoid_empty_clusters=True)
    labels = kclusterer.cluster(vectors, assign_clusters=True)
    # One centroid row per sample, for the elbow method.
    means = kclusterer.means()
    centroid_array = np.vstack([means[label] for label in labels])
    return labels, nltk_inertia(vectors, centroid_array)


def _fit_sklearn_kmeans(vectors, num_clusters):
    """Cluster with scikit-learn k-means; return (labels, inertia)."""
    kmeans = cluster.KMeans(n_clusters=num_clusters,
                            n_init=25,
                            random_state=42)
    kmeans.fit(vectors)
    return kmeans.labels_, kmeans.inertia_


def _fit_agglomerative(vectors, num_clusters):
    """Cluster with agglomerative clustering; return (labels, None)."""
    agglo = cluster.AgglomerativeClustering(n_clusters=num_clusters)
    agglo.fit(vectors)
    return agglo.labels_, None


def SilhouetteScores(vectors, mode, cluster_range, view, output):
    """
    Compute silhouette and elbow method (nltk/sklearn) for range of clusters

    param1 (vectors): array of vectors (3 dimensions)
    param2 (mode): 'nltk' / 'sklearn' / 'agglomerative' (case-insensitive)
    param3 (cluster_range): list/range of number of clusters
    param4 (view): forwarded unchanged to scatterplot3D
    param5 (output): output path of figure (needs {} to format number of
        clusters)

    output: tuple of two dicts with elbow and silhouette scores
        (nltk/sklearn), dict with silhouette scores (agglomerative).
        Silhouette entries are (max, min, mean) tuples and exist only for
        cluster counts greater than 1.

    raises: ValueError if mode is not one of the three supported values.
    """
    mode = mode.lower()

    if mode == "nltk":
        # One seeded generator shared by every cluster count.
        rng = random.Random()
        rng.seed(123)

        def fit(num_clusters):
            return _fit_nltk_kmeans(vectors, num_clusters, rng)

        metric = 'cosine'
    elif mode == "sklearn":

        def fit(num_clusters):
            return _fit_sklearn_kmeans(vectors, num_clusters)

        metric = 'euclidean'
    elif mode == "agglomerative":

        def fit(num_clusters):
            return _fit_agglomerative(vectors, num_clusters)

        metric = 'euclidean'
    else:
        raise ValueError(
            "mode must be 'nltk', 'sklearn' or 'agglomerative', got %r"
            % mode)

    wcss = {}  # for elbow method
    s_scores = {}  # for silhouette scores
    for NUM_CLUSTERS in tqdm(cluster_range):
        labels, inertia = fit(NUM_CLUSTERS)

        # elbow method (not available for agglomerative clustering)
        if inertia is not None:
            wcss[NUM_CLUSTERS] = inertia

        # silhouette scores are undefined for a single cluster
        if 1 < NUM_CLUSTERS:
            s_scores[NUM_CLUSTERS] = _silhouette_summary(
                vectors, labels, metric)

        # plotting
        scatterplot3D(vectors,
                      color=labels,
                      view=view,
                      output=output.format(NUM_CLUSTERS))

    if mode == "agglomerative":
        return s_scores
    return (wcss, s_scores)
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(X) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(X, cluster_labels) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper),
def calculateSilhouetteScore(self, dataFile):
    """Calculate the silhouette score for different numbers of clusters.

    For each cluster count in 2..6 a KMeans model is fitted, the average
    silhouette score is printed, and a two-panel figure (silhouette plot
    and scatter of the first two features) is saved as
    ``cluster_<n>.png`` in the current working directory.

    :param self: An instance of the class SilhouetteScore.
    :param dataFile: An array with the input data points.
    :return: A list with the names of the image files created.
    """
    instanceKmeans = KmeansRunner()
    X = instanceKmeans.retrieveData(dataFile)

    # Large inputs are reduced to a random 0.1% sample (drawn with
    # replacement).
    if X.shape[0] > 10000:
        size = round(X.shape[0] * 0.001)
        idx = np.random.randint(X.shape[0], size=size)
        X = X[idx, :]

    range_n_clusters = [2, 3, 4, 5, 6]
    list_images = []

    for n_clusters in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        ax1.set_xlim([-0.1, 1])
        # (n_clusters + 1) * 10 leaves blank space between the bands.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(np.array(X))

        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # cm.spectral was removed from matplotlib; nipy_spectral is
            # the same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7,
                    c=colors)

        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker="o", c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for k-means "
                      "clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight="bold")

        image_name = "cluster_" + str(n_clusters) + ".png"
        fig.savefig(image_name)
        # Release the figure once written; otherwise every call leaves
        # five open figures behind.
        plt.close(fig)
        list_images.append(image_name)

    return list_images
def silhouette(dataset, n):
    """Show KMeans silhouette diagnostics for a range of cluster counts.

    Cluster counts 2..7 are tried when ``n < 5``, otherwise 2..10. For
    each count a figure with two panels is built: the per-cluster
    silhouette bands on the left and a scatter of the first two columns
    of ``dataset`` (with the cluster centres) on the right. The average
    silhouette score is printed for every count.
    """
    largest_k = 7 if n < 5 else 10
    candidate_counts = list(range(2, largest_k + 1))

    for n_clusters in candidate_counts:
        # One row, two columns: silhouette bands and cluster scatter.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Fit with a fixed seed (10) so runs are reproducible.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)

        # Mean silhouette over all samples, then the per-sample values.
        silhouette_avg = silhouette_score(dataset, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        sample_silhouette_values = silhouette_samples(dataset, cluster_labels)

        # Coefficients can span [-1, 1]; the view is limited to [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # Ten blank rows separate consecutive bands.
        ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

        band_start = 10
        for cluster_id in range(n_clusters):
            # Sorted silhouette values of the samples in this cluster.
            band = np.sort(
                sample_silhouette_values[cluster_labels == cluster_id])
            band_size = band.shape[0]
            band_end = band_start + band_size

            shade = cm.nipy_spectral(float(cluster_id) / n_clusters)
            ax1.fill_betweenx(np.arange(band_start, band_end),
                              0, band,
                              facecolor=shade, edgecolor=shade, alpha=0.7)

            # Cluster number written at the middle of its band.
            ax1.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))

            # Next band starts after a gap of ten rows.
            band_start = band_end + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Dashed red line at the average silhouette score.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # no y tick labels
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Right panel: the samples coloured by assigned cluster.
        point_colors = cm.nipy_spectral(
            cluster_labels.astype(float) / n_clusters)
        ax2.scatter(dataset[:, 0], dataset[:, 1], marker='.', s=30, lw=0,
                    alpha=0.7, c=point_colors, edgecolor='k')

        # White discs at the cluster centres, numbered on top.
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')
        for center_id, center in enumerate(centers):
            ax2.scatter(center[0], center[1], marker='$%d$' % center_id,
                        alpha=1, s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

    plt.show()
# plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(peopleMatrixPcaTransform) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(peopleMatrixPcaTransform) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = metrics.silhouette_score(peopleMatrixPcaTransform, cluster_labels) # Compute the silhouette scores for each sample sample_silhouette_values = metrics.silhouette_samples( peopleMatrixPcaTransform, cluster_labels) # The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. # Scores around zero indicate overlapping clusters. # The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster. print( "\n\n\nFor n_clusters =", n_clusters, "\n\nThe average silhouette_score is :", silhouette_avg, "\n\n* The silhouette score is bounded between -1 for incorrect clustering and +1 for highly dense clustering.", "\n* Scores around zero indicate overlapping clusters.", "\n* The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster", "\n\nThe individual silhouette scores were :", sample_silhouette_values, "\n\nAnd their assigned clusters were :", cluster_labels, "\n\nWhich correspond to : 'Jane', 'Bob', 'Mary', 'Mike', 'Alice', 'Skip', 'Kira', 'Moe', 'Sara', and 'Tom'"
def __plot_clusters_onto_2D(self,
                            clusterer,
                            X,
                            dim_reduction_method,
                            perplexity,
                            plot=False):
    """
    Print the average silhouette score of a fitted clusterer and, when
    requested, plot the silhouette bands next to a 2D view of the data.

    For high dimensional data (more than two columns), KernelPCA or t-SNE
    is used to reduce the dimensionality before plotting on a 2D plane.

    Args:
        clusterer: Fitted clustering model. ``predict(X)`` is used when
            available, otherwise ``labels_``.
        X: Data the clusterer was fitted on.
        dim_reduction_method: DIM_REDUCTION_METHOD_KERNEL_PCA selects
            KernelPCA; any other value selects t-SNE.
        perplexity: The perplexity is related to the number of nearest
            neighbors that is used in other manifold learning algorithms.
            Larger datasets usually require a larger perplexity. Consider
            selecting a value between 5 and 50. The choice is not
            extremely critical since t-SNE is quite insensitive to this
            parameter.
        plot: Figures are only drawn when this is exactly ``True``.

    Returns:
        None. Returns early, after printing a message, when all samples
        share a single label.
    """
    if hasattr(clusterer, 'predict'):
        cluster_labels = clusterer.predict(X)
    else:
        cluster_labels = clusterer.labels_

    # The silhouette score is undefined for a single cluster.
    if len(set(cluster_labels)) == 1:
        print(
            "clustering failed. unable to group data into two or more clusters."
        )
        return

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    n_clusters = clusterUtilities.get_n_clusters(clusterer)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    if plot is not True:
        return

    # matplotlib.cm.get_cmap is no longer available in recent Matplotlib
    # releases; pyplot.get_cmap returns the same colormap.
    cmap = plt.get_cmap("CMRmap")

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cmap(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # map X (high dimensional) to 2D
    if X.shape[1] > 2:
        # reduce the dimensionality of input data to 2D
        if dim_reduction_method == DIM_REDUCTION_METHOD_KERNEL_PCA:
            kpca = KernelPCA(n_components=2,
                             kernel="rbf",
                             gamma=10,
                             random_state=0)
            X = kpca.fit_transform(X)
        else:
            tsne = manifold.TSNE(n_components=2,
                                 perplexity=perplexity,
                                 init='pca',
                                 random_state=0)
            X = tsne.fit_transform(X)

    # 2nd Plot showing the actual clusters formed
    colors = cmap(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0],
                X[:, 1],
                marker='.',
                s=100,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolor='k')

    ax2.set_title("The visualization of the clustered data({}).".format(
        dim_reduction_method))
    ax2.set_xlabel("reduced feature space of 1st dimension")
    ax2.set_ylabel("reduced feature space of 2nd dimension")

    plt.suptitle(("Silhouette analysis for clustering methods "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14,
                 fontweight='bold')

    plt.show()
def plot_silhouette(clf, X, title='Silhouette Analysis', metric='euclidean',
                    copy=True, ax=None, figsize=None, title_fontsize="large",
                    text_fontsize="medium"):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict``
            methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to
            "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when
            calculating distance between instances in a feature array.
            If metric is a string, it must be one of the options allowed by
            sklearn.metrics.pairwise.pairwise_distances. If X is
            the distance array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit`` is used on
            **clf** or on a copy of **clf**.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the learning curve. If None, the plot is drawn on a new set
            of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = np.asarray(clf.fit_predict(X))

    # Iterate over the labels actually produced rather than assuming they
    # are exactly 0..n_clusters-1, so every cluster gets its own band.
    unique_labels = np.unique(cluster_labels)
    n_clusters = len(unique_labels)

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                  metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlim([-0.1, 1])

    # Room for every sample plus a ten-row gap around each band.
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])

    ax.set_xlabel('Silhouette coefficient values', fontsize=text_fontsize)
    ax.set_ylabel('Cluster label', fontsize=text_fontsize)

    y_lower = 10

    for i, label in enumerate(unique_labels):
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == label])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed from matplotlib; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)

        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(label),
                fontsize=text_fontsize)

        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg, color="red", linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.tick_params(labelsize=text_fontsize)
    ax.legend(loc='best', fontsize=text_fontsize)

    return ax
def perform_silhouette_analysis(path, segment_lengths, range_nclusters,
                                plot_silhouette):
    """Run KMeans silhouette analysis on segment features per segment length.

    For every value in ``segment_lengths`` the file
    ``<path>/Data/length<L>/segment_features.csv`` is read, columns 4..11
    are standardised, and KMeans is fitted for every cluster count in
    ``range_nclusters``. The average silhouette score of each fit is
    printed. When ``plot_silhouette`` is truthy, one silhouette subplot
    per cluster count is stacked in a single figure, which is saved to
    ``figs/silhouette_length<L>.pdf`` and then shown.

    Example call:
        perform_silhouette_analysis(path="../../",
                                    segment_lengths=[100, 150, 200, 250, 300],
                                    range_nclusters=range(2, 11),
                                    plot_silhouette=False)
    """
    for iseg_length in segment_lengths:
        print("Segment Length = " + str(iseg_length))
        if plot_silhouette:
            fig = plt.figure(1)
            fig.set_size_inches(7, 18)

        df_features = pd.read_csv(
            os.path.join(
                path,
                "Data/length" + str(iseg_length) + "/segment_features.csv"))
        numpy_features = df_features.iloc[:, 4:12].values

        # Standardise the features before fitting kmeans
        features_scaled = preprocessing.scale(numpy_features)

        for plot_index, n_clusters in enumerate(range_nclusters, start=1):
            # perform kmeans
            clusterer = KMeans(n_clusters=n_clusters,
                               random_state=0,
                               max_iter=1000)
            cluster_labels = clusterer.fit_predict(features_scaled)

            # Calculate the average silhouette value for all segments and
            # print to screen
            silhouette_avg = silhouette_score(features_scaled, cluster_labels)
            print("n_clusters = " + str(n_clusters) +
                  " Avg silhouette_score = " + str(silhouette_avg))

            if not plot_silhouette:
                continue

            ax1 = plt.subplot(len(range_nclusters), 1, plot_index)

            # The silhouette coefficient can range from -1, 1 but in this
            # example all lie within [-0.1, 1]
            ax1.set_xlim([-0.1, 1])
            # The (n_clusters+1)*10 is for inserting blank space between
            # silhouette plots of individual clusters, to demarcate them
            # clearly.
            ax1.set_ylim([0, len(features_scaled) + (n_clusters + 1) * 10])

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(
                features_scaled, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):
                # Aggregate the silhouette scores for samples belonging to
                # cluster i, and sort them
                ith_cluster_silhouette_values = \
                    sample_silhouette_values[cluster_labels == i]
                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # cm.spectral was removed from matplotlib; nipy_spectral
                # is the same colormap under its current name.
                color = cm.nipy_spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                # Label the silhouette plots with their cluster numbers at
                # the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Compute the new y_lower for next plot
                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the
            # values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            plt.suptitle((
                "Silhouette analysis for KMeans clustering on sample data "
                "with n_clusters = %d" % n_clusters),
                         fontsize=14,
                         fontweight='bold')

        if plot_silhouette:
            # Save before showing: once show() returns the figure may
            # already be torn down, which would write an empty file.
            fig.savefig(
                os.path.join("figs/",
                             "silhouette_length" + str(iseg_length) + ".pdf"))
            plt.show()
            plt.close(fig)
def test_clustering3(_x, _y, _data, _xLab, _yLab, N_CLUSTERS, _latLon_params,
                     _basemp, **kwargs):
    """Fit the requested clustering models and collect silhouette metrics.

    ``_data`` is standardised with ``StandardScaler`` and every model
    named in ``kwargs['models']`` is fitted on it. Recognised names are
    'Ward', 'AgglomerativeClustering', 'MiniBatchKMeans',
    'SpectralClustering', 'DBSCAN' and 'AffinityPropagation'; other names
    are ignored. The remaining positional parameters are accepted but not
    used by this function.

    Returns a dict keyed by model name. Each value is a dict with keys
    'start' and 'end' (``time.time()`` around ``fit``), 'y_pred',
    'sil_score', 'sample_sil_vals', 'model' and 'N_CLUSTERS'.

    Raises KeyError when ``kwargs`` has no 'models' entry. The global
    NumPy random seed is reset to 0 on every call.
    """
    np.random.seed(0)

    print(_data[:10])
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(_data)

    # create clustering estimators
    print(kwargs['models'])
    alg_list = []
    connectivity = None
    for model in kwargs['models']:
        if model in ['Ward', 'AgglomerativeClustering'] \
                and connectivity is None:
            # connectivity matrix for structured Ward, built once and
            # shared by both hierarchical models
            connectivity = kneighbors_graph(X, n_neighbors=10)
            # make connectivity symmetric
            connectivity = 0.5 * (connectivity + connectivity.T)

        if model == 'Ward':
            ward = cluster.AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                                   linkage='ward',
                                                   connectivity=connectivity)
            alg_list.append(('Ward', ward))

        if model == 'AgglomerativeClustering':
            # NOTE(review): newer scikit-learn releases appear to rename
            # ``affinity`` to ``metric`` here — confirm against the
            # pinned version before changing it.
            average_linkage = cluster.AgglomerativeClustering(
                linkage="average",
                affinity="cityblock",
                n_clusters=N_CLUSTERS,
                connectivity=connectivity)
            alg_list.append(('AgglomerativeClustering', average_linkage))

        if model == 'MiniBatchKMeans':
            two_means = cluster.MiniBatchKMeans(n_clusters=N_CLUSTERS)
            alg_list.append(('MiniBatchKMeans', two_means))

        if model == 'SpectralClustering':
            spectral = cluster.SpectralClustering(n_clusters=N_CLUSTERS,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            alg_list.append(('SpectralClustering', spectral))

        if model == 'DBSCAN':
            dbscan = cluster.DBSCAN(eps=.2)
            alg_list.append(('DBSCAN', dbscan))

        if model == 'AffinityPropagation':
            affinity_propagation = cluster.AffinityPropagation(
                damping=.9, preference=-200)
            alg_list.append(('AffinityPropagation', affinity_propagation))

    print(alg_list)

    models = {}
    for name, algorithm in alg_list:
        models[name] = {}

        # predict cluster memberships
        models[name]['start'] = time.time()
        algorithm.fit(X)
        models[name]['end'] = time.time()

        if hasattr(algorithm, 'labels_'):
            # The np.int alias no longer exists in current NumPy; the
            # builtin int selects the same dtype.
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)
        models[name]['y_pred'] = y_pred

        models[name]['sil_score'] = metrics.silhouette_score(
            X, y_pred, metric='euclidean')
        models[name]['sample_sil_vals'] = silhouette_samples(X, y_pred)
        models[name]['model'] = algorithm
        models[name]['N_CLUSTERS'] = N_CLUSTERS

    return models
ax1.set_ylim([0, len(data_vectorized) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 1 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=1) cluster_labels = clusterer.fit_predict(data_vectorized) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(data_vectorized, cluster_labels) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(data_vectorized, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper),
def silhouette(name, n_clusters, x):
    """Plot k-means silhouette bands per cluster count and their averages.

    For every value in ``n_clusters`` a KMeans model (``random_state=0``)
    is fitted on ``x`` and its silhouette plot is shown. A bar chart of
    the average silhouette scores is shown afterwards, and the averages
    are printed prefixed with ``name``. Nothing is returned.
    """
    # Matplotlib renamed the bundled seaborn styles to "seaborn-v0_8-*";
    # use whichever name this installation provides.
    style_name = 'seaborn-v0_8-darkgrid'
    if style_name not in plot.style.available:
        style_name = 'seaborn-darkgrid'
    plot.style.use(style_name)

    averages = []

    for n_cluster in n_clusters:
        plot.title(
            f'Silhouette on the {name} dataset, using {n_cluster}-means')

        ax = plot.gca()
        ax.set_xlim([-0.1, 1])
        # (n_cluster + 1) * 10 leaves blank space between the bands.
        ax.set_ylim([0, len(x) + (n_cluster + 1) * 10])

        clusterer = KMeans(n_clusters=n_cluster, random_state=0)
        cluster_labels = clusterer.fit_predict(x)

        silhouette_avg = silhouette_score(x, cluster_labels)
        averages.append(silhouette_avg)

        sample_silhouette_values = silhouette_samples(x, cluster_labels)

        y_lower = 10
        for i in range(n_cluster):
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                             0, ith_cluster_silhouette_values,
                             facecolor=color, edgecolor=color, alpha=0.7)

            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster labels")

        ax.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax.set_yticks([])
        ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        plot.show()

    # Nothing was clustered: there is no summary chart to draw.
    if not averages:
        print(f'{name}: {averages}')
        return

    # Pad the y-range by 20% of the spread on both sides.
    lb = np.min(averages)
    ub = np.max(averages)
    amplitude = ub - lb
    lb -= 0.2 * amplitude
    ub += 0.2 * amplitude

    plot.title(f'Silhouette averages on the {name} dataset using k-means')
    plot.bar(n_clusters, averages)
    plot.xticks(n_clusters)
    plot.xlabel('Number of clusters')
    plot.ylabel('Silhouette averages')
    plot.ylim([lb, ub])
    plot.show()

    print(f'{name}: {averages}')
def K_Means_silhouette_analysis(X, y):
    """Plot silhouette bands and a 2-D cluster scatter for several k values.

    For each k in 3, 5, ..., 15 a ``KMeans`` model (``random_state=10``) is
    fitted on ``X``.  The left axes show the sorted per-sample silhouette
    values of every cluster plus the mean score as a dashed red line; the
    right axes show the first two columns of ``X`` coloured by cluster,
    with the numbered cluster centres on top.

    Parameters
    ----------
    X : array-like supporting ``X[:, 0]`` indexing
        Samples to cluster.
    y : any
        Not used by this function.
    """
    for num_cluster in (3, 5, 7, 9, 11, 13, 15):
        figure_to_show, (ax1, ax2) = plt.subplots(1, 2)
        figure_to_show.set_size_inches(20, 8)

        ax1.set_xlim([-0.1, 1])
        # Extra height leaves a 10-unit gap around each cluster band.
        ax1.set_ylim([0, len(X) + (num_cluster + 1) * 10])

        model = KMeans(n_clusters=num_cluster, random_state=10)
        assignments = model.fit_predict(X)

        mean_silhouette = silhouette_score(X, assignments)
        print("For n_clusters = ", num_cluster,
              "The average silhouette_score is :", mean_silhouette)

        per_sample = silhouette_samples(X, assignments)

        band_bottom = 10
        for cluster_id in range(num_cluster):
            band = per_sample[assignments == cluster_id]
            band.sort()
            band_height = band.shape[0]
            band_top = band_bottom + band_height

            shade = cm.nipy_spectral(float(cluster_id) / num_cluster)
            ax1.fill_betweenx(np.arange(band_bottom, band_top),
                              0,
                              band,
                              facecolor=shade,
                              edgecolor=shade,
                              alpha=0.7)
            # Cluster number, vertically centred on its band.
            ax1.text(-0.05, band_bottom + 0.5 * band_height, str(cluster_id))
            band_bottom = band_top + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.axvline(x=mean_silhouette, color="red", linestyle="--")
        ax1.set_yticks([])
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Right panel: samples in the plane of the first two features.
        point_colors = cm.nipy_spectral(
            assignments.astype(float) / num_cluster)
        ax2.scatter(X[:, 0], X[:, 1],
                    marker='.', s=30, lw=0, alpha=0.7,
                    c=point_colors, edgecolor='k')

        centers = model.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200, edgecolor='k')
        for idx, center in enumerate(centers):
            ax2.scatter(center[0], center[1],
                        marker='$%d$' % idx, alpha=1, s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data with num_cluster = %d"
             % num_cluster),
            fontsize=14,
            fontweight='bold')
        plt.show()
# Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=6, n_jobs=-1) cluster_labels = clusterer.fit_predict(sample_descriptors) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(sample_descriptors, cluster_labels, sample_size=100) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(sample_descriptors, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper),
import pandas as pd
from sklearn import cluster
from sklearn import metrics

votes = pd.read_csv("/home/algo/Downloads/congress.csv")

# Columns from index 3 onward are used as the feature matrix; the slice is
# taken once instead of on every call.
features = votes.iloc[:, 3:]

# scikit-learn renamed the misspelled ``calinski_harabaz_score`` to
# ``calinski_harabasz_score`` and later removed the old name; use whichever
# this installation provides.
try:
    calinski_harabasz = metrics.calinski_harabasz_score
except AttributeError:
    calinski_harabasz = metrics.calinski_harabaz_score

# Ward linkage only supports Euclidean distances, which is the estimator's
# default, so the distance argument (``affinity``, removed in newer
# scikit-learn releases) is simply left out.
cluster_model = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster_model.fit(features)
labels = cluster_model.labels_

silhouette_avg = metrics.silhouette_score(features, labels,
                                          metric='euclidean')
silhouette_samples = metrics.silhouette_samples(features, labels,
                                                metric='euclidean')
ch_score = calinski_harabasz(features, labels)

for n_clusters in range(2, 6):
    # AgglomerativeClustering is deterministic and accepts no
    # ``random_state``; passing one raises TypeError.
    cluster_model = cluster.AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = cluster_model.fit_predict(features)
    silhouette_avg = metrics.silhouette_score(features, cluster_labels,
                                              metric='euclidean')
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is:", silhouette_avg)
for i in range(100): print('Trial number ', i) start_time = time.time() for k in n_clusters: data = init_data.copy() # QUALITY CHECK DATA for j in range(1000): X = data.values kmeans = KMeans(n_clusters=k) cluster_labels = kmeans.fit_predict(X) sample_sil_coefficients = metrics.silhouette_samples( X, cluster_labels) data, count_negative = qualityCheck(data, sample_sil_coefficients) print('Number of data retained after quality check', len(data)) if count_negative == 0: X = data.values kmeans = KMeans(n_clusters=k) cluster_labels = kmeans.fit_predict(X) sil_score = metrics.silhouette_score(X, cluster_labels) ssd_center = kmeans.inertia_ scores[counter] = { 'trial': i + 1, 'cluster_number': k, 'silhouette_score': sil_score,
def vis(X, y, nameappendix, k):
    """Visualise a k-means clustering: silhouette bands and a 2-D scatter.

    ``X`` is rescaled to the range 0-100 with ``MinMaxScaler`` and clustered
    with ``KMeans`` (``random_state=10``).  The NMI score against ``y`` and
    the mean silhouette score are printed.  The left axes show the sorted
    silhouette values per cluster; the right axes show the scaled columns
    3 and 5 coloured by cluster, with the numbered cluster centres.

    Parameters
    ----------
    X : array-like
        Feature matrix; needs at least six columns because columns 3 and 5
        are plotted.
    y : array-like
        Reference labels, used only for the NMI score.
    nameappendix : any
        Not used by this function.
    k : int
        Number of clusters.
    """
    X = pd.DataFrame(MinMaxScaler(feature_range=[0,100]).fit(X).transform(X))

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(15, 6)
    ax1.set_xlim([-0.1, 1])
    # Extra height leaves a 10-unit gap around each cluster band.
    ax1.set_ylim([0, len(X) + (k + 1) * 10])

    print("Num of clusters: ", k)
    clusters = KMeans(n_clusters = k, random_state = 10).fit(X)
    labels = clusters.labels_
    print("NMI score: %.5f" % normalized_mutual_info_score(y, labels))

    silhouette_avg = sil_score(X, labels)
    print("Silhouette score: ", silhouette_avg)

    per_sample = silhouette_samples(X, labels)
    palette = plt.get_cmap('Spectral')

    band_bottom = 10
    for cluster_id in range(k):
        # Sorted silhouette values of the samples assigned to this cluster.
        band = np.sort(per_sample[labels == cluster_id])
        band_height = band.shape[0]
        band_top = band_bottom + band_height

        shade = palette(float(cluster_id) / k)
        ax1.fill_betweenx(np.arange(band_bottom, band_top),
                          0,
                          band,
                          facecolor=shade,
                          edgecolor=shade,
                          alpha=0.7)
        # Cluster number at the vertical middle of its band.
        ax1.text(-0.05, band_bottom + 0.5 * band_height, str(cluster_id))
        # Next band starts 10 units above this one.
        band_bottom = band_top + 10

    ax1.set_title("Silhouette Coefficients for Clusters.")
    ax1.set_xlabel("Silhouette Coefficient Values")
    ax1.set_ylabel("Cluster Labels")
    # Dashed line marks the mean silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: the clusters in the plane of columns 3 and 5.
    colors = palette(labels.astype(float) / k)
    scaled = X.values
    ax2.scatter(scaled[:, 3], scaled[:, 5],
                marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

    centers = clusters.cluster_centers_
    # White discs first, then the cluster number drawn on top of each.
    ax2.scatter(centers[:, 3], centers[:, 5],
                marker='o', c="white", alpha=1, s=200, edgecolor='k')
    for idx, center in enumerate(centers):
        ax2.scatter(center[3], center[5],
                    marker='$%d$' % idx, alpha=1, s=50, edgecolor='k')

    ax2.set_title("Clustering Visualization")
    ax2.set_xlabel("1st feature: Pressure X4")
    ax2.set_ylabel("2nd feature: Pressure X5")
    plt.suptitle("Analysis for KMeans for " + str(k) + " Clusters",
                 fontsize=14, fontweight='bold')
    plt.show()
def make_new_outputs(corr: Union[np.array, pd.DataFrame], clusters: dict,
                     clusters2: dict) -> (pd.DataFrame, dict, pd.Series):
    """
    Makes new outputs for kmeans_advanced_clustering() by recombining two
    sets of clusters together, recomputing their correlation matrix,
    distance matrix, kmeans labels and silhouette scores.

    Arguments
    ---------
    corr : numpy.array or pd.DataFrame
        Correlation matrix. The body relies on ``.loc`` and ``.fillna``,
        so in practice a label-indexed DataFrame is required.
    clusters : dict
        First set of clusters (key -> iterable of feature names).
    clusters2 : dict
        Second set of clusters (key -> iterable of feature names).

    Returns
    -------
    pd.DataFrame
        Clustered correlation matrix (rows/columns ordered cluster by
        cluster).
    dictionary
        List of clusters and their content, renumbered 0..n-1 with the
        clusters of ``clusters`` first.
    pd.Series
        Silhouette scores, indexed in the same order as the clustered
        correlation matrix.

    Notes
    -----
    Function adapted from "Machine Learning for Asset Managers",
    Marcos López de Prado (2020).
    """
    # Merge both cluster sets under consecutive integer keys.
    clusters_new = {}
    for cluster_set in (clusters, clusters2):
        for members in cluster_set.values():
            clusters_new[len(clusters_new)] = list(members)

    # Reorder the correlation matrix so that members of a cluster are adjacent.
    new_idx = [name for members in clusters_new.values() for name in members]
    corr_new = corr.loc[new_idx, new_idx]

    # Build the observation matrix from the REORDERED correlations. Using the
    # original ``corr`` here would leave the rows in the old order while the
    # labels below come from ``corr_new``, attaching every score to the wrong
    # feature whenever the order changed.
    x_obs = (((1 - corr_new.fillna(0)) / 2.)**.5).values

    # Pairwise Euclidean distances between observation rows, one vectorized
    # row at a time (keeps memory at O(n^2)).
    n_obs = x_obs.shape[0]
    distances = np.empty(shape=(n_obs, n_obs))
    for i in range(n_obs):
        distances[i] = np.sqrt(((x_obs - x_obs[i])**2).sum(axis=1))

    new_names_features = corr_new.columns.tolist()
    X = pd.DataFrame(data=distances,
                     index=new_names_features,
                     columns=new_names_features)

    # Label every feature with the key of the cluster it belongs to.
    kmeans_labels = np.zeros(len(X.columns))
    for label, members in clusters_new.items():
        idxs = [X.index.get_loc(k) for k in members]
        kmeans_labels[idxs] = label

    # Compute the silhouette scores
    silh_new = pd.Series(silhouette_samples(X, kmeans_labels), index=X.index)
    return corr_new, clusters_new, silh_new
print("Algorithm failed") ### ### PLOTTING SILOUHETTE ### classes_to_test = [2, 3, 4, 5, 6] for classNumber in classes_to_test: fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(18, 7) ax1.set_xlim([-0.1, 1]) ax1.set_ylim([0, len(dataset) + (classNumber + 1) * 10]) cluster_labels, centroids = clusterer.predictLabels(dataset, classNumber) silhouette_avg = silhouette_score(dataset, cluster_labels) sample_silhouette_values = silhouette_samples(dataset, cluster_labels) y_lower = 10 for i in range(classNumber): ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / classNumber) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color,
__print_pca_info__(pca) # compute variance per numbers of clusters cs = range(2, n_clusters + 1) scores = np.zeros((len(cs), 2)) silhouette = np.zeros((len(cs), 2)) for i, n in enumerate(cs): # cluster the samples k = KMeans(n_clusters=n) k.fit(X) # compute total variance and average silhouette score scores[i] = [n, k.inertia_] silhouette[i] = [n, silhouette_score(X, k.labels_)] silhouette_sample_values = silhouette_samples(X, k.labels_) silhouette_sample_values = \ [sorted(silhouette_sample_values[k.labels_ == c], reverse=True) for c in range(n)] # sample cluster data for plotting clusters = [X[k.labels_ == c] for c in range(n)] cluster_samples = [ samples[:, FEATURES][k.labels_ == c] for c in range(n) ] # create object summaries of the data cluster_assignments, sample_assignments = summarize(k, samples) # store a heat map of the distribution of the original samples into the clusters per object hm = sample_heat_map(sample_assignments, n) with open(
n_clusters=n_cl, memory='/home/winz3r/Documents/Data/') t0 = time.time() model.fit(X) tim_spec = time.time() - t0 hier_labels = model.labels_ label_name = dir_name + 'hierarchical_cluster_labels_K' + str( n_cl) + '_' + str(i) + '.csv' f_out = open(label_name, 'w') for w in hier_labels: f_out.write(str(w) + '\n') f_out.close() ##Silhouette Calculation sil_spec = (silhouette_samples(X, hier_labels)).mean(axis=0) ##SSE Calculation SSE_spec = 0 for k in range(n_cl): members = hier_labels == k centre = X[members, :].mean(axis=0) for x in X[members]: SSE_spec += np.dot(x - centre, (x - centre).T) tim.append(n_cl) SSE.append(n_cl) sil.append(n_cl) tim.append(tim_spec) sil.append(sil_spec) SSE.append(SSE_spec)
str(i - reduced_data.shape[0]), color=color, fontdict={ 'weight': 'bold', 'size': size }) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(reduced_data, assigned_cluster) print("For n_clusters =", clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(reduced_data, assigned_cluster) #ax1 = plt.subplot(111) #y_lower = 10 #for i in range(clusters): # # Aggregate the silhouette scores for samples belonging to # # cluster i, and sort them # ith_cluster_silhouette_values = \ # sample_silhouette_values[assigned_cluster == i] # ith_cluster_silhouette_values.sort() # size_cluster_i = ith_cluster_silhouette_values.shape[0] # y_upper = y_lower + size_cluster_i # color = cm.nipy_spectral(float(i) / clusters)
plt.ylabel("Distortion")
plt.show()

# Silhouette analysis: fit a 3-cluster k-means model and draw one group of
# horizontal bars per cluster from the per-sample silhouette coefficients.
km = KMeans(n_clusters=3,
            init="k-means++",
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
# Compute the silhouette coefficient of every sample.
silhouette_vals = silhouette_samples(X, y_km, metric="euclidean")
y_ax_lower, y_ax_upper = 0, 0  # y-axis extent covered by the bars
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]  # silhouette coefficients of one cluster
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(
        # Meant to span len(c_silhouette_vals) rows.
        # NOTE(review): that only holds if y_ax_lower is advanced after each
        # cluster; no such update is visible in this excerpt.
        range(y_ax_lower, y_ax_upper),
        c_silhouette_vals,
        height=1.0,
        edgecolor="none",
        color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)  # y position for this cluster's tick label
# Fit k-means on the full data set; n_clusters and X are defined outside
# this excerpt.
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
y_pred = cluster.labels_
# NOTE(review): fit_predict refits the already fitted model; `pre` is not
# used in the visible code.
pre = cluster.fit_predict(X)
# Second model trained on the first 200 rows only, then used to label all of X.
cluster_smallsub = KMeans(n_clusters=n_clusters, random_state=0).fit(X[:200])
y_pred_ = cluster_smallsub.predict(X)
centroid = cluster.cluster_centers_
inertia = cluster.inertia_
print("总距离", inertia)
# One colour per cluster; color[i] below fails for more than four clusters.
color = ["red", "pink", "orange", "gray"]
fig, ax1 = plt.subplots(1)
for i in range(n_clusters):
    ax1.scatter(X[y_pred == i, 0], X[y_pred == i, 1],
                marker='o',
                s=8,
                c=color[i]
                )
ax1.scatter(centroid[:, 0], centroid[:, 1],
            marker="x",
            s=15,
            c="black")
plt.show()
# Silhouette coefficient: evaluates the clustering result, the closer to 1
# the better (the original author notes that 4 clusters scored best).
n_clusters = 4
cluster_4 = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
n_clusters = 5
cluster_5 = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
# score3 uses y_pred, i.e. the labels of the first model fitted above.
score3 = silhouette_score(X, y_pred)
score4 = silhouette_score(X, cluster_4.labels_)
score5 = silhouette_score(X, cluster_5.labels_)
# The per-sample values are computed here but the result is discarded.
silhouette_samples(X, y_pred)
score_cal = calinski_harabasz_score(X, y_pred)  # this one runs much faster
# NOTE(review): this rebinds the name `time` to a formatted string, so any
# later `time()` call in the same scope will fail.
time = datetime.datetime.fromtimestamp(time()).strftime("%Y-%m-%d %H:%M:%S")
def plot_silhouettes(instance_matrix, origin_path, plot_title):
    """
    Plot a silhouette diagram and a cluster scatter for pickled k-means models.

    "translated" from
    https://towardsdatascience.com/k-means-clustering-algorithm-applications-evaluation-methods-and-drawbacks-aa03e644b48a

    Every second file (ordered by k) found in ``origin_path`` is unpickled
    and shown in its own two-panel figure.

    :param instance_matrix: 2-D array of samples; only the first
        ``len(labels)`` rows are used for each model, and columns 0 and 1
        are shown in the scatter plot.
    :param origin_path: directory holding the pickled models. The k of a
        file is parsed with ``name.split("_")[0].split("=")[1]``, so names
        presumably look like ``k=3_...``. Each unpickled object must
        expose ``cluster_mapping`` (one label per sample) and ``centroids``.
    :param plot_title: base name of an image that may sit in the same
        directory; ``plot_title + ".png"`` is skipped when listing files.
    :return: None
    """

    def k_of(filename):
        # Number of clusters encoded in the file name.
        return int(filename.split("_")[0].split("=")[1])

    ignored = {".DS_Store", plot_title + ".png"}
    files = [name for name in os.listdir(f"{origin_path}")
             if name not in ignored]
    sorted_files = sorted(files, key=k_of)

    not_odd_files = sorted_files[0::2]  # return just every 2nd item
    print(not_odd_files)

    for filename in not_odd_files:
        k = k_of(filename)
        file = origin_path + "/" + filename

        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Kmeans object.
        # NOTE: pickle executes arbitrary code on load; only point this at
        # files produced by a trusted run.
        with open(file, "rb") as handle:
            k_means = pickle.load(handle)

        # An array is needed so ``labels == cluster`` is an element-wise mask.
        labels = np.asarray(k_means.cluster_mapping)
        centroids = np.vstack(k_means.centroids)
        instance_matrix_less = instance_matrix[:len(labels)]

        # Get silhouette samples
        silhouette_vals = silhouette_samples(instance_matrix_less, labels)

        # Silhouette plot: stacked horizontal bars, one group per cluster.
        y_lower, y_upper = 0, 0
        for i, cluster in enumerate(np.unique(labels)):
            cluster_silhouette_vals = silhouette_vals[labels == cluster]
            cluster_silhouette_vals.sort()
            y_upper += len(cluster_silhouette_vals)
            ax1.barh(range(y_lower, y_upper),
                     cluster_silhouette_vals,
                     edgecolor='none',
                     height=1)
            ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
            y_lower += len(cluster_silhouette_vals)

        # Get the average silhouette score and plot it
        avg_score = np.mean(silhouette_vals)
        ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
        ax1.set_yticks([])
        ax1.set_xlim([-0.1, 1])
        ax1.set_xlabel('Silhouette coefficient values')
        ax1.set_ylabel('Cluster labels')
        ax1.set_title('Silhouette plot for the various clusters', y=1.02)

        # Scatter plot of data colored with labels. Plot the same rows that
        # were scored so that x, y and c always have matching lengths.
        ax2.scatter(instance_matrix_less[:, 0], instance_matrix_less[:, 1],
                    c=labels)
        ax2.scatter(centroids[:, 0], centroids[:, 1],
                    marker='*', c='r', s=250)
        ax2.set_xlim([-2, 2])
        # The original set the x limits twice and never the y limits.
        ax2.set_ylim([-2, 2])
        ax2.set_xlabel('Eruption time in mins')
        ax2.set_ylabel('Waiting time to next eruption')
        ax2.set_title('Visualization of clustered data', y=1.02)
        ax2.set_aspect('equal')
        plt.tight_layout()
        plt.suptitle(f'Silhouette analysis using k = {k}',
                     fontsize=16,
                     fontweight='semibold',
                     y=1.05)
        plt.show()
# elbow method to reduce distortion distortions = [] for i in range(1, 11): km = KMeans(n_clusters=i, n_init=10, max_iter=300, random_state=0) km.fit(x) distortions.append(km.inertia_) plt.plot(range(1, 11), distortions, marker='o') plt.xlabel('Number of cluster') plt.ylabel('Distortion') plt.show() # quantifying qual by silhouette plots cluster_labels = np.unique(y_km) n_clusters = cluster_labels.shape[0] silhouette_vals = silhouette_samples(x, y_km, metric='euclidean') y_ax_lower, y_ax_upper = 0, 0 yticks = [] for i, c in enumerate(cluster_labels): c_silhouette_vals = silhouette_vals[y_km == c] c_silhouette_vals.sort() y_ax_upper += len(c_silhouette_vals) color = cm.jet(i / n_clusters) plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color) yticks.append((y_ax_lower + y_ax_upper) / 2) y_ax_lower += len(c_silhouette_vals)
def get_silhouette_graph(self, document_list, df_result, num_clusters):
    """Build and display a two-panel Plotly figure for a k-means clustering.

    The documents are turned into a matrix by ``self.__get_tfidf_matrix``
    and clustered with ``KMeans`` (``random_state=10``).  The left panel
    shows the sorted per-sample silhouette values of each cluster plus a
    dashed red line at the mean silhouette score; the right panel plots
    ``df_result['x']`` against ``df_result['y']`` coloured by cluster.

    Parameters
    ----------
    document_list :
        Passed unchanged to ``self.__get_tfidf_matrix``.
    df_result :
        Object indexable by ``'x'`` and ``'y'``; presumably one row per
        document in the same order as ``document_list`` - verify against
        the caller.
    num_clusters : int
        Number of k-means clusters.

    Returns
    -------
    Whatever ``iplot`` returns for the assembled figure.
    """
    X = self.__get_tfidf_matrix(document_list)

    color_list = [
        '#1f77b4',  # muted blue
        '#ff7f0e',  # safety orange
        '#2ca02c',  # cooked asparagus green
        '#d62728',  # brick red
        '#9467bd',  # muted purple
        '#8c564b',  # chestnut brown
        '#e377c2',  # raspberry yogurt pink
        '#7f7f7f',  # middle gray
        '#bcbd22',  # curry yellow-green
        '#17becf'  # blue-teal
    ]
    cmap = cm.get_cmap("Spectral")

    # len() is ambiguous for scipy sparse matrices, so prefer the row count
    # from .shape when the matrix has one.
    n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
    # Total height: all samples plus a 10-unit gap around every cluster band.
    y_max = n_samples + (num_clusters + 1) * 10

    fig = tools.make_subplots(rows=1,
                              cols=2,
                              print_grid=False,
                              subplot_titles=('Silhouette Graph',
                                              'Clutering Graph'))

    # Initialize Silhouette Graph
    fig['layout']['xaxis1'].update(title='Silhouette Coefficient',
                                   range=[-0.1, 1])
    fig['layout']['yaxis1'].update(title='Cluster Label',
                                   showticklabels=False,
                                   range=[0, y_max])

    # Compute K-Means Cluster
    clusterer = KMeans(n_clusters=num_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Compute Average Silhouette Score
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", num_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the Silhouette Scores for Each Sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(num_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[
            cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # One colour per cluster as a hex string. The original passed the
        # whole per-sample RGBA array as a single line colour.
        band_color = matplotlib.colors.rgb2hex(
            cmap(float(i) / num_clusters))
        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5, color=band_color),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)
        y_lower = y_upper + 10  # 10 for the 0 samples

    # Vertical line for the average silhouette score. It needs two points
    # with the same x; a single x against two y values draws no line.
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                           y=[0, y_max],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash', width=1))
    fig.append_trace(axis_line, 1, 1)

    # Cluster Graph. The palette is cycled so that more than
    # len(color_list) clusters does not raise IndexError.
    clusters = go.Scatter(x=df_result['x'],
                          y=df_result['y'],
                          showlegend=False,
                          mode='markers',
                          text=cluster_labels,
                          marker=dict(color=[
                              color_list[cluster_label % len(color_list)]
                              for cluster_label in cluster_labels
                          ],
                                      size=10))
    fig.append_trace(clusters, 1, 2)

    fig['layout']['xaxis2'].update(
        title='Feature space for the 1st feature', zeroline=False)
    fig['layout']['yaxis2'].update(
        title='Feature space for the 2nd feature', zeroline=False)
    fig['layout'].update(
        title="Silhouette Analysis for KMeans Clustering - " +
        str(num_clusters) + " Cluster")

    return iplot(fig, filename='basic-line')