def compute_silhouette_score(clusters): """ Compute the euclidean silhouette score and the cosine silhouette score. Return the scores. :param clusters: clusters assignment for each tweet :type clusters: list :return: the silhouette scores :rtype: tuple """ # Load the files tfidf_matrix = pickle.load(open('TF-IDF Matrix - ' + str(n_data) + ' Tweets.p', 'rb')) # Compute the Silhouette Score start = timer() distance = 1 - cosine_similarity(tfidf_matrix) euclidean_silhouette_score = silhouette_score(tfidf_matrix, clusters, metric='euclidean') cosine_silhouette_score = silhouette_score(distance, clusters, metric='precomputed') end = timer() print('Silhouette Score (Euclidean): %.4f' % euclidean_silhouette_score) print('Silhouette Score (Cosine): %.4f' % cosine_silhouette_score) print('Obtained the Silhouette Score in %.2f seconds' % (end - start)) txt_file.write('Silhouette Score (Euclidean): %.4f. \n' % euclidean_silhouette_score) txt_file.write('Silhouette Score (Cosine): %.4f. \n' % cosine_silhouette_score) txt_file.write('Obtained the Silhouette Score in %.2f seconds. \n' % (end - start)) return euclidean_silhouette_score, cosine_silhouette_score
def test_silhouette(self):
    n1 = np.array([[1, 2, 1], [1, 3, 1], [7, 8, 2], [7, 9, 2], [13, 19, 3]])
    print(Silhouette.score(n1))
    print(silhouette_score(n1, n1[:, -1]))
    n2 = np.array([[1, 2, 1], [1, 3, 2], [7, 8, 2], [7, 9, 1], [13, 19, 3]])
    print(Silhouette.score(n2))
    print(silhouette_score(n2, n2[:, -1]))
def get_constant_height_labels(clustering, n_clusters=None):
    """
    Use silhouette analysis to select the best height at which to cut a linkage matrix.

    :clustering: dict holding the linkage matrix ('linkage'), the precomputed
        distance matrix ('distance_df'), the column ordering ('reorder_vec')
        and reference labels ('labels')
    :n_clusters: int (optional). If defined, cut the dendrogram into exactly
        this many clusters instead of searching over possible cuts
    """
    N_variables = len(clustering['reorder_vec'])
    scores = []
    if n_clusters is None:
        for k_clusters in range(2, N_variables // 3):
            labels = cut_tree(clustering['linkage'], n_clusters=k_clusters)
            try:
                score = silhouette_score(clustering['distance_df'], labels.ravel(),
                                         metric='precomputed')
            except ValueError:
                continue
            scores.append((k_clusters, score))
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(clustering['linkage'], n_clusters=best_k)
    else:
        labels = cut_tree(clustering['linkage'], n_clusters=n_clusters)
        score = silhouette_score(clustering['distance_df'], labels.ravel(),
                                 metric='precomputed')
        scores.append((n_clusters, score))
    labels = reorder_labels(labels.flatten(), clustering['linkage'])
    # comparison against the reference labels
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
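# --- Input sketch (illustrative) ---
# A hypothetical way to assemble the `clustering` dict the helper above expects.
# The key names are taken from the function body; the correlation matrix and
# reference labels below are made-up placeholders, and `reorder_labels`,
# `cut_tree` and `silhouette_score` still have to come from the original module.
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

rng = np.random.RandomState(0)
corr_df = pd.DataFrame(rng.rand(30, 12)).corr()        # stand-in correlation matrix
distance_df = 1 - corr_df.abs()                        # correlation -> distance
Z = linkage(squareform(distance_df.values, checks=False), method='average')
clustering = {
    'linkage': Z,                                      # linkage matrix consumed by cut_tree
    'distance_df': distance_df,                        # precomputed distances for silhouette_score
    'reorder_vec': np.arange(len(distance_df)),        # column ordering (only its length is used above)
    'labels': np.repeat([0, 1, 2], 4),                 # placeholder reference labels for the AMI comparison
}
# labels, scores, MI = get_constant_height_labels(clustering)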
def bench_k_means(estimator, data, labels):
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))
    homogeneity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogeneity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogeneity, completeness, v_measure)
    )
    adj_rand_score = metrics.adjusted_rand_score(
        labels, estimator.labels_
    )
    print("adjusted_rand_score {:.5}".format(adj_rand_score))
    adj_mutual_info_score = metrics.adjusted_mutual_info_score(
        labels, estimator.labels_
    )
    print("adjusted_mutual_info_score {:.5}".format(
        adj_mutual_info_score)
    )
    silhouette = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(silhouette))
    return [
        homogeneity, completeness, v_measure,
        adj_rand_score, adj_mutual_info_score, silhouette
    ]
def clustering_drawing():
    X, Tag = getData()
    n = 3
    kmeans_model = KMeans(n_clusters=n).fit(X)
    labels = kmeans_model.labels_
    score = metrics.silhouette_score(X, labels, metric='euclidean')
    scoreList = [score]
    nList = [3, 4, 5, 6, 7, 8, 9]
    for i in range(4, 10):  # try clusterings with 4 to 9 clusters
        # print i
        kmeans_model_temp = KMeans(n_clusters=i).fit(X)
        labels_temp = kmeans_model_temp.labels_
        score_temp = metrics.silhouette_score(X, labels_temp, metric='euclidean')
        print i, score_temp
        scoreList.append(float(score_temp))
        if float(score_temp) > score:
            # keep the best-scoring model seen so far
            score = float(score_temp)
            kmeans_model = kmeans_model_temp
            labels = labels_temp
            n = i
    print n, labels
    plt.axis([3, 9, 0.8, 1.0])
    plt.plot(nList, scoreList, 'r--')
    plt.show()
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
def benchmark(estimator, name, data): t0 = time() estimator.fit(data) print('% 9s %.2fs %i %.3f' % (name, (time() - t0), estimator.inertia_, metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=None))) return [time() - t0, estimator.inertia_, metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=None)]
def drawwDBSCAN(newarray,comparearray,cityname): X = StandardScaler().fit_transform(newarray) # print newarray # print "#########" # print X # X = newarray ############################################################################## # Compute DBSCAN db = DBSCAN(eps=0.3, min_samples=10).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) ############################################################################## # Plot result matplotlib.style.use('ggplot') # Black removed and is used for noise instead. unique_labels = set(labels) colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))) for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = 'k' class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) xy = X[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6) plt.title('Estimated number of clusters: %d' % n_clusters_) imgname = "./clusterimage/hourcondimention/" +cityname+'.png' fig = plt.gcf() fig.set_size_inches(16.5, 12.5) fig.savefig(imgname) ScandARI = drawlableCLuster(comparearray,labels,cityname.split('_')[3]) print ScandARI with open('summary_hour_total_dimention.csv','a') as f: write = csv.writer(f) # write.writerow(['name','clusters','SC']) write.writerow([cityname,n_clusters_,metrics.silhouette_score(X, labels, metric='sqeuclidean')]) write.writerow(["hour_dimention_twitterinfo"+cityname.split('_')[3],ScandARI[0],ScandARI[1],ScandARI[2]])
def eval_perf(self): X_tst, y_tst = self.data.get_test_set() code = self.dbn.f_code(X_tst) from sklearn import metrics sil_c = metrics.silhouette_score(code, y_tst) sil_X = metrics.silhouette_score(X_tst, y_tst) print 'Silhouette code y', sil_c print 'Silhouette X y', sil_X
def cluster_kmeans1(): dataset = datasets.load_iris() X = dataset.data kmeans_model = KMeans(n_clusters=4,random_state=1).fit(X) labels = kmeans_model.labels_ print X print kmeans_model.cluster_centers_ print labels print metrics.silhouette_score(X,labels,metric="euclidean")
def kmeans(input_file, n_clusters, Output): lvltrace.lvltrace("LVLEntree dans kmeans unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] sample_size, n_features = X.shape k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10) k_means.fit(X) reduced_data = k_means.transform(X) values = k_means.cluster_centers_.squeeze() labels = k_means.labels_ k_means_cluster_centers = k_means.cluster_centers_ print "#########################################################################################################\n" #print y #print labels print "K-MEANS\n" print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels)) print('completeness_score: %f'%metrics.completeness_score(y, labels)) print('v_measure_score: %f'%metrics.v_measure_score(y, labels)) print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels)) print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels)) print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) print('\n') print "#########################################################################################################\n" results = Output+"kmeans_scores.txt" file = open(results, "w") file.write("K-Means Scores\n") file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) file.write("\n") file.write("True Value, Cluster numbers, Iteration\n") for n in xrange(len(y)): file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1))) file.close() import pylab as pl from itertools import cycle # plot the results along with the labels k_means_cluster_centers = k_means.cluster_centers_ fig, ax = plt.subplots() im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.') for k in xrange(n_clusters): my_members = labels == k cluster_center = k_means_cluster_centers[k] ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6) fig.colorbar(im) plt.title("Number of clusters: %i"%n_clusters) save = Output + "kmeans.png" plt.savefig(save) lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def acc_silhouette(X, lbls_true, lbls_pred, reject, strat_lbl_inds, use_strat=False, metric='euclidean'): if use_strat: dists = sc.distances(X[:, strat_lbl_inds], gene_ids=np.arange(strat_lbl_inds.size), metric=metric ) sil = metrics.silhouette_score(dists, lbls_pred[strat_lbl_inds], metric='precomputed') perc = np.int(np.float(len(strat_lbl_inds))/np.float(lbls_true.size) * 100.0) desc = ('Silhouette (strat={0},{1})'.format(perc, metric), 'Silhouette ({0})'.format(metric)) else: dists = sc.distances(X, gene_ids=np.arange(X.shape[1]), metric=metric ) sil = metrics.silhouette_score(dists, lbls_pred, metric='precomputed') desc = ('Silhouette ({0})'.format(metric), 'Silhouette ({0})'.format(metric)) return sil, desc
def fit(self, X, Y=None): proj = skl_cluster.KMeans(**self.params) if isinstance(X, Table): proj = proj.fit(X.X, Y) proj.silhouette = silhouette_score(X.X, proj.labels_) else: proj = proj.fit(X, Y) proj.silhouette = silhouette_score(X, proj.labels_) proj.inertia = proj.inertia_ / len(X) cluster_dist = Euclidean(proj.cluster_centers_) proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)]) return KMeansModel(proj, self.preprocessors)
def bench_k_means(estimator, name, data, silhouette_results): t0 = time() estimator.fit(data) print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette') print('% 9s\t %.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % \ (name, (time() - t0), \ estimator.inertia_, \ metrics.homogeneity_score(labels, estimator.labels_), \ metrics.completeness_score(labels, estimator.labels_), \ metrics.v_measure_score(labels, estimator.labels_), \ metrics.adjusted_rand_score(labels, estimator.labels_), \ metrics.adjusted_mutual_info_score(labels, estimator.labels_), \ metrics.silhouette_score(data, estimator.labels_, metric='euclidean'))) return str(metrics.silhouette_score(data,estimator.labels_, metric='euclidean'))
def cluster_driver(a_driver):
    # print a_driver['DStats']
    # print "############################# DStats Above #############################"
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # print X
    # print "DStats are.....::", a_driver['DStats']
    # Feature order: ['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel', 'TotalTime', 'SkewDistDel', 'SkewACosDel']
    # print "############################# Scaled X Above ############################"
    # db = KMeans(n_clusters=20, n_jobs=-1).fit(X)
    db = DBSCAN(eps=0.45).fit(X)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print "###############################################################################"
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print 'Count of Predicts::', len(X)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric="mahalanobis"))
    # print "############################# DBSCAN X Below ############################"
    # print X
    # Rescale the per-sample silhouette values from [-1, 1] to [0, 1]
    return (metrics.silhouette_samples(X, labels, metric="mahalanobis") + 1) / 2
def save_cluster_metrics(self, points, predictions, key, level2_mode = False): try: silhoutte_global = metrics.silhouette_score(points, predictions, metric='euclidean') silhoutte_weighted = utils.silhoutte_weighted(points, predictions) self.silhouette_scores_global[key] = silhoutte_global self.silhouette_scores_weighted[key] = silhoutte_weighted if level2_mode: self.level2_silhoutte_global.append(silhoutte_global) self.level2_silhoutte_weighted.append(silhoutte_weighted) except ValueError as e: pass # dunn_scores = cluster_evaluation.dunn_index(points, predictions, means) dunn_scores = [0, 0, 0] if (dunn_scores[0] is not None) and (dunn_scores[1] is not None) and (dunn_scores[2] is not None): self.dunn_scores_1[key] = dunn_scores[0] self.dunn_scores_2[key] = dunn_scores[1] self.dunn_scores_3[key] = dunn_scores[2] if level2_mode: self.level2_dunn_1.append(dunn_scores[0]) self.level2_dunn_2.append(dunn_scores[1]) self.level2_dunn_3.append(dunn_scores[2])
def find_best(df, cls, norm):
    '''
    INPUTS: Pandas DataFrame, String of which dataset is being used, Boolean if data is normalized
    OUTPUTS: Prints scores to screen. Saves the ranked scores to a pickle file.
    '''
    if norm == True:
        df = StandardScaler(copy=False).fit_transform(df)
    files = [f for f in os.listdir(RESULTS_DIR)
             if f.endswith('{}_{}.pkl'.format(cls, norm)) and f.startswith('k')]
    scores = []
    for f in files:
        model = pickle.load(open(RESULTS_DIR + f, 'rb'))
        labels = model.predict(df)
        score = silhouette_score(df.values, labels, sample_size=10000)
        name = f.split('_')[1]
        scores.append((score, name))
        print "{} {} {} {}".format(name, float(score), cls, norm)
        del labels
        del model
    ranked_scores = sorted(scores, reverse=True)
    ranked_scores = [(item[1], item[0]) for item in ranked_scores]
    with open('{}_{}.pkl'.format(cls, norm), 'wb') as f:
        pickle.dump(ranked_scores, f)
    for item in ranked_scores:
        print item
def Create_Ext_Agg_cluster(self, stem, stop, processing, remS):
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.ExtStringCSv)
    Allstrings = list()
    #Allstrings = [rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
    for row_dict in Allrow_dicts:
        if self.POS == "ALL_EXT":
            Stringrow = row_dict["Text_original"] + row_dict["Adj_Extended"] + row_dict["Noun_Extended"] + row_dict["Verb_Extended"]
            Allstrings.append(Stringrow)
        else:
            Stringrow = row_dict["Adj"] + row_dict["Adj_Extended"] + row_dict["Noun"] + row_dict["Noun_Extended"]  #+row_dict["Verb"]+row_dict["Verb_Extended"]
            Allstrings.append(Stringrow)
    Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop) for string_text in Allstrings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text) for text in Allstrings_process]
    vectorizer = CountVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)
    feature_names = vectorizer.get_feature_names()
    Array = term_doc.toarray()
    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster, affinity='euclidean')
    if self.affinity == 'cosine':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity='cosine')
    Res_Labels = Agg_cluster.fit_predict(term_doc.toarray())
    self.cluster_tup_list = self.tuple_Ext_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)
    #term_doc_lsa = lsa.fit_transform(term_doc)
    print type(term_doc)
    self.metric = metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
    print Res_Labels
    print("n_samples: %d, n_features: %d" % term_doc.shape)
def ap(data): X = data af = AffinityPropagation( damping=0.8, max_iter=200, convergence_iter=15, preference=None, affinity='euclidean', verbose=True).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) # print( # "Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) # print("Completeness: %0.3f" % metrics.completeness_score( # labels_true, labels)) # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) # print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score( # labels_true, labels)) # print("Adjusted Mutual Information: %0.3f" % # metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score( X, labels, metric='sqeuclidean'))
def __initialize_clusters(self): # Load the data self.companies, self.descriptions, self.company_idx_map = load_data(self.data_dir, has_header=True) # Vectorize the data using TF-IDF to help reduce dimensionality self.vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', use_idf=True, ngram_range=(1, 2)) #min_df=2 X = self.vectorizer.fit_transform(self.descriptions) self.instance_vector_array = X.toarray() print("n_samples: %d, n_features: %d" % X.shape) # Initialize K-means algorithm preferences print("Initializing clusters, this takes a few seconds ...") self.km = KMeans(n_clusters=self.k, init='k-means++', max_iter=100, n_init=1, verbose=False) # Use k-means to generate clusters self.km.fit(X) # initialize results dictionary labels = self.km.labels_ for i in range(0, self.k, 1): self.results[i] = [] # assign results by label for i in range(0, len(labels), 1): self.results[labels[i]].append(self.companies[i]) self.company_to_cluster[self.companies[i].name] = labels[i] print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, self.km.labels_, sample_size=1000)) # Write cluster results to Output file output_results(self.km, self.k, self.vectorizer, self.results) pass
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    ##############################################################################
    # Extract Y true
    labels_true = y_true

    ##############################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D

    ##############################################################################
    # compute the clustering
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print 'Number of clusters: %d' % n_clusters_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels)
    print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels)
    print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels)
    print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
def test_KMeans_scores(self): digits = datasets.load_digits() df = pdml.ModelFrame(digits) scaled = pp.scale(digits.data) df.data = df.data.pp.scale() self.assert_numpy_array_almost_equal(df.data.values, scaled) clf1 = cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf1.fit(scaled) df.fit_predict(clf2) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.completeness_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.completeness_score(), expected) expected = m.v_measure_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.v_measure_score(), expected) expected = m.adjusted_rand_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.adjusted_rand_score(), expected) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean', sample_size=300, random_state=self.random_state) result = df.metrics.silhouette_score(metric='euclidean', sample_size=300, random_state=self.random_state) self.assertAlmostEqual(result, expected)
def optimal_cutoff(Y,dist_mat,min_size): labels = np.array([sch.fcluster(Y,c,criterion='distance') for c in Y[:,2]]) score = np.array([metrics.silhouette_score(dist_mat,l) for l in labels[:-min_size]]) c = Y[:-min_size,2] f = interp(c,-score,kind='linear') opt_c = opt.fmin(f,x0=c[2*min_size]) return opt_c
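# --- Input sketch (illustrative) ---
# optimal_cutoff() above relies on several module-level names; this sketch shows
# imports and toy inputs that match its call sites. The aliases (sch, opt,
# interp, metrics) and the sample data are assumptions, not from the source.
import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.optimize as opt
from scipy.interpolate import interp1d as interp
from scipy.spatial.distance import pdist, squareform
from sklearn import metrics

X = np.random.RandomState(0).rand(40, 5)
dist_mat = squareform(pdist(X))           # square distance matrix scored by silhouette_score
Y = sch.linkage(pdist(X), method='ward')  # linkage matrix; column 2 holds the merge heights
# opt_c = optimal_cutoff(Y, dist_mat, min_size=5)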
def getBestGMM(X, n_components_range, cv_types):
    '''
    Find the best GMM clustering by trying different covariance types and
    different numbers of mixture components.
    '''
    lowest_bic = np.infty
    bic = []
    silhouette = []
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a mixture of Gaussians with EM
            gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            Y_predicted = gmm.predict(X)
            if cv_type == 'tied':
                # Only keep silhouette values for 'tied'; a first run showed it to be the best covariance type
                silhouette.append(metrics.silhouette_score(X, Y_predicted, metric='euclidean'))
            if n_components >= 1:
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
    bic = np.array(bic)
    color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
    return best_gmm, color_iter, bic, silhouette
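# --- Usage sketch (illustrative) ---
# A hypothetical call of getBestGMM() above. It assumes an older scikit-learn
# where mixture.GMM exists (as the function itself does); the toy data, component
# range and covariance types below are made up for illustration.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
cv_types = ['spherical', 'tied', 'diag', 'full']
best_gmm, color_iter, bic, silhouette = getBestGMM(X_demo, range(1, 7), cv_types)
print(best_gmm)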
def kmean_clusters(min_list, d, show, thresh): np.random.seed(0) k_rng = range(1, len(min_list)) est = [KMeans(n_clusters=k).fit(d) for k in k_rng] silhouette_score = [ metrics.silhouette_score(d, e.labels_, metric='euclidean') for e in est[1:]] within_sum_squares = [e.inertia_ for e in est] diff_sq = [sq / within_sum_squares[0] for sq in within_sum_squares] diff_sq_pd = pd.Series(diff_sq) k_list = list(k_rng) select_k = k_list[len(k_list) - 1] thresh_pd = diff_sq_pd[diff_sq_pd < thresh] if thresh_pd.shape[0] > 0: select_k = k_list[thresh_pd.index[0]] if show: TLineDrawer.plot_elow_k_choice(k_rng, silhouette_score, within_sum_squares, select_k) select_est = est[select_k - 1] y_kmean = select_est.predict(d) return y_kmean, select_k
def findClusterSize(data): #also returns the value of the Silhouette Coefficient (I trust it more than elbow method) K = range(2,10) meandistortions = [] silCoeffs = [] for k in K: kmeans = KMeans(n_clusters=k) kmeans.fit(data) meandistortions.append(sum(np.min(cdist(data,kmeans.cluster_centers_,'euclidean'),axis=1))/data.shape[0]) silCoeffs.append(metrics.silhouette_score(data,labels=kmeans.labels_,metric='euclidean')) print "\n\nMean Distortions" print "------------------" for i in K: print 'K:',i,'\t',meandistortions[i-2],'\t Silhouette Coeff: ',silCoeffs[i-2] kDistTuple = [] print "\nDistortion Decline" print "-----------------------" for i in range(1,len(meandistortions)): difference = meandistortions[i-1]-meandistortions[i] kDistTuple.append([difference,i+1]) for i in range(len(kDistTuple)-1): print 'K: %2d -> %2d %10.5f'%(kDistTuple[i][1],kDistTuple[i+1][1],kDistTuple[i][0]) kDistTuple.sort() print "\n\nElbow Method Suggestion for Clusters: ",kDistTuple[0][1] kSilCoeff = zip(silCoeffs,K) kSilCoeff.sort() print "Best Silhouette Coeff and cluster size: ",kSilCoeff[-1] return kSilCoeff[-1]
def compute_silhouette_score(X, tree, metric_measure):
    '''
    n : sample size |X|
    For each number of clusters k, P_k is the partition of X into k clusters
    obtained by cutting the ward tree; compute the silhouette score of P_k.

    input:
        X : data
        tree : ward tree
        metric_measure : distance metric ('euclidean', ...)
    output:
        1D float array of size n-2; entry k-2 holds the silhouette score of P_k
    '''
    n = len(X)
    # the silhouette score is only defined for 2 <= k <= n-1 clusters,
    # so k runs from 2 to n-1
    score = np.zeros(n - 2)
    print 'Length : ', n
    for i in range(n - 2):
        k = i + 2
        print '\n Cutting at k = ', k
        label = _hc_cut(k, tree.children_, tree.n_leaves_)
        print '\n Compute score ...'
        s = metrics.silhouette_score(X, label, metric=metric_measure)
        #s = silhouette_score_block(X, label, metric=metric_measure, sample_size=None)
        score[k - 2] = s
    return score
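# --- Usage sketch (illustrative) ---
# A hypothetical call of compute_silhouette_score() above, assuming `tree` is a
# fitted scikit-learn AgglomerativeClustering ward tree (its children_ and
# n_leaves_ attributes are what _hc_cut consumes). The toy data is made up.
import numpy as np
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
try:
    from sklearn.cluster._agglomerative import _hc_cut   # newer scikit-learn
except ImportError:
    from sklearn.cluster.hierarchical import _hc_cut     # older scikit-learn

X_demo = np.random.RandomState(0).rand(50, 3)
tree = AgglomerativeClustering(linkage='ward', compute_full_tree=True).fit(X_demo)
scores = compute_silhouette_score(X_demo, tree, 'euclidean')
print(scores.argmax() + 2)   # the k with the best silhouette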
def fit_dbscan(data, eps, min_samples, normalize=True, show=True, juxta_cluster_indices_grouped=None, threshold_legend=None): X = np.transpose(data) if normalize: from sklearn.preprocessing import minmax_scale minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False) from sklearn.cluster import DBSCAN from sklearn import metrics db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) score = metrics.silhouette_score(X, labels, sample_size=5000) print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_)) print("Silhouette Coefficient: {}".format(score)) if show: pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend) return db, n_clusters_, labels, core_samples_mask, score
def calculateNumberOfIdealClusters(maxAmount, corpus): print "Initializing silhouette analysis" range_n_clusters = range(2, maxAmount) # max amount of clusters equal to amount of jobs silhouette_high = 0; silhouette_high_n_clusters = 2; for n_clusters in range_n_clusters: # Initialize the clusterer with n_clusters value cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean") cluster_labels = cluster.fit_predict(corpus) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed clusters silhouette_avg = silhouette_score(corpus, cluster_labels) print "For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg) if (silhouette_avg > silhouette_high): silhouette_high = silhouette_avg silhouette_high_n_clusters = n_clusters # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(corpus, cluster_labels) print ("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters)) return silhouette_high_n_clusters
def _cluster(params): cls = None method = sh.getConst('method') if method=='kmedoid': assert False # from kmedoid import kmedsoid # cls = kmedoid elif method=='dbscan': from sklearn.cluster import DBSCAN cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'], metric='precomputed') else: assert False, 'FATAL: unknown cluster method' ## mat = sh.getConst('mat') labels = cls.fit_predict(mat) nLabels = len(set(labels)) ## sil = None; cal = None if (nLabels >= 2)and(nLabels <= len(labels)-1): sil = met.silhouette_score(mat,labels,'precomputed') cal = met.calinski_harabaz_score(mat,labels) perf = dict(silhouette_score=sil,calinski_harabaz_score=cal) return (labels,perf)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(cc_clean)

# KMeans for 2 to 9 clusters
KS = range(2, 10)

# storage
inertia = []
silo = []

for k in KS:
    km = KMeans(k)
    km.fit(X_pca)
    labs = km.predict(X_pca)
    inertia.append(km.inertia_)
    silo.append(silhouette_score(X_pca, labs))

print(silo)

# plot
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
plt.title("Inertia")
sns.lineplot(KS, inertia)

plt.subplot(1, 2, 2)
plt.title("Silhouette Score")
sns.lineplot(KS, silo)
for i in range(49): cluster_num = i + 2 print('the cluster num is: %d' % cluster_num) # Set the K-Means input vector here # Kmeans = KMeans(n_clusters=cluster_num, random_state=None).fit(costKmeans_norm) labels = np.array(Kmeans.labels_) BC, WC = My_harabaz_score(costKmeans_norm, Kmeans.labels_, cluster_num) temp_score = metrics.calinski_harabaz_score(costKmeans_norm, Kmeans.labels_) print('the harabaz score is: %f' % temp_score) harabaz_score.append(temp_score) temp_score1 = metrics.silhouette_score(costKmeans_norm, Kmeans.labels_, metric='euclidean') sil_coe.append(temp_score1) print('the sil_coe is: %f' % temp_score1) BC_score.append(BC) WC_score.append(WC) print(BC) print(WC) print(labels) labelsforKmeans.append(labels) sil_coe_diff = np.diff(sil_coe) harabaz_score_diff = np.diff(harabaz_score)
centers = pca.transform(clusterer.means_) figname = create_path("fig", sys.argv[1], "GMM", sys.argv[2], filename=("%d_%s_gmm_vis.png" % (n_clusters, covariance_type))) visualize_cluster(X_vis, cluster_labels, n_clusters, centers, figname) ari = metrics.adjusted_rand_score(y, cluster_labels) ami = metrics.adjusted_mutual_info_score(y, cluster_labels) nmi = metrics.normalized_mutual_info_score(y, cluster_labels) fms = metrics.fowlkes_mallows_score(y, cluster_labels) sil = metrics.silhouette_score(X, cluster_labels, metric='euclidean') chi = metrics.calinski_harabaz_score(X, cluster_labels) dbi = metrics.davies_bouldin_score(X, cluster_labels) print("Adjusted Rand index: %.6f" % ari) print("Adjusted Mutual Information: %.6f" % ami) print("Normalized Mutual Information: %.6f" % nmi) print("Fowlkes-Mallows score: %.6f" % fms) print("Silhouette Coefficient: %.6f" % sil) print("Calinski-Harabaz Index: %.6f" % chi) print("Davies-Bouldin Index: %.6f" % dbi) ari_score.append(ari) ami_score.append(ami) nmi_score.append(nmi)
core_samples = db.core_sample_indices_ labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) ############################################################################## # Plot result import pylab as pl # Black removed and is used for noise instead. unique_labels = set(labels) colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels))) for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = 'k' markersize = 6 class_members = [index[0] for index in np.argwhere(labels == k)] cluster_core_samples = [index for index in core_samples
threedee.set_ylabel('y') threedee.set_zlabel('z') fig_2 = threedee.get_figure() fig_2.suptitle('3D Clustered Data', fontsize=16) fig_2.savefig('Q6_1.png') plt.show() #given that we know/can know the averages of these can also work an error from this #and see if this is a better indicator #compare the created samples mean and the predicted means means_3 = np.array(clf_3.means_) covariance_3 = clf_3.covariances_ print("The silhouette score is "+ str(silhouette_score(gen_df, labels_3, metric = 'euclidean'))) print("examining the means of the generated GMMs and the predicted ones") print("counter mean") print(counter_mean) print("predicted mean") print(means_3) mean_difference = means_3 - counter_mean print("The differences between the means in this model") print(mean_difference) print("This is because the large values dominate the mean") #Large values are over represented in the mean #The problem here is that there is far too much overlap in these distributions #so while the model can fit it the means it provides mean nothing #want to test that the model would work if the distribution were spread out
elapsed = timeit.default_timer() - start_time print('Execution time: {0:.4f} sec'.format(elapsed)) x, y = zip(*sorted( sil_coef.items())) # unpack a list of pairs into two tuples plt.plot(x, y) plt.xlabel('Number of Clusters') plt.ylabel('Silhouette Score') plt.show() return sil_coef sil_coef = cluster(df_N, 10) #%% print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(phi_true, phi_predict)) print("Completeness: %0.3f" % metrics.completeness_score(phi_true, phi_predict)) print("V-measure: %0.3f" % metrics.v_measure_score(phi_true, phi_predict)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(phi_true, phi_predict)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(phi_true, phi_predict)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(dfHeurN, phi_predict, metric='sqeuclidean'))
def silhouette_coefficient(dataSet): # List of number of clusters range_n_clusters = [2, 3, 4, 5, 6] X = dataSet # pca = decomposition.PCA(n_components=2) # pca.fit(X) # X = pca.transform(X) # For each number of clusters, perform Silhouette analysis and visualize the results. for n_clusters in range_n_clusters: # Perform k-means. kmeans = KMeans(n_clusters=n_clusters, random_state=10) y_pred = kmeans.fit_predict(X) # Compute the cluster homogeneity and completeness. homogeneity = metrics.homogeneity_score(y_pred, y_pred) completeness = metrics.completeness_score(y_pred, y_pred) # Compute the Silhouette Coefficient for each sample. s = metrics.silhouette_samples(X, y_pred) # Compute the mean Silhouette Coefficient of all data points. s_mean = metrics.silhouette_score(X, y_pred) # For plot configuration ----------------------------------------------------------------------------------- fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(18, 7) # Configure plot. plt.suptitle( 'Silhouette analysis for K-Means clustering with n_clusters: {}'. format(n_clusters), fontsize=14, fontweight='bold') # Configure 1st subplot. ax1.set_title('Silhouette Coefficient for each sample') ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") ax1.set_xlim([-1, 1]) ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # Configure 2st subplot. ax2.set_title( 'Homogeneity: {}, Completeness: {}, Mean Silhouette score: {}'. format(homogeneity, completeness, s_mean)) ax2.set_xlabel("Feature space for the 1st feature") ax2.set_ylabel("Feature space for the 2nd feature") # For 1st subplot ------------------------------------------------------------------------------------------ # Plot Silhouette Coefficient for each sample y_lower = 10 for i in range(n_clusters): ith_s = s[y_pred == i] ith_s.sort() size_cluster_i = ith_s.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_s, facecolor=color, edgecolor=color, alpha=0.7) ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) y_lower = y_upper + 10 # Plot the mean Silhouette Coefficient using red vertical dash line. ax1.axvline(x=s_mean, color="red", linestyle="--") # For 2st subplot ------------------------------------------------------------------------------------------- #pca = decomposition.PCA(n_components=2) #pca.fit(X) #plot_X = pca.transform(X) # Plot the predictions colors = cm.spectral(y_pred.astype(float) / n_clusters) ax2.scatter(X[:, 0], X[:, 1], c=colors) return X
centers = kmeans.cluster_centers_ score = kmeans.score(df_scaled) # Compute Clustering Metrics n_clusters_ = len(centers) print('Number of clusters: %d' % n_clusters_) #print("Homogeneity: %0.3f" % metrics.homogeneity_score(phi_true, phi_predict)) #print("Completeness: %0.3f" % metrics.completeness_score(phi_true, phi_predict)) #print("V-measure: %0.3f" % metrics.v_measure_score(phi_true, phi_predict)) #print("Adjusted Rand Index: %0.3f" # % metrics.adjusted_rand_score(phi_true, phi_predict)) #print("Adjusted Mutual Information: %0.3f" # % metrics.adjusted_mutual_info_score(phi_true, phi_predict)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(df_scaled, phi_predict, metric='sqeuclidean')) # timeit statement elapsed = timeit.default_timer() - start_time #%% import math df = df_metrics plt.scatter(df.AvgStart, df.AvgEnergy) #%% def round10(x): return int(math.ceil(x / 5.0)) * 5
def main(): dataset = pd.read_csv('dataset.csv') positive = dataset.loc[dataset['Lab Status'] == 'Positive ID'] latitude = get_column_value(positive, 'Latitude').tolist() longitude = get_column_value(positive, 'Longitude').tolist() date = get_column_value(positive, 'Detection Date').tolist() date = pd.to_datetime(date) interval = (date - date[0]).days interval = interval - np.min(interval) data = [] for i, la in enumerate(latitude): data.append([latitude[i], longitude[i], interval[i]]) data = np.array(data) data = data[np.argsort(data[:, 2])] data_scale = preprocessing.scale(data) SSE = [] for k in range(2, 9): kmeans = KMeans(n_clusters=k, random_state=0).fit(data_scale) SSE.append(kmeans.inertia_) X = range(2, 9) plt.xlabel('Number of Clusters(k)') plt.ylabel('SSE') plt.title('SSE vs k') plt.plot(X, SSE, 'o-') plt.show() Scores = [] for k in range(2, 9): kmeans = KMeans(n_clusters=k, random_state=0).fit(data) Scores.append( silhouette_score(data, kmeans.labels_, metric='euclidean')) X = range(2, 9) plt.xlabel('Number of Clusters(k)') plt.ylabel('Silhouette Coefficient') plt.title('Silhouette Coefficient vs k') plt.plot(X, Scores, 'o-') plt.show() cluster_num = 3 kmeans = KMeans(n_clusters=cluster_num, random_state=0).fit(data_scale) label = kmeans.labels_ centers = [] label_list = [] for i in range(cluster_num): label_list.append(data[label == i, 0:2].tolist()) centers.append(np.mean(data[label == i], axis=0).tolist()) centers = np.array(centers) centers_list = np.delete(centers, -1, axis=1).tolist() centers = centers[np.argsort(centers[:, 2])] print(centers) ax1 = plt.axes(projection='3d') ax1.scatter3D(data[:, 1], data[:, 0], data[:, 2], c=kmeans.labels_, cmap='rainbow') ax1.scatter3D(centers[:, 1], centers[:, 0], centers[:, 2], c='black', s=150, alpha=0.5) plt.show() x = centers[:, 1].reshape((-1, 1)) y = centers[:, 0] reg = LinearRegression().fit(x, y) k = reg.coef_[0] b = reg.intercept_ print("Y = %.5fX + (%.5f)" % (k, b)) plt.scatter(data[:, 1], data[:, 0], c=label, cmap='rainbow') plt.scatter(centers[:, 1], centers[:, 0], c='black', s=150, alpha=0.5) data = data[np.argsort(data[:, 1])] plt.plot(data[np.argsort(data[:, 1])][:, 1].reshape((-1, 1)), reg.predict(data[np.argsort(data[:, 1])][:, 1].reshape((-1, 1))), c='b', linestyle='--') plt.xlabel('Longitude') plt.ylabel('Latitude') plt.title('Linear Regression of Cluster Centers(k=%d)' % cluster_num) plt.grid() plt.show() cluster_foot_x, cluster_foot_y = get_foot_point(centers[-1, 1], centers[-1, 0], k, b) print("center-%d distance to line:%.5f" % (cluster_num, get_distance([centers[-1, 1], centers[-1, 0]], [cluster_foot_x, cluster_foot_y]))) sum_dis = 0 for i in range(data.shape[0]): foot_x, foot_y = get_foot_point(data[i, 1], data[i, 0], k, b) sum_dis += get_distance([data[i, 1], data[i, 0]], [foot_x, foot_y]) print("sum_dis:%.5f" % sum_dis) colors = ['blue', 'green', 'orange', 'pink', 'purple', 'red'] map = folium.Map(location=[48.9938, -122.702], zoom_start=8, tiles="OpenStreetMap") for i in range(len(label_list)): point_list = label_list[i] for point in range(len(point_list)): folium.CircleMarker(radius=2.5, location=label_list[i][point], color=colors[i], fill=True, fill_color=colors[i], fill_opacity=1).add_to(map) for i in range(len(centers_list)): folium.CircleMarker(cradius=6, location=centers_list[i], color=colors[i], fill=True, fill_color=colors[i], fill_opacity=0.3).add_to(map) map.save('map_cluster%d.html' % cluster_num)
labels = d.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of noise points: %d' % n_noise_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) print(db.core_sample_indices_) print(db.labels_ == d.labels_) if (db.labels_ == d.labels_).all(): print('lists the same') else: print('lists differ') print('self.x from Dbscan') print(d.x) # ############################################################################# # Plot result import matplotlib.pyplot as plt # Black removed and is used for noise instead. unique_labels = set(labels) colors = [
def silhouette_elbow__analysis(X, range_n_clusters, all_cluster_labels, all_centers): """ :param X: 原始样本 :param range_n_clusters: K的取值情况, list :param all_cluster_labels: 簇标签结果 list :param all_centers: 每种K值情况下的簇中心 list :return: """ assert len(all_cluster_labels) == len(all_centers) == len(range_n_clusters) plt.figure(figsize=(10, 8)) row_plot = 3 # 子图的行数 all_dist = [] for n, n_clusters in enumerate(range_n_clusters): # ================= 轮廓分析法 ============================ cluster_labels = all_cluster_labels[n] plt.subplot(row_plot, (len(range_n_clusters) + 1) // row_plot, n + 1) plt.xlim([-0.1, 1]) # 设置x轴的范围(轮廓系数) plt.ylim([0, len(X) + (n_clusters + 1) * 10]) # 顶端的间隙 silhouette_avg = silhouette_score(X, cluster_labels) # 所有样本的轮廓系数均值 print(" 当 n_clusters = ", n_clusters, "时,轮廓系数为: ", silhouette_avg) # 计算每个样本对应的轮廓系数 sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # 遍历每一个簇 # 取第i个簇中对应所有样本的轮廓系数,并进行排序 s_values = sample_silhouette_values[cluster_labels == i] s_values.sort() size_cluster_i = s_values.shape[0] # 得到第i个簇的样本数量 y_upper = y_lower + size_cluster_i # 图中每个簇在y轴上的宽度 # 限定y的范围,填充x1和x2所围成的区域 plt.fill_betweenx(y=np.arange(y_lower, y_upper), x1=0, x2=s_values, alpha=0.7) # 在y轴右侧标记每个簇的序号 plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # 计算下一个条形图y轴的其实值 y_lower = y_upper + 10 # 10 for the 0 samples fm.fontManager.addfont('../data/SimHei.ttf') plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来 正常显示中文标签 plt.title(f"K = {n_clusters} 时的轮廓系数图", fontsize=12) plt.xlabel("轮廓系数", fontsize=12) plt.ylabel("聚类簇序号", fontsize=12) # 以x=silhouette_avg 画一条平行于y轴的线 plt.axvline(x=silhouette_avg, color="red", linestyle="--") plt.yticks([]) # 去掉y轴的刻度 plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) # 设置x轴的刻度 # ============ 肘部法计算簇内距离和并可视化 ========================= dist = 0 centers = all_centers[n] for i in range(n_clusters): # 遍历每一个簇,计算当前簇的簇内距离 x_data = X[cluster_labels == i] tmp = np.sum((x_data - centers[i]) ** 2, axis=1) dist += np.sum(np.sqrt(tmp)) # 累计当前聚类结果下所有簇的簇内距离和 all_dist.append(dist) plt.subplot(row_plot, (len(range_n_clusters) + 1) // row_plot, len(range_n_clusters) + 1) plt.title("肘部法结果") plt.plot(range_n_clusters, all_dist) # 绘制肘部曲线 plt.scatter(range_n_clusters, all_dist) # 绘制各个点 for i in range(len(range_n_clusters)): # 在图上进行K值标记 plt.annotate(f"k = {range_n_clusters[i]}", xy=(range_n_clusters[i], all_dist[i]), fontsize=14, xytext=(range_n_clusters[i] + 0.1, all_dist[i])) plt.hlines(all_dist[i], xmin=0, xmax=range_n_clusters[i], color="red", linestyle="--") plt.xlim(range_n_clusters[0] - 0.5, range_n_clusters[-1] + 0.8) # 调整范围 plt.ylim(all_dist[-1] * 0.9, all_dist[0] + all_dist[-1] * 0.1) plt.yticks([]) # 去掉y轴上的刻度显示 plt.xlabel("K", fontsize=12) plt.ylabel("distance", fontsize=12) plt.tight_layout() plt.show()
def getSilhouetteCoeff(data, labels): silCoeff = silhouette_score(data, labels, metric='euclidean') return silCoeff
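# --- Usage sketch (illustrative) ---
# A minimal, self-contained call of getSilhouetteCoeff() above; the toy data
# and cluster count are assumptions for illustration only.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_demo)
print(getSilhouetteCoeff(X_demo, kmeans.labels_))   # mean silhouette over all samples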
def my_score(X, y):
    return mutual_info_classif(X, y, random_state=724)

selectedfeatures = SelectKBest(my_score, k=25)
selectedfeatures.fit(data_train, label_train)
smalldata_train = selectedfeatures.transform(data_train)  # the 25-dimensional training data after feature selection
#kmeans = KMeans(n_clusters=50, random_state=0).fit_predict(data_train)
smalldata_train = np.hstack((smalldata_train, data_train[0:, original_length:]))

best_silhouette_score = -2
for nclusters in range(40, 120, 20):
    kmeans_model = KMeans(n_clusters=nclusters, random_state=926)
    kmeans_model.fit(smalldata_train)
    tmp = silhouette_score(smalldata_train, kmeans_model.labels_, sample_size=40000)
    print("n score: %d %0.3f" % (nclusters, tmp))
    if (tmp > best_silhouette_score):
        best_silhouette_score = tmp
        best_n_clusters = nclusters
print("best_n_clusters: %d" % (best_n_clusters))

kmeans_model = KMeans(n_clusters=best_n_clusters, random_state=926)
kmeans_model.fit(smalldata_train)
kmeans = kmeans_model.labels_
# print kmeans
#print>>f, kmeans.size

split = np.zeros(10000)
nosplit = np.zeros(10000)
clusters = 0
len1 = int(data_train.size /
def vis_cluster(X,X_pca,n_clusters): y_lower = 10 # y값의 기준 kmeans = KMeans(n_clusters= n_clusters, random_state=42) cluster_labels = kmeans.fit_predict(X_pca) # clustering silhouette_avg = silhouette_score(X, cluster_labels) sample_silhouette_values = silhouette_samples(X, cluster_labels) # visual fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(18, 7) # The 1st subplot is the silhouette plot # The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-0.1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) # 2nd Plot showing the actual clusters formed colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters) ax2.scatter(X_pca[:, 0], X_pca[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k') # Labeling the clusters centers = kmeans.cluster_centers_ # Draw white circles at cluster centers ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k') for i, c in enumerate(centers): ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k') ax2.set_title("The visualization of the clustered data.") ax2.set_xlabel("Feature space for the 1st feature") ax2.set_ylabel("Feature space for the 2nd feature") plt.suptitle(("Silhouette analysis for KMeans clustering on sample data " "with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold') plt.show()
clusters = fcluster(distance, k, criterion='maxclust') plt.figure(figsize=(10, 8)) plt.scatter(x[:, 0], x[:, 1], c=clusters, cmap='prism') '''K-Means''' from sklearn.cluster import KMeans model = KMeans(n_clusters=5) model.fit(x) y_kmeans = model.predict(x) plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=10, cmap='inferno') centers = model.cluster_centers_ plt.scatter(centers[:, 0], centers[:, 1], c='cyan', s=300) from mlxtend.plotting import plot_decision_regions print(model.inertia_) elbow = [] for i in range(1, 15): kmeans = KMeans(n_clusters=i).fit(x) elbow.append([i, kmeans.inertia_]) plt.plot(pd.DataFrame(elbow)[0], pd.DataFrame(elbow)[1]) from sklearn.metrics import silhouette_score silhoutte = [] for i in range(2, 8): kmeans = KMeans(n_clusters=i).fit(x) silhoutte.append([i, silhouette_score(x, kmeans.labels_)]) plt.plot(pd.DataFrame(silhoutte)[0], pd.DataFrame(silhoutte)[1])
model.fit(X) # append model to cluster list clusters.append(model) inertia_vals.append(model.inertia_) # plot the inertia vs K values plt.plot(range(2, 15, 1), inertia_vals, marker='*') plt.show() from sklearn.metrics import silhouette_score # # print(clusters[1]) # print("Silhouette score for k=4", silhouette_score(X, clusters[1].predict(X))) print(clusters[2]) print("Silhouette score for k=4", silhouette_score(X, clusters[2].predict(X))) print(clusters[3]) print("Silhouette score for k=5", silhouette_score(X, clusters[3].predict(X))) print(clusters[4]) print("Silhouette score for k=6", silhouette_score(X, clusters[4].predict(X))) print(clusters[5]) print("Silhouette score for k=7", silhouette_score(X, clusters[5].predict(X))) # K means clustering using the term vector kmeans = KMeans(n_clusters=6, random_state=rs).fit(X) # function to visualise text cluster. Useful for the assignment too
# In[9]: from sklearn.cluster import KMeans Cluster = KMeans(n_clusters=3, random_state=2) Cluster.fit(data) y_pred = Cluster.predict(data) plt.scatter(data_arr[:, 0], data_arr[:, 1], c=y_pred, s=50, cmap='plasma') plt.rcParams.update({'figure.figsize': (10, 7.5), 'figure.dpi': 100}) # In[10]: Cluster.fit(data) y_pred = Cluster.predict(test) plt.scatter(test[:, 0], test[:, 1], c=y_pred, s=50, cmap='plasma') plt.rcParams.update({'figure.figsize': (10, 7.5), 'figure.dpi': 100}) # In[11]: from sklearn.metrics import silhouette_score for i in range(2, 10): clusterer = KMeans(n_clusters=i, random_state=i) cluster_labels = clusterer.fit_predict(data) silhouette_avg = silhouette_score(data, cluster_labels) print("For n_clusters =", i, "The average silhouette_score is :", silhouette_avg) # In[ ]:
# The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-0.1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(X) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(X, cluster_labels) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort()
def tune_hyperparameters(data_list, if_tune_m=True, m_range=None, if_tune_dim=True, dim_range=None, if_tune_p=False, p_range=None, log_norm=True, l2_norm=True, true_labels=None, verbose=True): # Specify data normalization data_list = preprocess(data_list, log_norm=log_norm, l2_norm=l2_norm) num_datasets = len(data_list) # Impute m if None if m_range==None: m_est = max(m_estimate(data_list)) if if_tune_m: m_range = [m_est+i*5 for i in range(-3, 3)] else: m_range = [m_est] print('WARNING no value of m is given, default m={} for the dataset(s) from estimation.'.format(m_est)) # Impute dim if None if dim_range==None: dim_est = dim_estimate(data_list) if if_tune_dim: dim_range = [dim_est+i*10 for i in range(-2, 2)] else: dim_range = [dim_est] print('WARNING no value of dim is given, default dim={} for the dataset(s) from estimation.'.format(dim_est)) # Impute p if None if p_range==None: if if_tune_p: p_range = [0.1, 0.3, 0.5] else: p_range = [0.3] print('WARNING no value of p is given, default p=0.3 for the dataset(s) from estimation.') # If ground truth given, find n_clusters if true_labels is not None: n_clusters = len(np.unique(true_labels)) out = [] if verbose: print('Testing hyperparameters in the range below:') print('Range for m: {}'.format(m_range)) print('Range for dim: {}'.format(dim_range)) print('Range for p: {}'.format(p_range)) for m in m_range: for n_dim in dim_range: for p in p_range: if m*p < 3: print('Skip m={} and p={} as the number of ghost cells is smaller than 3.'.format(m, p)) continue ZW = run_OCAT(data_list=data_list, m_list=[m]*num_datasets, dim=n_dim, p=p, log_norm=False, l2_norm=False) if true_labels is None: labels_pred, n_clusters = evaluate_clusters(ZW, return_num_cluster=True) sil_score = silhouette_score(ZW, labels_pred) out.append([m, n_dim, p, n_clusters, sil_score]) else: labels_pred = evaluate_clusters(ZW, num_cluster=n_clusters) NMI_cell = normalized_mutual_info_score(true_labels, labels_pred) AMI_cell = adjusted_mutual_info_score(true_labels, labels_pred) ARI_cell = adjusted_rand_score(true_labels, labels_pred) out.append([m, n_dim, p, NMI_cell, AMI_cell, ARI_cell]) out = np.array(out) if true_labels is not None: df = pd.DataFrame(data=out, columns=['m', 'n_dim', 'p', 'NMI_score', 'AMI_score', 'ARI_score']) else: df = pd.DataFrame(data=out, columns=['m', 'n_dim', 'p', 'n_clusters', 'silhoutte_score']) if verbose: print(df) return df
import matplotlib.pyplot as plt

for i in clases:
    dlabels = np.where(dy[:, 0] == i)[0]
    plt.plot(data1[dlabels, 0], data1[dlabels, 1], 'x')
plt.xlabel('Original clustering')
plt.show()

# ### Silhouette score of the data with the original classification
#

# In[527]:

from sklearn.metrics import silhouette_score

silhouette_score(data1, dy[:, 0] - 1, metric='sqeuclidean')

# #### Classify with the 2 best features and plot

# In[528]:

cm = cmeans(nclusters=clases.shape[0])
cm.fit(datas[0], m=3)

# ## Membership matrix of the data to the clusters

# In[531]:

membership = cm.predict(datas[0], m=3)
np.round(membership, 1)
def illustration(data, range_n_clusters): """ TBD """ # Scale des données obligatoire avant la réduction des dimensions std_scale = preprocessing.StandardScaler().fit(data) X = std_scale.transform(data) for n_clusters in range_n_clusters: # Create a subplot with 1 row and 2 columns fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(36, 14) # The 1st subplot is the silhouette plot # The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-0.1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(X) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(X, cluster_labels) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = sample_silhouette_values[ cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) # Clear the yaxis labels / ticks ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) # 2nd Plot showing the actual clusters formed colors = cm.spectral(cluster_labels.astype(float) / n_clusters) ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k') # Labeling the clusters centers = clusterer.cluster_centers_ # Draw white circles at cluster centers ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k') for i, c in enumerate(centers): ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k') ax2.set_title("The visualization of the clustered data.") ax2.set_xlabel("Feature space for the 1st feature") ax2.set_ylabel("Feature space for the 2nd feature") plt.suptitle( ("Silhouette analysis for KMeans clustering on sample data " "with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold') plt.show()
silhouette = {}
D = pairwise_distances(training_dataset, metric='euclidean')
for n_clusters in range_n_clusters:
    print('number of clusters: {}'.format(n_clusters))
    M, C = kmedoids.kMedoids(D, n_clusters)
    # Turn the medoid -> member-indices mapping into a flat label vector
    # (one entry per sample instead of the hard-coded 1173)
    labels = np.zeros(D.shape[0])
    for label in C:
        #print(label)
        for point_idx in C[label]:
            labels[point_idx] = label
    silhouette_avg = silhouette_score(training_dataset, labels)
    vcr_avg = calinski_harabasz_score(training_dataset, labels)
    silhouette[n_clusters] = silhouette_avg
    print("For n_clusters =", n_clusters,
          "the average silhouette_score is:", silhouette_avg)
    print("For n_clusters =", n_clusters,
          "the Calinski-Harabasz score is:", vcr_avg)
    #sample_silhouette_values = silhouette_samples(training_dataset, cluster_labels)

plt.figure()
plt.plot(list(inertia.keys()), list(inertia.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
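
# Not in the original script: since the pairwise distance matrix D is already
# available, the silhouette can also be computed from it directly, which avoids
# recomputing the distances inside silhouette_score (a sketch, reusing D and
# the `labels` vector built above).
silhouette_precomputed = silhouette_score(D, labels, metric='precomputed')
print("Silhouette from precomputed distances:", silhouette_precomputed)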
plt.ylim([0, 10])
plt.title('Instances')
plt.scatter(x1, x2)

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']

clusters = [2, 3, 4, 5, 8]
subplot_counter = 1
sc_scores = []
for t in clusters:
    subplot_counter += 1
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)
    # Label each subplot with its silhouette coefficient.
    plt.title('K = %s, silhouette coefficient= %0.03f' % (t, sc_score))

# Plot the silhouette coefficient against the number of clusters.
plt.figure()
plt.plot(clusters, sc_scores, '*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')
plt.show()
cah = AgglomerativeClustering(n_clusters=k)
cah.fit(centroids)
labels = cah.labels_

nv_centroids = pd.DataFrame(centroids)
nv_centroids["labels"] = labels
nv_centroids = nv_centroids.groupby("labels").mean()

# Consolidation step: re-run KMeans initialised on the merged centroids
# (n_init=1 since the starting centroids are given explicitly)
clf_2 = KMeans(n_clusters=k, init=nv_centroids, n_init=1)
clf_2.fit(sv_data_scaled)
labels_final = clf_2.labels_

s_score = silhouette_score(sv_data_scaled, labels_final, metric="sqeuclidean")
s_scores.append(s_score)

plt.plot(k_clust, s_scores)

# Highest silhouette score for n_clusters = 2
# => best number for intra-cluster homogeneity and inter-cluster separation
# BUT it does not give a good separation of the clusters in practice, as seen above (van and bus very close)
# Choosing 3 clusters therefore seems relevant, even at the cost of some intra-cluster homogeneity
clf = KMeans(n_clusters=3)
clf.fit(sv_data_scaled)
labels = clf.labels_
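
# Not in the original notebook: a small sketch (using the same sv_data_scaled)
# that quantifies the trade-off discussed above by scoring a plain KMeans
# solution at k = 2 and k = 3 side by side.
for k_check in (2, 3):
    labels_check = KMeans(n_clusters=k_check).fit_predict(sv_data_scaled)
    print(k_check, silhouette_score(sv_data_scaled, labels_check, metric="sqeuclidean"))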
            init='k-means++', max_iter=500, n_init=1, verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette is an internal metric: score the predicted km.labels_,
# not the ground-truth labels.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

if not (opts.n_components or opts.use_hashing):
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
def get(self, x, labels):
    return silhouette_score(x, labels, **self.kwargs)
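
# Not from the original source: a minimal sketch of the wrapper class this
# method appears to belong to, assuming the constructor simply stores keyword
# arguments that are forwarded to sklearn's silhouette_score.
from sklearn.metrics import silhouette_score

class SilhouetteMetric:
    def __init__(self, **kwargs):
        self.kwargs = kwargs  # e.g. metric='cosine', sample_size=...

    def get(self, x, labels):
        return silhouette_score(x, labels, **self.kwargs)

# Usage: SilhouetteMetric(metric='cosine').get(X, cluster_labels)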
print(i)

# Keep only the encoder half of the autoencoder (first 4 layers)
W = W[0:4]
biases = biases[0:4]

def encode(encoder_weights, encoder_biases, data):
    # Forward pass through the encoder: ReLU on hidden layers, linear output layer
    res = data
    for index, (w, b) in enumerate(zip(encoder_weights, encoder_biases)):
        if index + 1 == len(encoder_weights):
            res = np.dot(res, w) + b
        else:
            res = np.maximum(0, np.dot(res, w) + b)
    return res

res = encode(W, biases, X_test)
print(res.shape)

unique_labels = np.unique(y_test)
for index, unique_label in enumerate(unique_labels):
    data_latent_space = res[y_test == unique_label]
    plt.scatter(data_latent_space[:, 0], data_latent_space[:, 1],
                alpha=0.3, c=cmap(index))
plt.xlabel("Latent X")
plt.ylabel("Latent Y")
plt.title("Autoencoder results")

# Silhouette of the true classes in the latent space (how well the encoder separates them)
print(silhouette_score(res, y_test))
print("PCA silhouette score (how well the classes are separated)")
print(silhouette_score(res_pca, y_test))
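
# Not in the original script: a baseline sketch for comparison, scoring the
# true labels directly in the raw input space so the latent-space and PCA
# silhouettes above can be judged against it (assumes X_test / y_test as above).
print("Raw input-space silhouette:", silhouette_score(X_test, y_test))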
def silhouette(pairWisePointDistance, clusterLabels):
    # pairWisePointDistance: Array of pairwise distances between samples, or a feature array.
    # clusterLabels: Predicted labels for each sample.
    # Note: silhouette_score defaults to metric='euclidean', so a true pairwise
    # distance matrix should be passed together with metric='precomputed'.
    return silhouette_score(pairWisePointDistance, clusterLabels)
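
# Not from the original source: a sketch showing why the metric matters here.
# Passing a precomputed distance matrix without metric='precomputed' silently
# treats the matrix as a feature array and gives a different result.
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score

X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
labels = np.array([0, 0, 1, 1])
D = pairwise_distances(X)

print(silhouette_score(X, labels))                        # on the feature array
print(silhouette_score(D, labels, metric='precomputed'))  # same value, from distances
print(silhouette_score(D, labels))                        # different: D treated as features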
df_cluster[features] = scaler.fit_transform(df_cluster[features])
df_cluster.describe().transpose()

# Elbow method and silhouette analysis: determine the appropriate number of clusters K
inertias = {}
silhouettes = {}
for k in range(2, 11):
    kmeans = KMeans(init='k-means++', n_init=10, n_clusters=k,
                    max_iter=1000, random_state=42).fit(df_cluster)
    # Inertia: sum of squared distances of samples to their closest cluster center
    inertias[k] = kmeans.inertia_
    silhouettes[k] = silhouette_score(df_cluster, kmeans.labels_, metric='euclidean')

plt.figure()
plt.grid(True)
plt.plot(list(inertias.keys()), list(inertias.values()))
plt.title('K-Means, Elbow Method')
plt.xlabel("Number of clusters, K")
plt.ylabel("Inertia")

plt.figure()
plt.grid(True)
plt.plot(list(silhouettes.keys()), list(silhouettes.values()))
plt.title('K-Means, Silhouette Analysis')
plt.xlabel("Number of clusters, K")
plt.ylabel("Silhouette")
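
# Not in the original notebook: a small sketch that reads the best K straight
# from the silhouette dictionary built above (the elbow on the inertia curve
# still has to be judged visually).
best_k = max(silhouettes, key=silhouettes.get)
print("Best K by silhouette:", best_k, "score:", silhouettes[best_k])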
print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print() print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print( "Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_) ) print( "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000) ) print() if not opts.use_hashing: print("Top terms per cluster:") if opts.n_components: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] terms = vectorizer.get_feature_names()