def _print_consecutive_pair_scores(histograms, kind, ari_label, ami_label):
    """Print ARI and AMI for every pair of consecutive histograms.

    Each histogram is turned into a label sequence with ``tocluster(h, kind)``.
    Both sequences are cut to 90% of the shorter histogram's length before
    they are scored.
    """
    for prev_hist, cur_hist in zip(histograms, histograms[1:]):
        truncatedlen = int(min(len(prev_hist), len(cur_hist)) * 0.9)
        # Convert each histogram once and reuse the labels for both metrics.
        prev_labels = tocluster(prev_hist, kind)[:truncatedlen]
        cur_labels = tocluster(cur_hist, kind)[:truncatedlen]
        print("%s %s" % (ari_label,
                         adjusted_rand_score(prev_labels, cur_labels)))
        print("%s %s" % (ami_label,
                         adjusted_mutual_info_score(prev_labels, cur_labels)))


def adjusted_rand_index():
    """Print ARI/AMI between consecutive histogram partitions of two streams.

    First the "TextHistogramPartition" stream over a fixed list of log files
    is scored, then the "DictionaryHistogramPartition" stream read from
    ``Streaming_SetPartitionAnalytics.txt``. Output is produced with
    single-argument ``print(...)`` calls so the text is the same on
    Python 2 and Python 3.
    """
    # The text file is updated by a stream of data.
    # Alternative stream sources that were used previously:
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("file","StreamingData.txt")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
    inputf1 = Streaming_AbstractGenerator.StreamAbsGen(
        "TextHistogramPartition",
        ["/var/log/kern.log", "/var/log/syslog", "/var/log/ufw.log",
         "/var/log/dmesg", "/var/log/kern.log"])
    histograms = list(inputf1)

    ari = adjusted_rand_score(tocluster(histograms[0], "Text")[:20000],
                              tocluster(histograms[1], "Text")[:20000])
    print("Adjusted Rand Index of first two histogram set partitions(truncated): %s" % (ari,))
    _print_consecutive_pair_scores(
        histograms, "Text",
        "Adjusted Rand Index(truncated):",
        "Adjusted Mutual Info Index(truncated):")

    #################################################################
    inputf2 = Streaming_AbstractGenerator.StreamAbsGen(
        "DictionaryHistogramPartition", "Streaming_SetPartitionAnalytics.txt")
    histograms = list(inputf2)
    print("histograms: %s" % (histograms,))
    _print_consecutive_pair_scores(
        histograms, "Dict",
        "Adjusted Rand Index (truncated):",
        "Adjusted Mutual Info Index (truncated):")
def kmeans(input_file, n_clusters, Output):
    """Cluster a comma-separated data file with K-Means and report the results.

    Columns ``0 .. ncol-2`` are loaded, where ``ncol`` is the value returned
    by ``tools.file_col_coma(input_file)``. The first loaded column is used as
    the reference labels and the remaining loaded columns as features.

    Scores are printed and written to ``Output + "kmeans_scores.txt"``,
    followed by one line per sample. A scatter plot of the first two feature
    columns, with the cluster centres marked, is saved to
    ``Output + "kmeans.png"``.

    Args:
        input_file: Path of the comma-separated input file.
        n_clusters: Number of clusters passed to ``cluster.KMeans``.
        Output: Prefix prepended to both output file names.
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size = X.shape[0]

    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    labels = k_means.labels_

    # Each score is computed once and reused for the console and the file,
    # so both reports always agree.
    # Entries are (console label, file label, value).
    scores = [
        ('homogeneity_score', "Homogeneity Score",
         metrics.homogeneity_score(y, labels)),
        ('completeness_score', "Completeness Score",
         metrics.completeness_score(y, labels)),
        ('v_measure_score', "V-Measure",
         metrics.v_measure_score(y, labels)),
        ('adjusted_rand_score', "The adjusted Rand index",
         metrics.adjusted_rand_score(y, labels)),
        ('adjusted_mutual_info_score', "Adjusted Mutual Information",
         metrics.adjusted_mutual_info_score(y, labels)),
        ('silhouette_score', "Silhouette Score",
         metrics.silhouette_score(X, labels, metric='euclidean',
                                  sample_size=sample_size)),
    ]

    banner = "#########################################################################################################\n"
    print(banner)
    print("K-MEANS\n")
    for console_label, _, value in scores:
        print('%s: %f' % (console_label, value))
    print('\n')
    print(banner)

    results = Output + "kmeans_scores.txt"
    # The context manager closes the file even if a write fails.
    with open(results, "w") as score_file:
        score_file.write("K-Means Scores\n")
        for _, file_label, value in scores:
            score_file.write("%s: %f\n" % (file_label, value))
        score_file.write("\n")
        score_file.write("True Value, Cluster numbers, Iteration\n")
        for row, (true_value, label) in enumerate(zip(y, labels), 1):
            score_file.write("%f, %f, %i\n" % (true_value, label, row))

    # Plot the samples coloured by cluster, with the centres marked.
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for cluster_center in k_means.cluster_centers_:
        # The original also passed the format string 'w', which the explicit
        # color='b' keyword overrides; only the keyword is kept.
        ax.plot(cluster_center[0], cluster_center[1],
                color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def compute_cluster_metrics_raw(chains, cells):
    """Score each chain's cell assignment against the fine and coarse labels.

    For every chain, the assignment stored under
    ``chain['state']['domains']['d1']['assignment']`` is canonicalized and
    written into ``cells['cluster']`` (the caller's frame is modified). It is
    then compared with the canonicalized ``cells['type_id']`` and
    ``cells['coarse']`` columns.

    Args:
        chains: Iterable of chain dicts providing ``'state'`` and ``'scores'``.
        cells: DataFrame with ``'type_id'`` and ``'coarse'`` columns.

    Returns:
        A ``pandas.DataFrame`` with one row per chain holding the ARI, AMI and
        Jaccard values (fine and coarse), ``n11``, the per-cluster variances,
        the cluster count, the chain index, the chain's last score, and a
        snapshot of ``cells`` for that chain under ``'df'``.
    """
    all_chains = []
    for chain_i, chain in enumerate(chains):
        assignment = np.array(chain['state']['domains']['d1']['assignment'])
        ca = irm.util.canonicalize_assignment(assignment)
        cells['cluster'] = ca

        canon_true_fine = irm.util.canonicalize_assignment(cells['type_id'])
        canon_true_coarse = irm.util.canonicalize_assignment(cells['coarse'])

        ari = metrics.adjusted_rand_score(canon_true_fine, ca)
        ari_coarse = metrics.adjusted_rand_score(canon_true_coarse, ca)
        ami = metrics.adjusted_mutual_info_score(canon_true_fine, ca)
        ami_coarse = metrics.adjusted_mutual_info_score(canon_true_coarse, ca)
        jaccard = rand.compute_jaccard(canon_true_fine, ca)
        jaccard_coarse = rand.compute_jaccard(canon_true_coarse, ca)
        ss = rand.compute_similarity_stats(canon_true_fine, ca)

        # Per-cluster variance of every column.
        cluster_vars = cells.groupby('cluster').var()

        all_chains.append({
            'ari': ari,
            'ari_coarse': ari_coarse,
            'ami': ami,
            'ami_coarse': ami_coarse,
            'jaccard': jaccard,
            'jaccard_coarse': jaccard_coarse,
            'n11': ss['n11'],
            'vars': cluster_vars,
            'cluster_n': len(np.unique(cells['cluster'])),
            'chain_i': chain_i,
            'score': chain['scores'][-1],
            # Store a copy: `cells` is overwritten on the next iteration, so
            # storing the frame itself would leave every row pointing at the
            # last chain's assignment.
            'df': cells.copy(),
        })
    return pandas.DataFrame(all_chains)
def results(self, algo, hasgnc = False, filename="_"):
    """Run ``algo`` with and without edge weights and compare to ground truth.

    Each round runs ``algo`` on ``self.g`` twice: once with the edge weights
    clipped at zero and once unweighted. Both clusterings are written with
    ``self.write_vertex_clustering``. When ``hasgnc`` is true, ten rounds are
    run, the cluster ids are stored on the vertices, and both clusterings are
    scored against the ``"comm"`` vertex attribute with AMI and ARI.

    Args:
        algo: Callable taking the graph and, optionally, ``weights=``.
        hasgnc: Whether the graph has a ``"comm"`` vertex attribute to
            compare against.
        filename: Suffix used in the names given to
            ``write_vertex_clustering``.

    Returns:
        List of per-round AMI differences (weighted minus unweighted); empty
        when ``hasgnc`` is false.
    """
    def as_vertex_clustering(result):
        # Results exposing as_clustering() are converted; anything else is
        # assumed to already be a vertex clustering and is used as is.
        try:
            return result.as_clustering()
        except AttributeError:
            return result

    def tag_vertices(clustering, attribute):
        # Store each vertex's cluster index (as a string) on the vertex.
        for cluster_id, members in enumerate(clustering):
            for vertex in members:
                self.g.vs[vertex][attribute] = str(cluster_id)

    AMI_increase = []
    ARI_increase = []
    rounds = 10 if hasgnc else 1
    print("Runing  %s for %s rounds" % (algo.__name__, rounds))

    for _ in range(rounds):
        clipped_weights = [max(w, 0) for w in self.g.es["weight"]]
        vc = as_vertex_clustering(algo(self.g, weights=clipped_weights))
        self.write_vertex_clustering(vc, "_weighted%s" % filename)
        if hasgnc:
            tag_vertices(vc, "fastgreedy_withweight")

        vc = as_vertex_clustering(algo(self.g))
        self.write_vertex_clustering(vc, "_unweighted%s" % filename)
        if hasgnc:
            tag_vertices(vc, "fastgreedy_withoutweight")

        if hasgnc:
            truth = self.g.vs["comm"]
            weighted = self.g.vs["fastgreedy_withweight"]
            unweighted = self.g.vs["fastgreedy_withoutweight"]
            ami_weight = metrics.adjusted_mutual_info_score(weighted, truth)
            ari_weight = metrics.adjusted_rand_score(weighted, truth)
            ami_unweight = metrics.adjusted_mutual_info_score(unweighted, truth)
            ari_unweight = metrics.adjusted_rand_score(unweighted, truth)
            AMI_increase.append(ami_weight - ami_unweight)
            ARI_increase.append(ari_weight - ari_unweight)

    if hasgnc:
        # Single-argument print calls emit the same text on Python 2 and 3.
        print("Adjusted Mutual Information increases by %s"
              % (1.0 * sum(AMI_increase) / len(AMI_increase),))
        print("Adjusted Rand index increases by %s"
              % (1.0 * sum(ARI_increase) / len(ARI_increase),))
        print("-" * 20)
    return AMI_increase
def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""):
    """Run FSP and SSP prediction chunk by chunk and optionally score them.

    The interval ``[d_start, d_end)`` is processed in chunks of one hour (the
    last chunk is shortened to end at ``d_end``). For each chunk the results
    of ``self.predict_fsp`` and ``self.predict_ssp(..., update=True)`` are
    appended to running lists.

    Returns:
        When ``gb.ARTIFICIAL`` is true: a pair whose first element holds the
        (FSP, SSP) values of ARI, AMI, homogeneity, completeness and
        V-measure against the labels read from ``self.sigReaders[0]``, and
        whose second element is ``(times_fsp, axes_fsp, labels_fsp,
        times_ssp, axes_ssp, labels_ssp)``. Otherwise ``(0., 0.)``.
    """
    print("\n --------- tracking ...")

    times_fsp, axes_fsp, labels_fsp = [], [], []
    times_ssp, axes_ssp, labels_ssp = [], [], []

    # Chunk length; each chunk covers this many milliseconds.
    step = datetime.timedelta(milliseconds=60 * 60 * 1000)

    cursor = d_start
    while cursor < d_end:
        if cursor + step >= d_end:
            step = d_end - cursor

        times, axes, labels = self.predict_fsp(d_start=cursor, d_end=cursor + step)
        times_fsp.extend(times)
        axes_fsp.extend(axes)
        labels_fsp.extend(labels)

        times, axes, labels = self.predict_ssp(d_start=cursor, d_end=cursor + step, update=True)
        times_ssp.extend(times)
        axes_ssp.extend(axes)
        labels_ssp.extend(labels)

        cursor += step

    # ----------------------------
    if not gb.ARTIFICIAL:
        return 0., 0.

    times, values, true_labels = self.sigReaders[0].getSignal(
        start=d_start, end=d_end, dated=gb.DATED, get_modes=True)

    ari_fps = adjusted_rand_score(true_labels, labels_fsp)
    ari_sps = adjusted_rand_score(true_labels, labels_ssp)
    ami_fps = adjusted_mutual_info_score(true_labels, labels_fsp)
    ami_sps = adjusted_mutual_info_score(true_labels, labels_ssp)
    ho_fps, com_fps, vm_fps = homogeneity_completeness_v_measure(true_labels, labels_fsp)
    ho_sps, com_sps, vm_sps = homogeneity_completeness_v_measure(true_labels, labels_ssp)

    ari_pair = (ari_fps, ari_sps)
    ami_pair = (ami_fps, ami_sps)
    ho_pair = (ho_fps, ho_sps)
    com_pair = (com_fps, com_sps)
    vm_pair = (vm_fps, vm_sps)

    print("---------------------------------------------------")
    print("adjusted_rand_score \t (ari_fps, ari_sps)", ari_pair)
    print("adjusted_mutual_info \t (ami_fps, ami_sps)", ami_pair)
    print("homogeneity \t (ho_fps, ho_sps)", ho_pair)
    print("completeness \t (com_fps, com_sps)", com_pair)
    print("v_measure \t (vm_fps, vm_sps)", vm_pair)

    series = (times_fsp, axes_fsp, labels_fsp, times_ssp, axes_ssp, labels_ssp)
    return (ari_pair, ami_pair, ho_pair, com_pair, vm_pair), series
def kmeans_setup(data):
    """Fit a KMeans estimator on ``data`` using module-level settings.

    Reads the globals ``pca_f``, ``num_clusters``, ``num_init``,
    ``num_iterations``, ``debug`` and ``labels``. When ``pca_f == 1`` the
    components of a PCA fitted on ``data`` are used as initial centres;
    otherwise ``'k-means++'`` initialisation is used. When ``debug`` equals
    ``True`` one line of timing and clustering scores is printed.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    if pca_f == 1:
        name = 'PCA'
        initializer = PCA(n_components=num_clusters).fit(data).components_
    else:
        name = 'k-means++'
        initializer = 'k-means++'

    t0 = time()
    estimator = KMeans(init=initializer,
                       n_clusters=num_clusters,
                       n_init=num_init,
                       max_iter=num_iterations)
    estimator.fit(data)

    if debug == True:
        sample_size = 300
        predicted = estimator.labels_
        row = (
            name,
            (time() - t0),
            estimator.inertia_,
            metrics.homogeneity_score(labels, predicted),
            metrics.completeness_score(labels, predicted),
            metrics.v_measure_score(labels, predicted),
            metrics.adjusted_rand_score(labels, predicted),
            metrics.adjusted_mutual_info_score(labels, predicted),
            metrics.silhouette_score(data, predicted, metric='euclidean',
                                     sample_size=sample_size),
        )
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % row)
    return estimator
def affin_test():
    """Cluster part of the pickled training data with Affinity Propagation.

    Loads ``(x_train, y_train, t1)`` from ``traindata.pkl``, keeps the
    training side of a ``test_size=0.9`` split, standardises the features,
    fits ``AffinityPropagation(preference=-50)`` and prints the number of
    clusters together with the clustering scores.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load
    # trusted files here.
    # The context manager closes the file even if loading raises.
    with open('traindata.pkl', 'rb') as savefile:
        (x_train, y_train, _) = cPickle.load(savefile)

    x_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)
    labels_true = y_train
    x_train = StandardScaler().fit_transform(x_train)

    af = AffinityPropagation(preference=-50).fit(x_train)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
def evaluate(self):
    """Print ARI, AMI and NMI between ``self.labels`` and ``self.pred``.

    Each score is rounded to four decimals and printed with four decimals.
    """
    scorers = (
        ("Adjusted Rand index:", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information:", metrics.adjusted_mutual_info_score),
        ("Normalized Mutual Information:", metrics.normalized_mutual_info_score),
    )
    # Compute every score before printing anything.
    rounded = [(caption, round(scorer(self.labels, self.pred), 4))
               for caption, scorer in scorers]
    for caption, value in rounded:
        print(caption, "%.4f" % value)
def predictAffinityPropagation(X, labels_true):
    """Fit Affinity Propagation on ``X``, print scores and plot the clusters.

    Prints the number of clusters and the clustering scores against
    ``labels_true``. Then plots the first two columns of ``X``: every cluster
    gets its own colour, its exemplar is drawn as a large circle and a line
    joins the exemplar to each of its members. The figure is shown with
    ``plt.show()``.
    """
    #ranX, ranY = shuffle(X, y, random_state=0)
    model = AffinityPropagation(preference=-50).fit(X)
    exemplar_indices = model.cluster_centers_indices_
    labels = model.labels_
    n_clusters_ = len(exemplar_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    plt.close('all')
    plt.figure(1)
    plt.clf()

    palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), palette):
        in_cluster = labels == k
        center = X[exemplar_indices[k]]
        plt.plot(X[in_cluster, 0], X[in_cluster, 1], col + '.')
        plt.plot(center[0], center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        for member in X[in_cluster]:
            plt.plot([center[0], member[0]], [center[1], member[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def bench_k_means(estimator, name, data, sample_size, labels,postIds):
    """Reduce ``data`` with LSA, fit ``estimator`` and export the clusters.

    The data is converted to CSR, reduced with ``TruncatedSVD(500)`` and
    normalised. ``estimator`` is then fitted and one line of scores against
    ``labels`` is printed. The time shown on that line covers the fit only;
    the LSA time is reported on its own "done in" line. Finally the user file
    is parsed and the clusters for ``postIds`` are extracted and written out
    through ``parseUserFile``, ``extractCluster`` and ``writeCluterToFile``.

    Args:
        estimator: Clustering estimator exposing ``fit``, ``inertia_`` and
            ``labels_``.
        name: Label printed at the start of the score line.
        data: Input matrix, converted with ``sparse.csr_matrix``.
        sample_size: Unused; kept for interface compatibility.
        labels: Reference labels used by the scores.
        postIds: Passed to ``extractCluster`` with the fitted labels.
    """
    data = sparse.csr_matrix(data)

    print("Performing dimensionality reduction using LSA")
    lsa_start = time()
    lsa = TruncatedSVD(500)
    data = lsa.fit_transform(data)
    data = Normalizer(copy=False).fit_transform(data)
    print("done in %fs" % (time() - lsa_start))
    print()

    # Time the fit on its own so the score line does not include LSA time.
    fit_start = time()
    estimator.fit(data)
    fit_seconds = time() - fit_start

    predicted = estimator.labels_
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f '
          % (name, fit_seconds, estimator.inertia_,
             metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted)))

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict = extractCluster(postIds, estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
def bench_k_means(estimator, name, data, target_labels, sample_size):
    """Benchmark a K-Means estimator and print one tab-separated result row.

    The row contains the name, the elapsed time, the inertia, the clustering
    scores against ``target_labels``, the silhouette coefficient and the
    train accuracy returned by ``compute_residuals_and_rsquared``.

    ARGS
        estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
        name: estimator name <str>
        data: array-like or sparse matrix, shape=(n_samples, n_features)
        target_labels: labels of data points <number array>
        sample_size: size of the sample to use when computing the
            Silhouette Coefficient <int>
    """
    t0 = time()
    estimator.fit(data)
    train_accuracy = compute_residuals_and_rsquared(estimator.labels_, target_labels)[2]

    row_format = '% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
    row = (
        name,
        (time() - t0),
        estimator.inertia_,
        metrics.homogeneity_score(target_labels, estimator.labels_),
        metrics.completeness_score(target_labels, estimator.labels_),
        metrics.v_measure_score(target_labels, estimator.labels_),
        metrics.adjusted_rand_score(target_labels, estimator.labels_),
        metrics.adjusted_mutual_info_score(target_labels, estimator.labels_),
        metrics.silhouette_score(data, estimator.labels_,
                                 metric='euclidean', sample_size=sample_size),
        train_accuracy,
    )
    print(row_format % row)
def my_clustering(X, y, n_clusters, pca):
    """Cluster ``X`` with KMeans and score the result against ``y``.

    After scoring, ``show_images(n_clusters, clf, pca)`` is called with the
    fitted estimator.

    Returns:
        ``[ari, ami, v_measure, silhouette]``: adjusted Rand index, adjusted
        mutual information and V-measure against ``y``, followed by the
        silhouette coefficient of ``X`` under the fitted labels.
    """
    from sklearn.cluster import KMeans

    model = KMeans(n_clusters)
    model.fit(X)

    from sklearn import metrics

    assigned = model.labels_
    scores = [
        metrics.adjusted_rand_score(y, assigned),
        metrics.adjusted_mutual_info_score(y, assigned),
        metrics.v_measure_score(y, assigned),
        metrics.silhouette_score(X, assigned),
    ]

    show_images(n_clusters, model, pca)
    return scores
def bench_k_means(estimator, data, labels):
    """Fit ``estimator`` on ``data``, print its scores and return them.

    Args:
        estimator: Clustering estimator exposing ``fit`` and ``labels_``.
        data: Samples to cluster.
        labels: Reference labels used by the supervised scores.

    Returns:
        ``[homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info, silhouette]``.
    """
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))

    predicted = estimator.labels_
    homogeneity = metrics.homogeneity_score(labels, predicted)
    completeness = metrics.completeness_score(labels, predicted)
    v_measure = metrics.v_measure_score(labels, predicted)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogeneity, completeness, v_measure))

    adj_rand_score = metrics.adjusted_rand_score(labels, predicted)
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(labels, predicted)
    print("adjusted_mutual_info_score {:.5}".format(adj_mutual_info_score))

    # The silhouette score needs pairwise distances; compute it once and
    # reuse the value for both the printout and the return value.
    silhouette_score = metrics.silhouette_score(data, predicted,
                                                metric='euclidean')
    print("silhouette_score {:.5}".format(silhouette_score))

    return [homogeneity, completeness, v_measure,
            adj_rand_score, adj_mutual_info_score, silhouette_score]
def intersubjectconsensus(base_dir=r'/nfs/h1/workingshop/huanglijie/uni_mul_analysis',
                          n_clusters=60):
    """Compute inter-subjects clustering consensus.

    For every cluster number ``n`` from 2 to ``n_clusters`` the file
    ``merged_cluster_<n>.nii.gz`` is loaded. Each subject's volume (an index
    along the fourth axis) is restricted to the non-zero voxels of the mask,
    and the adjusted mutual information between every pair of subjects is
    written to ``consensus_<n>.csv``.

    Args:
        base_dir: Analysis root containing ``multivariate/detection``.
        n_clusters: Largest cluster number to process.
    """
    detection_dir = os.path.join(base_dir, 'multivariate', 'detection')
    db_dir = os.path.join(detection_dir, 'mvpcluster')
    mask_file = os.path.join(detection_dir, 'mask.nii.gz')
    mask = nib.load(mask_file).get_data()
    # The mask does not change, so its voxel indices are computed once.
    in_mask = np.nonzero(mask)

    for n in range(2, n_clusters + 1):
        merged_file = os.path.join(db_dir, 'merged_cluster_' + str(n) + '.nii.gz')
        merged_data = nib.load(merged_file).get_data()
        n_subjs = merged_data.shape[3]

        # Extract each subject's in-mask label vector once instead of once
        # per pair.
        subject_labels = [merged_data[..., i][in_mask] for i in range(n_subjs)]

        mtx = np.zeros((n_subjs, n_subjs))
        for i, vtr_i in enumerate(subject_labels):
            for j, vtr_j in enumerate(subject_labels):
                mtx[i, j] = metrics.adjusted_mutual_info_score(vtr_i, vtr_j)

        outfile = os.path.join(db_dir, 'consensus_' + str(n) + '.csv')
        np.savetxt(outfile, mtx, delimiter=',')
def cluster(Z, K=4, algo='kmeans', labels_true=None):
    """Impute missing values in ``Z`` and cluster it with DBSCAN or KMeans.

    Args:
        Z: DataFrame of features; it is passed through
            ``Imputer().fit_transform`` before clustering.
        K: Number of clusters for the ``'kmeans'`` algorithm.
        algo: ``'dbscan'`` or ``'kmeans'``.
        labels_true: Optional reference labels. When given, the DBSCAN branch
            also prints the scores that need ground truth. The original code
            referenced this name without defining it, so that branch always
            raised ``NameError``.

    Returns:
        The fitted estimator, or ``None`` when ``algo`` is not recognised.
    """
    X = Imputer().fit_transform(Z)

    ##############################################################################
    if algo == 'dbscan':
        # Compute DBSCAN
        db = DBSCAN(eps=0.3, min_samples=10).fit(X)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        print('Estimated number of clusters: %d' % n_clusters_)
        if labels_true is not None:
            print("Homogeneity: %0.3f"
                  % metrics.homogeneity_score(labels_true, labels))
            print("Completeness: %0.3f"
                  % metrics.completeness_score(labels_true, labels))
            print("V-measure: %0.3f"
                  % metrics.v_measure_score(labels_true, labels))
            print("Adjusted Rand Index: %0.3f"
                  % metrics.adjusted_rand_score(labels_true, labels))
            print("Adjusted Mutual Information: %0.3f"
                  % metrics.adjusted_mutual_info_score(labels_true, labels))
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, labels))
        return db

    if algo == 'kmeans':
        km = KMeans(n_clusters=K)
        km.fit(X)
        print(km.labels_)
        return km

    # Unknown algorithm names keep the original behaviour of returning None.
    return None
def cluster(model, uids):
    """Cluster document vectors with Affinity Propagation and print scores.

    The vectors ``model.docvecs[uid]`` for every ``uid`` are clustered with
    ``AffinityPropagation(preference=-50)``. The fitted estimator is pickled
    to ``data/af.pick`` and the clustering scores are printed, using ``uids``
    themselves as the reference labels.

    Args:
        model: Object exposing a ``docvecs`` mapping from uid to vector.
        uids: Identifiers of the documents to cluster.
    """
    ##############################################################################
    # Generate sample data
    X = [model.docvecs[uid] for uid in uids]
    labels_true = uids

    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    # Pickle data is binary: on Python 3 a text-mode handle makes pickle.dump
    # raise TypeError. The context manager also closes the file, which the
    # original never did.
    with open('data/af.pick', 'wb') as fh:
        pickle.dump(af, fh)

    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster a distance matrix with Ward linkage and print quality scores.

    ``D`` is converted to a similarity matrix ``S = 1 - D``, which is what
    ``Ward`` is fitted on. The silhouette coefficient is computed on ``D``
    itself with ``metric='precomputed'``. Output uses single-argument
    ``print(...)`` calls so the text is the same on Python 2 and Python 3.

    Args:
        D: Distance matrix.
        y_true: Reference labels.
        n_clusters: Number of clusters passed to ``Ward``.
        eps: Unused; belonged to the disabled DBSCAN variant.
        min_samples: Unused; belonged to the disabled DBSCAN variant.
    """
    ##############################################################################
    # Extract Y true
    labels_true = y_true

    ##############################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D

    ##############################################################################
    # compute clustering (the DBSCAN alternative is disabled)
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Number of clusters: %d' % n_clusters_)
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(labels_true, labels))
    print('V-meassure: %0.3f' % metrics.v_measure_score(labels_true, labels))
    print('Adjusted Rand Index: %0.3f'
          % metrics.adjusted_rand_score(labels_true, labels))
    print('Adjusted Mutual Information: %0.3f'
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print('Silhouette Coefficient: %0.3f'
          % metrics.silhouette_score(D, labels, metric='precomputed'))
def run_clustering( clusterer, data, labels ):
    """
    Fit a configured clustering algorithm on a dataset, then log the fit
    time and the clustering scores against a set of ground-truth labels.

    clusterer: the clustering algorithm, from sklearn
    data: array-like dataset input
    labels: vector of ground-truth labels
    """
    # Time the fit.
    started = time()
    clusterer.fit(data)
    finished = time()

    # Compute every metric before anything is logged.
    report = [
        (" |- Execution time: %fs", finished - started),
        (" |- Homogeneity: %0.3f",
         metrics.homogeneity_score(labels, clusterer.labels_)),
        (" |- Completeness: %0.3f",
         metrics.completeness_score(labels, clusterer.labels_)),
        (" |- V-measure: %0.3f",
         metrics.v_measure_score(labels, clusterer.labels_)),
        (" |- Adjusted Rand-Index: %.3f",
         metrics.adjusted_rand_score(labels, clusterer.labels_)),
        (" |- Adjusted Mutual Info: %.3f",
         metrics.adjusted_mutual_info_score(labels, clusterer.labels_)),
    ]

    # Output to logs.
    for template, value in report:
        logging.info(template % value)
def get_result(km, labels):
    """Score the fitted labels of ``km`` against ``labels``.

    Returns:
        Tuple ``(homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info)``.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels, km.labels_) for scorer in scorers)
def get_constant_height_labels(clustering, n_clusters=None):
    """Cut a linkage into flat clusters, choosing the cut by silhouette score.

    Args:
        clustering: dict with the keys ``'linkage'`` (passed to ``cut_tree``),
            ``'distance_df'`` (precomputed distances used by the silhouette
            score), ``'reorder_vec'`` (its length bounds the candidate
            cluster counts) and ``'labels'`` (compared with the result).
        n_clusters: int (optional). If given, the tree is cut into exactly
            this many clusters. Otherwise every count in
            ``range(2, len(reorder_vec) // 3)`` is scored and the one with
            the highest silhouette score is used.

    Returns:
        ``(labels, scores, MI)``: the reordered labels, the list of
        ``(n_clusters, silhouette_score)`` pairs that were evaluated, and the
        adjusted mutual information between the new labels and
        ``clustering['labels']``.

    Raises:
        ValueError: if ``n_clusters`` is None and no candidate cluster count
            could be scored.
    """
    N_variables = len(clustering['reorder_vec'])
    linkage = clustering['linkage']
    distances = clustering['distance_df']
    scores = []

    if n_clusters is None:
        for k_clusters in range(2, N_variables // 3):
            candidate = cut_tree(linkage, n_clusters=k_clusters)
            try:
                score = silhouette_score(distances, candidate.ravel(),
                                         metric='precomputed')
            except ValueError:
                # This cut cannot be scored; skip it.
                continue
            scores.append((k_clusters, score))
        if not scores:
            # The original failed here with an opaque "max() arg is an empty
            # sequence"; raise the same exception type with a useful message.
            raise ValueError(
                'no candidate number of clusters could be scored '
                '(N_variables=%d)' % N_variables)
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(linkage, n_clusters=best_k)
    else:
        labels = cut_tree(linkage, n_clusters=n_clusters)
        # Flatten the labels here too, as the search branch does.
        score = silhouette_score(distances, labels.ravel(),
                                 metric='precomputed')
        scores.append((n_clusters, score))

    labels = reorder_labels(labels.flatten(), linkage)
    # comparison
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
def compareClusters(labelsA, labelsB, method='ARI', alignFirst=True, useCommon=False):
    """Compare two cluster labelings that share the same index.

    Args:
        labelsA: pandas Series of cluster labels.
        labelsB: pandas Series of cluster labels. It must have the same index
            and the same set of label values as ``labelsA``.
        method: ``'ARI'`` (adjusted Rand index), ``'AMI'`` (adjusted mutual
            information) or ``'overlap'`` (per-label mean of the fraction of
            A's members that are in B and the fraction of B's members that
            are in A).
        alignFirst: If true, ``labelsB`` is first passed through
            ``alignClusters(labelsA, labelsB)``.
        useCommon: If true, both series are first restricted to their common
            index.

    Returns:
        A float for ``'ARI'``/``'AMI'``; for ``'overlap'`` an array with one
        value per unique label, in sorted label order.

    Raises:
        AssertionError: if the indexes or the sets of labels differ.
        ValueError: if ``method`` is not one of the supported names.
    """
    if useCommon:
        labelsA, labelsB = labelsA.align(labelsB, join='inner')
    assert len(labelsA.index) == len(labelsB.index)
    assert (labelsA.index == labelsB.index).sum() == len(labelsA.index)
    uLabels = np.unique(labelsA)
    # array_equal also copes with a differing number of unique labels, where
    # an elementwise == does not produce a boolean array.
    assert np.array_equal(uLabels, np.unique(labelsB))

    alignedB = alignClusters(labelsA, labelsB) if alignFirst else labelsB

    if method == 'ARI':
        s = metrics.adjusted_rand_score(labelsA.values, alignedB.values)
    elif method == 'AMI':
        s = metrics.adjusted_mutual_info_score(labelsA.values, alignedB.values)
    elif method == 'overlap':
        s = np.zeros(uLabels.shape[0])
        for labi, lab in enumerate(uLabels):
            membersA = labelsA.index[labelsA == lab]
            membersB = alignedB.index[alignedB == lab]
            # Fraction of each side's members that the other side also has.
            # An empty side gives nan, as in the original.
            accA = membersA.isin(membersB).mean()
            accB = membersB.isin(membersA).mean()
            s[labi] = (accA + accB) / 2
    else:
        # Previously this fell through to `return s` with s unbound.
        raise ValueError("unknown method %r; expected 'ARI', 'AMI' or 'overlap'"
                         % (method,))
    return s
def drawlableCLuster(yyaxis,twitterlabel,cityname):
    """Cluster ``yyaxis`` with DBSCAN, save a scatter plot and return scores.

    Core samples are drawn with large markers and the other members of a
    cluster with small ones; noise (label ``-1``) is drawn in black. The plot
    is saved under ``./clusterimage/hourcondimention/`` with ``cityname`` in
    the file name.

    Args:
        yyaxis: 2-D array of samples; the first two columns are plotted.
        twitterlabel: Reference labels for the adjusted mutual information.
        cityname: Suffix used in the image file name.

    Returns:
        ``[n_clusters, silhouette_coefficient, adjusted_mutual_information]``.
    """
    ##############################################################################
    # Compute DBSCAN
    X = yyaxis
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Compute each score once; the same values are printed and returned.
    silhouette = metrics.silhouette_score(X, labels)
    ami = metrics.adjusted_mutual_info_score(twitterlabel, labels)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Adjusted Mutual Information: %0.3f" % ami)

    ##############################################################################
    # Plot result
    matplotlib.style.use('ggplot')
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    imgname = "./clusterimage/hourcondimention/" + "hour_dimention_twitterinfo_" + cityname + '.png'
    fig = plt.gcf()
    fig.set_size_inches(16.5, 12.5)
    fig.savefig(imgname)
    # Close the figure so a later call starts from an empty canvas instead of
    # drawing on top of this city's points.
    plt.close(fig)
    # plt.show()
    return [n_clusters_, silhouette, ami]
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):
    """Plot how similar the clustering solutions in ``results.HCA`` are.

    Two square tables indexed by clustering name are filled for every pair of
    clusterings: the correlation between their condensed distance matrices,
    and the agreement of their labels. In the label table the adjusted Rand
    score is stored at ``[name1, name2]`` and the adjusted mutual information
    at ``[name2, name1]``. Both tables are drawn as heatmaps.

    Args:
        results: Object whose ``HCA`` attribute provides ``results`` (a
            mapping from name to clustering dict) and ``dist_metric``.
        plot_dir: If not None, both figures are saved there and then closed.
        verbose: If true, print the correlation between the Rand and MI
            scores.
        ext: File extension used for the saved figures.
    """
    HCA = results.HCA
    # get all clustering solutions
    clusterings = HCA.results.items()
    names = [key for key, _ in clusterings]
    n_solutions = len(clusterings)

    # agreement across embedding spaces
    cluster_similarity = pd.DataFrame(np.zeros((n_solutions, n_solutions)),
                                      index=names, columns=names)
    distance_similarity = pd.DataFrame(np.zeros((n_solutions, n_solutions)),
                                       index=names, columns=names)

    for (key1, solution1), (key2, solution2) in combinations(clusterings, 2):
        name1 = key1.split('-')[-1]
        name2 = key2.split('-')[-1]

        # record similarity of distance_df
        condensed1 = squareform(solution1['distance_df'])
        condensed2 = squareform(solution2['distance_df'])
        dist_corr = np.corrcoef(condensed1, condensed2)[1, 0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr

        # record similarity of clustering of dendrogram
        clusters1 = solution1['labels']
        clusters2 = solution2['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score

    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand',
                  y=1.02)

        dist_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric,
                  y=1.02)

    if plot_dir is not None:
        for figure, stem in ((clust_fig, 'cluster_similarity_across_measures.%s'),
                             (dist_fig, 'distance_similarity_across_measures.%s')):
            save_figure(figure, path.join(plot_dir, stem % ext),
                        {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)

    if verbose:
        # assess relationship between two measurements
        upper = np.triu_indices_from(cluster_similarity, k=1)
        rand_scores = cluster_similarity.values[upper]
        MI_scores = cluster_similarity.T.values[upper]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0, 1]
        print('Correlation between measures of cluster consistency: %.2f'
              % score_consistency)
def evaluate(labels_true, labels):
    """Score a labeling against the reference labels.

    Returns:
        Tuple ``(homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info)``.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
        # The silhouette score is not computed: it needs the data, which is
        # not passed to this function.
    )
    return tuple(scorer(labels_true, labels) for scorer in scorers)
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    """Print clustering scores for ``labels``.

    The scores that need reference labels are computed against
    ``clusterTrainClass``. The silhouette coefficient is computed on
    ``clusterTestStory`` with the euclidean metric. Output uses
    single-argument ``print(...)`` calls so the text is the same on Python 2
    and Python 3.
    """
    print("Homogeneity: %0.3f"
          % metrics.homogeneity_score(clusterTrainClass, labels))
    print("Completeness: %0.3f"
          % metrics.completeness_score(clusterTrainClass, labels))
    print("V-measure: %0.3f"
          % metrics.v_measure_score(clusterTrainClass, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(clusterTrainClass, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
def ami_score_op(s, s_hat):
    """Compute one adjusted mutual information score per index of axis 1.

    For each index ``i`` along axis 1, the labels are the argmax over axis 0
    of ``s[:, i, :]`` and of ``s_hat[:, i, :]``. Only the positions where the
    maximum of ``s[:, i, :]`` over axis 0 exceeds 0.9 are scored.

    Returns:
        ``float32`` array with ``s.shape[1]`` scores.
    """
    def score_slice(i):
        reference = s[:, i, :]
        true_labels = reference.argmax(0)
        confident = reference.max(0) > 0.9
        pred_labels = s_hat[:, i, :].argmax(0)
        return adjusted_mutual_info_score(true_labels[confident],
                                          pred_labels[confident])

    return np.array([score_slice(i) for i in range(s.shape[1])],
                    dtype=np.float32)
def cluseval(label, truth):
    """Score a labeling against the ground truth.

    Args:
        label: Predicted cluster labels.
        truth: Reference labels.

    Returns:
        ``[adjusted_rand, adjusted_mutual_info, homogeneity, completeness,
        v_measure]``.
    """
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    # The original bound this value to `h**o`, which is not a valid
    # assignment target and made the whole function a SyntaxError.
    homogeneity = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    return [rand, mutual, homogeneity, complete, v]
def compare_clusters(self):
    """Compare the original clustering with the consensus clustering.

    Returns:
        The adjusted mutual information between the two label sets, or
        ``None`` (after printing a hint) when no consensus clustering has
        been computed yet.
    """
    consensus = self.consensus_clustering
    if consensus is None:
        print("First run consensusCluster!")
        return None
    return adjusted_mutual_info_score(self.orig_clustering['labels'],
                                      consensus['labels'])
def main():
    """Command-line entry point: spectral clustering of a saved similarity matrix.

    Loads the matrix named by ``--sm`` with ``np.load``, optionally normalizes
    each row by its maximum, symmetrizes it, clusters it with
    ``SpectralClustering`` and writes one label per line to ``--output``.
    When ``--true_labels`` is given, evaluation metrics are printed as well.
    """
    parser = argparse.ArgumentParser(
        usage=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int,
                        help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0,
                        help='Number of nearest neighbors, 0 means all.')
    parser.add_argument("--sm", "-s",
                        help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
                        help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t",
                        help='File containing the true labels.')
    parser.add_argument("--output", "-o",
                        help='Name of the file to write the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true',
                        help='Normalize each row so that the max value is one.')
    args = parser.parse_args()

    similarity = np.load(args.sm)
    if args.normalize:
        similarity /= similarity.max(axis=1)[:, np.newaxis]
    # Ensure symmetric
    similarity = (similarity + similarity.T) / 2

    # A positive --knn selects the nearest-neighbours affinity; otherwise the
    # matrix itself is used as a precomputed affinity.
    if args.knn > 0:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='nearest_neighbors',
                                       n_neighbors=args.knn,
                                       n_init=args.iterations)
    else:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='precomputed',
                                       n_init=args.iterations)
    labels = clusterer.fit(similarity).labels_

    with open(args.output, 'w') as fout:
        fout.writelines(str(assigned) + '\n' for assigned in labels)

    if not args.true_labels:
        return

    # Load the true labels.
    with open(args.true_labels, 'r') as fin:
        true_labels = [int(row.strip()) for row in fin]

    # Run the metrics.
    label_metrics = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
    )
    for template, scorer in label_metrics:
        print(template % scorer(true_labels, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(similarity, labels))
def evaluateAllAlgorithms(self):
    """Print external clustering metrics for every stored labelling.

    Compares ``self.labels_db`` (reported as ``DBASE``) and ``self.labels_ap``
    (reported as ``AP``) with the ground truth in ``self.labels_gt``.
    """
    algs = [self.labels_db, self.labels_ap]
    # The previous list name was mangled into `t**s`, which is not a valid
    # identifier.
    names = ['DBASE', 'AP']
    for name, predicted in zip(names, algs):
        # One formatted argument prints identically on Python 2 and 3.
        print('Algorithm: %s' % name)
        print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, predicted))
        print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, predicted))
        print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, predicted))
        print("\tAdjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(self.labels_gt, predicted))
        print("\tAdjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(self.labels_gt, predicted))
def ami_score(U, V):
    """Return the adjusted mutual information between labellings ``U`` and ``V``."""
    score = metrics.adjusted_mutual_info_score(U, V)
    return score
def calcMaxState(G_data, B_data, name, encoder):
    """Find the KMeans ``random_state`` that best reproduces the ground truth.

    KMeans is fitted on ``encoder.detach().numpy()`` once per candidate seed
    and the seed with the highest adjusted mutual information (arithmetic
    averaging) against the ground-truth groups is returned.

    Args:
        G_data: graph passed to ``get_clusters``; for names other than
            ``'karate'`` and ``'email'`` its ``'value'`` node attribute
            provides the ground truth.
        B_data: not used by this function.
        name: dataset name. ``'karate'`` and ``'email'`` use the hard-coded
            ground truth below; ``'email'`` also limits the search to 300
            seeds instead of 1001.
        encoder: object whose ``.detach().numpy()`` yields the samples to
            cluster (presumably a torch tensor -- confirm against the caller).

    Returns:
        The best seed found. Stays 0 when no seed scores above 0.
    """
    index = 0
    max_value = 0

    # Exact comparison. The previous `name not in 'email'` was a substring
    # test, so names such as '' or 'mail' were silently treated as 'email'.
    iterations = 300 if name == 'email' else 1001

    # The samples and the ground truth do not depend on the seed, so they are
    # built once instead of on every iteration.
    B_data_X = encoder.detach().numpy()

    # Finding truth values
    if name == 'karate':
        c_groups = [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    elif name == 'email':
        c_groups = [
            1, 1, 21, 21, 21, 25, 25, 14, 14, 14, 9, 14, 14, 26, 4, 17, 34, 1, 1, 14, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 5, 34, 14, 14, 17, 17, 10, 10, 36, 37, 5, 7, 4, 22, 22, 21, 21, 21, 21, 7, 7, 36, 21, 25, 4, 8, 15, 15, 15, 37, 37, 9, 1, 1, 10, 10, 3, 3, 3, 29, 15, 36, 36, 37, 1, 36, 34, 20, 20, 8, 15, 9, 4, 5, 4, 20, 16, 16, 16, 16, 16, 38, 7, 7, 34, 38, 36, 8, 27, 8, 8, 8, 10, 10, 13, 13, 6, 26, 10, 1, 36, 0, 13, 16, 16, 22, 6, 5, 4, 0, 28, 28, 4, 2, 13, 13, 21, 21, 17, 17, 14, 36, 8, 40, 35, 15, 23, 0, 0, 7, 10, 37, 27, 35, 35, 0, 0, 19, 19, 36, 14, 37, 24, 17, 13, 36, 4, 4, 13, 13, 10, 4, 38, 32, 32, 4, 1, 0, 0, 0, 7, 7, 4, 15, 16, 40, 15, 15, 15, 15, 0, 21, 21, 21, 21, 5, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 22, 19, 19, 22, 34, 14, 0, 1, 17, 37, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 23, 0, 4, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 10, 14, 14, 1, 14, 7, 13, 20, 31, 40, 6, 4, 0, 8, 9, 9, 10, 0, 10, 14, 14, 14, 14, 39, 17, 4, 28, 17, 17, 17, 4, 4, 0, 0, 23, 4, 21, 36, 36, 0, 22, 21, 15, 37, 0, 4, 4, 4, 14, 4, 7, 7, 1, 15, 15, 38, 26, 20, 20, 20, 21, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 19, 7, 7, 17, 16, 14, 9, 9, 9, 8, 8, 13, 39, 14, 10, 17, 17, 13, 13, 13, 13, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 27, 8, 8, 14, 14, 14, 10, 14, 35, 37, 14, 36, 10, 7, 20, 10, 16, 36, 36, 14, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 9, 4, 0,
            4, 16, 38, 14, 14, 21, 26, 27, 28, 21, 4, 1, 1, 9, 10, 15, 4, 26, 14, 35, 10, 34, 4, 4, 12, 17, 17, 14, 37, 37, 37, 34, 6, 13, 13, 13, 13, 4, 14, 10, 10, 10, 3, 17, 17, 17, 1, 4, 14, 14, 6, 27, 22, 21, 4, 4, 1, 34, 17, 30, 30, 4, 23, 14, 15, 1, 22, 12, 31, 6, 15, 15, 8, 15, 8, 8, 1, 15, 22, 2, 3, 4, 10, 4, 14, 14, 25, 6, 6, 40, 4, 36, 23, 14, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 31, 15, 15, 14, 0, 23, 35, 8, 4, 1, 1, 35, 23, 21, 2, 4, 4, 9, 14, 4, 10, 25, 14, 14, 3, 21, 35, 4, 9, 15, 6, 9, 3, 15, 23, 4, 4, 4, 11, 35, 10, 6, 15, 15, 15, 22, 2, 2, 14, 4, 3, 14, 27, 31, 34, 4, 4, 19, 14, 14, 4, 4, 14, 14, 21, 4, 14, 4, 0, 4, 27, 27, 17, 3, 15, 2, 4, 4, 21, 21, 11, 23, 11, 23, 17, 5, 36, 15, 23, 23, 2, 19, 4, 36, 14, 1, 22, 1, 21, 34, 14, 13, 6, 4, 37, 6, 24, 35, 6, 17, 16, 6, 4, 0, 21, 4, 26, 21, 4, 15, 7, 1, 20, 19, 7, 21, 21, 21, 19, 38, 19, 16, 23, 6, 37, 25, 1, 22, 6, 14, 1, 26, 8, 37, 4, 0, 17, 6, 17, 14, 16, 4, 32, 14, 15, 0, 23, 21, 29, 14, 14, 1, 17, 26, 15, 0, 0, 0, 22, 34, 21, 6, 16, 4, 15, 21, 0, 36, 4, 1, 1, 22, 14, 14, 30, 4, 9, 10, 4, 4, 14, 16, 16, 15, 21, 0, 4, 15, 29, 24, 21, 14, 11, 11, 9, 13, 10, 31, 4, 22, 14, 23, 1, 4, 9, 17, 27, 28, 22, 14, 20, 7, 23, 1, 6, 15, 15, 23, 4, 20, 5, 36, 10, 21, 39, 41, 31, 17, 7, 21, 34, 1, 14, 2, 18, 16, 27, 16, 38, 7, 38, 21, 1, 9, 15, 15, 15, 0, 6, 23, 28, 11, 23, 34, 24, 4, 4, 4, 24, 23, 17, 10, 17, 1, 1, 15, 15, 4, 21, 14, 14, 20, 28, 20, 22, 26, 3, 32, 4, 0, 21, 13, 4, 15, 17, 5, 4, 14, 0, 9, 21, 14, 38, 4, 14, 31, 21, 14, 6, 4, 4, 6, 17, 0, 4, 7, 16, 4, 4, 21, 1, 10, 3, 21, 4, 0, 1, 7, 17, 15, 14, 0, 9, 32, 13, 5, 2, 21, 28, 21, 22, 22, 7, 7, 33, 0, 1, 15, 4, 31, 30, 15, 11, 19, 21, 9, 21, 13, 21, 9, 32, 9, 32, 38, 9, 38, 38, 14, 9, 10, 38, 10, 22, 21, 13, 21, 4, 0, 1, 1, 23, 0, 5, 4, 4, 15, 14, 14, 13, 11, 1, 5, 5, 10, 23, 21, 14, 9, 20, 10, 19, 19, 21, 17, 19, 19, 36, 17, 35, 16, 4, 16, 4, 6, 4, 41, 6, 7, 23, 9, 23, 7, 6, 22, 36, 14, 15, 11, 35, 5, 14, 14, 15, 4, 6, 4, 9, 19, 11, 4, 29, 14, 15, 15, 5, 32,
            15, 14, 5, 9, 10, 19, 13, 23, 12, 10, 21, 10, 35, 7, 22, 22, 22, 8, 21, 32, 4, 21, 21, 6, 14, 11, 14, 15, 4, 21, 1, 6, 22
        ]
    else:
        c_attributes = nx.get_node_attributes(G_data, 'value')
        c_groups = list(c_attributes.values())
    X_gt = np.array(c_groups)

    for r_state in range(0, iterations):
        kmeans = KMeans(n_clusters=get_clusters(G_data, name),
                        init='k-means++',
                        random_state=r_state)
        kmeans.fit(B_data_X)
        X_ae = kmeans.labels_  # Calculated labels

        ami = metrics.adjusted_mutual_info_score(X_gt, X_ae,
                                                 average_method='arithmetic')
        if ami > max_value:
            index = r_state
            max_value = ami
        # Progress report every 100 seeds.
        if r_state % 100 == 0:
            print("Index:{}\tMax AMI till now:{}".format(index, max_value))
    return index
import rcc
import pdb
import numpy as np
from sklearn.metrics import adjusted_mutual_info_score

# Parse pendigits.txt: every row is a comma-separated list of integers whose
# last field is the class label; spaces inside a row are dropped first.
features = []
targets = []
with open('pendigits.txt', 'r') as f:
    for raw in f:
        fields = raw.strip().replace(' ', '').split(',')
        features.append(np.array([int(token) for token in fields[:-1]]))
        targets.append(int(fields[-1]))

X = np.array(features).astype(np.float32)
Y = np.array(targets)

# Cluster with the cosine measure and compare the resulting labels with the
# classes read from the file.
clusterer = rcc.rcc_cluster(measure='cosine')
clusterer.fit(X)
P = clusterer.labels_
print('AMI: {}'.format(adjusted_mutual_info_score(Y, P)))
fcm.fit(X_train) # outputs fcm_centers = fcm.centers # 첫번째는 'Bengin' cetroid, 두번쨰는 'attack' centroid fcm_labels = fcm.u.argmax(axis=1) probability = fcm.predict(X_test) result_df = pd.DataFrame(data=probability, columns=[0, 1, 'pre_class']) result_df['class'] = y_test print(color.BOLD + "Result" + color.END) print(result_df.head()) print(color.BOLD + "\nScoring" + color.END) h_score = homogeneity_score(result_df['class'], result_df['pre_class']) ar_score = adjusted_rand_score(result_df['class'], result_df['pre_class']) ami_socre = adjusted_mutual_info_score(result_df['class'], result_df['pre_class']) print("Homogeneity_score : %.4f" % h_score) print("Adjusted_rand_score : %.4f" % ar_score) print("Adjusted_mutual_info_score : %.4f" % ami_socre) print('Accuracy : %0.4f' % accuracy_score(result_df['class'], result_df['pre_class'])) Precision = precision_score(result_df['class'], result_df['pre_class'], average=None) Precision = sum(Precision) / 2 print('Precision : %0.4f' % Precision) Recall = recall_score(result_df['class'], result_df['pre_class'], average=None) Recall = sum(Recall) / 2 print('Recall : %0.4f' % Recall) F1 = f1_score(result_df['class'], result_df['pre_class'], average=None) F1 = sum(F1) / 2
core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of noise points: %d' % n_noise_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score( labels_true, labels, average_method='arithmetic')) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) # ############################################################################# # Plot result import matplotlib.pyplot as plt # Black removed and is used for noise instead. unique_labels = set(labels) colors = [ plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels)) ] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1]
# Four synthetic 2-D blobs with a fixed seed.
X, y = make_blobs(n_samples=500,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1)
plot_data(X, y)

# Deliberately fitted with 2 clusters although the data was generated from 4
# centers.
kmeans_model = cluster.KMeans(n_clusters=2, random_state=1)
kmeans_model.fit(X)
# Bare attribute accesses: they only display something in an interactive
# session or notebook.
kmeans_model.cluster_centers_
kmeans_model.labels_

#metrics when target labels are not known
silhouette_avg = metrics.silhouette_score(X, kmeans_model.labels_, metric='euclidean')
print(silhouette_avg)
silhouette_samples = metrics.silhouette_samples(X, kmeans_model.labels_, metric='euclidean')
print(silhouette_samples)
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and
# later removed the old spelling; prefer the new name and fall back to the old
# one on releases that only have it.
_calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                      or metrics.calinski_harabaz_score)
ch_score = _calinski_harabasz(X, kmeans_model.labels_)
print(ch_score)

#metrics when target labels are known
print(metrics.adjusted_rand_score(y, kmeans_model.labels_))
print(metrics.adjusted_mutual_info_score(y, kmeans_model.labels_))
def ami(X1, X2):
    """Return the adjusted mutual information between labellings ``X1`` and ``X2``."""
    value = adjusted_mutual_info_score(X1, X2)
    return value
elist.append([v_o, v_t]) vlist.remove(v_o) g.add_edge_list(elist) state = gt.minimize_blockmodel_dl(g, deg_corr=False) #write_classes('sim/sim_SBM.tsv', g, state) #state.draw(output="sim/sim_SBM.png") blocks = state.get_blocks() preds = get_blocksCC(g, blocks) nmi_sbm.append([normalized_mutual_info_score(g.vp.RealClass.a, blocks.a), normalized_mutual_info_score(g.vp.RealClass.a, list(preds))]) print(" NMI_SBM = %.5f\tNMI_SBMCC = %.5f" % (nmi_sbm[i][0], nmi_sbm[i][1]), flush=True) print(" NMI_SBMavg = %.5f\tNMI_SBMCCavg = %.5f" % (np.mean(np.asarray(nmi_sbm), 0)[0], np.mean(np.asarray(nmi_sbm), 0)[1]), flush=True) if i > 2: print(" NMI_SBMstd = %.5f\tNMI_SBMCCstd = %.5f" % (np.std(np.asarray(nmi_sbm), 0, ddof=1)[0], np.std(np.asarray(nmi_sbm), 0, ddof=1)[1]), flush=True) print(flush=True) ami_sbm.append([adjusted_mutual_info_score(g.vp.RealClass.a, blocks.a), adjusted_mutual_info_score(g.vp.RealClass.a, list(preds))]) print(" AMI_SBM = %.5f\tAMI_SBMCC = %.5f" % (ami_sbm[i][0], ami_sbm[i][1]), flush=True) print(" AMI_SBMavg = %.5f\tAMI_SBMCCavg = %.5f" % (np.mean(np.asarray(ami_sbm), 0)[0], np.mean(np.asarray(ami_sbm), 0)[1]), flush=True) if i > 2: print(" AMI_SBMstd = %.5f\tAMI_SBMCCstd = %.5f" % (np.std(np.asarray(ami_sbm), 0, ddof=1)[0], np.std(np.asarray(ami_sbm), 0, ddof=1)[1]), flush=True) print(flush=True) ar_sbm.append([adjusted_rand_score(g.vp.RealClass.a, blocks.a), adjusted_rand_score(g.vp.RealClass.a, list(preds))]) print(" AR_SBM = %.5f\tAR_SBMCC = %.5f" % (ar_sbm[i][0], ar_sbm[i][1]), flush=True) print(" AR_SBMavg = %.5f\tAR_SBMCCavg = %.5f" % (np.mean(np.asarray(ar_sbm), 0)[0], np.mean(np.asarray(ar_sbm), 0)[1]), flush=True) if i > 2: print(" AR_SBMstd = %.5f\tAR_SBMCCstd = %.5f" % (np.std(np.asarray(ar_sbm), 0, ddof=1)[0], np.std(np.asarray(ar_sbm), 0, ddof=1)[1]), flush=True) print(flush=True) state_nested = gt.minimize_nested_blockmodel_dl(g, deg_corr=False) #write_classes_hierarchical('sim/sim_NSBM.tsv', g, state_nested) state_nested_l0 = 
state_nested.get_levels()[0]
continue # Split 20% test - 80% training #train, test = train_test_split(dmn, test_size=0.1, stratify=c.loc[:,l]) # What we try to predict is the count of the activity l (i.e., target) y_train = c.loc[:, l] tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes) tree.fit(dmn, y_train) prediction = tree.predict(dmn) f1 = f1_score(prediction, y_train, average='micro') mutual_info = round( adjusted_mutual_info_score(prediction, y_train, average_method='arithmetic'), 3) tree_score = tree.score(dmn, y_train) if f1 > 0.95 and mutual_info > 0.2 and tree_score > 0.95: print() print(l) print('tree1:::::') print('tree score:', tree_score) print('f1_score:', f1) print('adjusted_mutual_info_score:', mutual_info) print( list( reversed( sorted(zip(tree.feature_importances_.round(2), dmn.columns))))) print(prediction)
def kmeans_model(self, test_size, random_state, show=None):
    """Cluster the scaled data with K-means and report how it matches the targets.

    Reads ``self.data``, ``self.target`` and ``self.images``. All results are
    printed (and optionally plotted); nothing is returned.

    Args:
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split`` and ``cluster.KMeans``.
        show: when truthy, display the cluster-center images and the
            Isomap / PCA scatter plots.
    """
    # pre-process the data
    standardized_data = scale(self.data)

    # Split data, targets and images consistently into training and test parts.
    x_train, x_test, y_train, y_test, images_train, images_test = train_test_split(
        standardized_data, self.target, self.images,
        test_size=test_size, random_state=random_state)

    n_samples, n_features = x_train.shape
    print("# of training samples: ", n_samples)
    print("# of training features: ", n_features)

    # One cluster per distinct target value seen in the training split.
    n_digits = len(np.unique(y_train))

    clf = cluster.KMeans(init='k-means++', n_clusters=n_digits, random_state=random_state)
    clf.fit(x_train)

    if show:
        fig = plt.figure(figsize=(8, 4))
        fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold')
        # The 2x5 grid holds at most 10 images; never index past the number of
        # fitted clusters (the previous fixed range(10) raised IndexError when
        # fewer than 10 targets were present).
        for i in range(min(n_digits, 10)):
            ax = fig.add_subplot(2, 5, 1 + i)
            # reshape((8, 8)) requires 64 features per sample.
            ax.imshow(clf.cluster_centers_[i].reshape((8, 8)),
                      cmap=plt.cm.binary, interpolation="nearest")
            plt.axis('off')
        plt.show()

    # predict the labels for x_test
    y_pred = clf.predict(x_test)

    # print out the first 50 predicted and test values
    print("Predicted Values:\n", y_pred[:50])
    print("Target Values:\n", y_test[:50])
    print("Shape of Data:\n", clf.cluster_centers_.shape)

    if show:
        # Training-set cluster indices come from the fit above. Refitting via
        # fit_predict (as done twice before) was redundant and, with
        # random_state=None, replaced the model that produced y_pred.
        clusters = clf.labels_

        # The embeddings are only needed for plotting, so they are computed
        # only when plots are requested.
        x_iso = Isomap(n_neighbors=10).fit_transform(x_train)
        fig = plt.figure(1, (8, 4))
        gs = gridspec.GridSpec(1, 2)
        ax = [fig.add_subplot(ss) for ss in gs]
        fig.suptitle('Predicted Versus Training Labels(ISOMAP)', fontsize=14, fontweight='bold')
        ax[0].scatter(x_iso[:, 0], x_iso[:, 1], c=clusters, edgecolors='black')
        ax[0].set_title('Predicted Training Labels')
        ax[1].scatter(x_iso[:, 0], x_iso[:, 1], c=y_train, edgecolors='black')
        ax[1].set_title('Actual Training Labels')
        gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])
        plt.show()

        x_pca = PCA(n_components=2).fit_transform(x_train)
        fig = plt.figure(1, (8, 4))
        gs = gridspec.GridSpec(1, 2)
        ax = [fig.add_subplot(ss) for ss in gs]
        fig.suptitle('Predicted Versus Training Labels (PCA)', fontsize=14, fontweight='bold')
        fig.subplots_adjust(top=0.85)
        ax[0].scatter(x_pca[:, 0], x_pca[:, 1], c=clusters, edgecolors='black')
        ax[0].set_title('Predicted Training Labels')
        ax[1].scatter(x_pca[:, 0], x_pca[:, 1], c=y_train, edgecolors='black')
        ax[1].set_title('Actual Training Labels')
        gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])
        plt.show()

    # NOTE(review): cluster indices are compared directly with target values
    # here; K-means cluster numbering is arbitrary, so these two reports are
    # only meaningful if the indices happen to line up with the classes.
    print("Classification Report:\n", metrics.classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))

    # Permutation-invariant clustering scores on the test split. The header
    # previously contained the mangled token 'h**o' instead of 'homo'.
    print('% 9s' % 'inertia homo compl v-meas ARI AMI silhouette')
    print('%i %.3f %.3f %.3f %.3f %.3f %.3f'
          % (clf.inertia_,
             homogeneity_score(y_test, y_pred),
             completeness_score(y_test, y_pred),
             v_measure_score(y_test, y_pred),
             adjusted_rand_score(y_test, y_pred),
             adjusted_mutual_info_score(y_test, y_pred),
             silhouette_score(x_test, y_pred, metric='euclidean')))
op.add_option("--show_fig", default=False, help="Show visual quality assessment.") (opts, args) = op.parse_args(sys.argv[1:]) X = np.load("comparison/MNIST/wip_MNIST_X_org_41.npy") Xadv_s = np.load("comparison/MNIST/wip_MNIST_X_adv_41.npy") X = torch.from_numpy(X).unsqueeze(2) Xadv_s = torch.from_numpy(Xadv_s).unsqueeze(2) eps_s = Xadv_s - X h = Hierarchical(n_clusters=2) model = ClusteringWrapper3Dto2D(h) yhat = model.fit_predict(X) yadv_s = model.fit_predict(Xadv_s) print(adjusted_mutual_info_score(yhat, yadv_s)) print((yhat != yadv_s).sum()) set_seed(4) T = ConstrainedAdvPoisoningGlobal( delta=(Xadv_s - X).norm(float("inf")), s=1, clst_model=model, lb=1.0, G=150, mutation_rate=0.01, crossover_rate=0.85, zero_rate=0.10, domain_cons=[0, 255], objective="AMI", mode="guided",
# KMeans km = KMeans(n_clusters=100, n_init=1) itime = time.perf_counter() kmlabels = km.fit_predict(citypos) etime = time.perf_counter() print('K-means Time = ', etime - itime) # Minibatch Kmeans itime = time.perf_counter() mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000) mbkmlabels = mbkm.fit_predict(citypos) etime = time.perf_counter() print('MB K-means Time = ', etime - itime) print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels)) # Birch itime = time.perf_counter() birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100) birchlabels = birch.fit_predict(citypos) etime = time.perf_counter() print('BIRCH Time = ', etime - itime) print('Similarity Km vs BIRCH', adjusted_mutual_info_score(kmlabels, birchlabels))
# For every cluster, print the terms indexed by the first ten entries of its
# row in order_centroids, all on one line.
print("Top 10 terms per cluster:")
for i in range(kvalue):
    print("Cluster %d:" % i, end='')
    for j in order_centroids[i, :10]:
        print(' %s' % terms[j], end='')
    print()  # end the line for this cluster

print("Confusion matrix:")
print(cm)
# Compare the reference labels with the cluster assignments in km.labels_.
print("Homogeneity score: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness score: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("Adjusted rand score: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted mutual info score: %0.3f" % metrics.adjusted_mutual_info_score(labels, km.labels_))
print(
    "------------------------------------------------------------------------")
print()

# Plot information
plt.figure()
plot_confusion_matrix(cm, classes=class_names, clusters=cluster_names, title='Confusion matrix after LSA without normalization')

# Print information
print("Clustering sparse data with k-means with k = 2...")
print()
# ############################################################################# # Compute Affinity Propagation af = AffinityPropagation(preference=-10).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle plt.close('all') plt.figure(1) plt.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): class_members = labels == k
plt.title('Dominant set + SVM Clustering') return labels if __name__ == '__main__': np.random.seed(6) nclust = 3 N = 1000 # number of samples d = 2 # dimension of samples (number of features) weights = np.ones(nclust) weights /= sum(weights) X, y = make_classification(weights=weights.tolist(), n_classes=nclust, n_samples=N, n_features=d, n_redundant=0, class_sep=1, n_clusters_per_class=1, n_informative=d) dist_metric = 'mahalanobis' #cosine, euclidean, l1, l2, manhattan, mahalanobis labels = ds_svm_clustering(X, n_clust=nclust, plot=True, metric=dist_metric) print 'Adjusted Mutual Information Score: ', adjusted_mutual_info_score( y, labels) plt.show()
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score

# Iris samples and their target labels.
data = load_iris()
X, y = data.data, data.target

# Fit K-means with three clusters (fit returns the estimator itself).
cl = KMeans(3).fit(X)

# Show the fitted centers, the inertia, the assigned labels and the targets.
for shown in (cl.cluster_centers_, cl.inertia_, cl.labels_, y):
    print(shown)

# Agreement between the targets and the cluster assignments.
print(adjusted_mutual_info_score(y, cl.labels_))
print("\t n_samples %d, \t n_features %d" % (n_samples, n_features)) print(82 * '_') print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette') t0 = time.time() kmeans = KMeans(init='random', n_clusters=10, n_init=10) kmeans.fit(data) print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % ('Random', (time.time() - t0), kmeans.inertia_, metrics.homogeneity_score(labels, kmeans.labels_), metrics.completeness_score(labels, kmeans.labels_), metrics.v_measure_score(labels, kmeans.labels_), metrics.adjusted_rand_score(labels, kmeans.labels_), metrics.adjusted_mutual_info_score( labels, kmeans.labels_, average_method='arithmetic'), metrics.silhouette_score( data, kmeans.labels_, metric='euclidean', sample_size=sample_size))) print(82 * '_') # Visualize the results on PCA-reduced data - random_raw reduced_data = PCA(n_components=2).fit_transform(data) kmeans = KMeans(init='random', n_clusters=10, n_init=10) kmeans.fit(reduced_data) # Step size of the mesh. Decrease to increase the quality of the VQ. h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max]. # Plot the decision boundary. For that, we will assign a color to each
def calculate_AMI(self, query_labels, cluster_labels, **kwargs):
    """Return the adjusted mutual information of the clustering.

    ``query_labels`` is converted with ``c_f.to_numpy`` before being compared
    with ``cluster_labels``; extra keyword arguments are accepted and ignored.
    """
    reference = c_f.to_numpy(query_labels)
    return adjusted_mutual_info_score(reference, cluster_labels)
np.random.seed(1)

# Get your mentioned graph
G = nx.karate_club_graph()

# Get ground-truth: club-labels -> transform to 0/1 np-array
# (possible overcomplicated networkx usage here)
gt_dict = nx.get_node_attributes(G, 'club')
gt = [gt_dict[i] for i in G.nodes()]
gt = np.array([0 if i == 'Mr. Hi' else 1 for i in gt])

# Get adjacency-matrix as numpy-array.
# nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array returns the
# same adjacency values as a plain ndarray, which a precomputed affinity
# accepts.
adj_mat = nx.to_numpy_array(G)

print('ground truth')
print(gt)

# Cluster: the adjacency matrix is used directly as the affinity.
sc = SpectralClustering(2, affinity='precomputed', n_init=100)
sc.fit(adj_mat)

# Compare ground-truth and clustering-results
print('spectral clustering')
print(sc.labels_)
print('just for better-visualization: invert clusters (permutation)')
print(np.abs(sc.labels_ - 1))

# Calculate some clustering metrics
print(metrics.adjusted_rand_score(gt, sc.labels_))
print(metrics.adjusted_mutual_info_score(gt, sc.labels_))
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    """Run one clustering experiment and write its report to ``<argv[2]>/<title>.out``.

    The input matrix combines the module-level cosine matrix ``coseno`` with a
    neighbour matrix produced by ``algorithms[ALGORITHM]``. The neighbour
    matrix is cached as ``<argv[2]>/<ALGORITHM>.npy``. Nothing is done when
    the report file already exists.

    Args:
        PORCENTAJE_VECINOS: ``0`` to use the cosine matrix alone, a numeric
            weight for a linear mix, or one of ``"boost"``, ``"maxsim"``,
            ``"dist"`` for the special combinations.
        ALGORITHM: key into the module-level ``algorithms`` mapping.
        MODELO: ``'kmedoids'``, ``'kmedoids470'``, ``'ap'`` or ``'dbscan'``.
        normalizar: optional normalization applied to the neighbour matrix:
            ``'minmax'``, ``'scale'``, ``'robust'``, ``'softmax'``,
            ``'matrixminmax'`` or ``'matrixmax'``.

    Raises:
        ValueError: if ``MODELO`` is not one of the supported models.
    """
    vecinos = algorithms[ALGORITHM]
    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)
    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar

    fname = sys.argv[2] + "/" + titulo + ".out"
    if os.path.isfile(fname):
        # This experiment already has a report.
        return

    print(titulo)
    print("-" * 20)

    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":
            # Only valid for cosine!
            X = 1 - X
    else:
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            total = len(service_number)
            NEIGHBOURS = np.zeros((total, total))
            # Fill the upper triangle and mirror it.
            for i in range(0, total):
                for j in range(i, total):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)

        if normalizar is not None:
            print("Normalizando Vecinos")
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                # Row-wise softmax; the exponentials are computed once.
                exponentials = np.exp(NEIGHBOURS)
                NEIGHBOURS = exponentials / np.sum(exponentials, axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)

        if MODELO == "dbscan":
            # The model expects a distance
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = - NEIGHBOURS
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:
            # The model expects an affinity
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                # Rank every entry within its row (argsort of argsort) and
                # score by how far the two rankings disagree.
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS

    print("Generando Modelo")
    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    elif MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)
    else:
        # Previously an unknown model surfaced as a NameError on `model`.
        raise ValueError("Unknown MODELO: %r" % (MODELO,))
    labels = model.labels_

    # Group sample indices by assigned cluster.
    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)
    n_clusters_ = len(clusters)

    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)
    clustersize = Counter(labels)

    # The context manager closes the report even if writing fails part-way.
    with open(fname, 'w', encoding='UTF-8') as salida:
        print(info)
        salida.write(titulo + "\n")
        for cluster, services in clusters.items():
            # Most frequent true label among the members of this cluster.
            countcat = Counter([labels_true[svc] for svc in services])
            max_key, num = countcat.most_common(1)[0]
            salida.write("%i (%s - %i/%i): %s \n" % (
                cluster, max_key, num, clustersize[cluster],
                ",".join([service_list[svc] for svc in services])))
        salida.write("-" * 20 + "\n")
        salida.write(info)
def Clu_Eval_Givenlabels(labels_true, labels_pred):
    """Return ``(ARI, AMI)`` for predicted labels against the true labels.

    Both arguments are expected to be lists.
    """
    pair = (labels_true, labels_pred)
    return (
        metrics.adjusted_rand_score(*pair),
        metrics.adjusted_mutual_info_score(*pair),
    )
# In[447]: #check out just the target cells #d[(d['x']>2) & (d['y']>7)] temp = subset[(subset['cell_type']=="blast") | (subset['cell_type']=="healthy")] kmeans_temp, scores_temp = k_means_optimized(temp[colsOfInterest].as_matrix(),scale=True) temp['kmeans_temp'] = kmeans_temp.labels_ print kmeans_temp print scores_temp plt.bar(range(len(scores_temp)), scores_temp.keys(), align='center') plt.xticks(range(len(scores_temp)), scores_temp.values()) plt.show() print 'KMEANS DAD NMI:', adjusted_mutual_info_score(temp['cell_type'], kmeans_temp.labels_) temp.groupby(['cell_type',"kmeans_temp"]).count() # In[448]: #check out just the target cells forced k=2 kmeans_temp = KMeans(2) kmeans_temp.fit(temp[colsOfInterestFlow].as_matrix()) temp['kmeans_temp2'] = kmeans_temp.labels_ print kmeans_temp print scores_temp plt.bar(range(len(scores_temp)), scores_temp.keys(), align='center') plt.xticks(range(len(scores_temp)), scores_temp.values()) plt.show()
continue feat_name = 'gsdmm' best_acc = 0.0 best_pred = None all_pred = [] all_acc = [] all_nmi = [] all_ari = [] for i in range(trial_num): print(corpora_id, n_topics, i) # pred = gsdmm_cluster_alg(corpora_name, n_topics, alpha, beta, iter_nums) pred = gsdmm_cluster_alg(train_path, n_topics, alpha, beta, iter_nums) acc = cluster_acc(labels, pred) nmi = normalized_mutual_info_score(labels, pred) ari = adjusted_mutual_info_score(labels, pred) all_pred.append(pred.tolist()) all_acc.append(acc) all_nmi.append(nmi) all_ari.append(ari) if acc > best_acc: best_pred = pred best_acc = acc print('{} best acc is {}'.format(feat_name, best_acc)) dump_mongo(corpora=corpora_name, feat_name=feat_name, n_topics=n_topics, pred=best_pred.tolist(), acc=best_acc, all_pred=all_pred, all_acc=all_acc,
def compute_results(G_data, B_data, name, encoder, r_state=0, only_kmeans=False):
    """Cluster node representations with k-means and score them with AMI.

    Args:
        G_data: Graph object. Passed to ``get_clusters`` to pick the number of
            clusters and, for any ``name`` other than 'karate' or 'email',
            read for its 'value' node attribute (the ground-truth group).
        B_data: Matrix clustered directly when ``only_kmeans`` is true.
        name: Dataset name; 'karate' and 'email' use the hard-coded
            ground-truth labels below.
        encoder: Object exposing ``.detach().numpy()`` (presumably a torch
            tensor of embeddings -- confirm with the caller). Only converted,
            and only required, when ``only_kmeans`` is false.
        r_state: ``random_state`` handed to ``KMeans``.
        only_kmeans: If true, cluster ``B_data`` instead of the encoder output.

    Returns:
        The adjusted mutual information (``average_method='arithmetic'``)
        between the ground-truth groups and the k-means labels.
    """
    kmeans = KMeans(n_clusters=get_clusters(G_data, name),
                    init='k-means++',
                    random_state=r_state)
    if only_kmeans:
        kmeans.fit(B_data)
    else:
        # Convert the embeddings only on the path that uses them, so the
        # k-means-only baseline does not need a usable `encoder`.
        kmeans.fit(encoder.detach().numpy())
    X_ae = kmeans.labels_  # Calculated labels

    # Finding truth values
    if name == 'karate':
        c_groups = [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
            1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    elif name == 'email':
        c_groups = [
            1, 1, 21, 21, 21, 25, 25, 14, 14, 14, 9, 14, 14, 26, 4, 17, 34, 1, 1, 14,
            9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
            11, 5, 34, 14, 14, 17, 17, 10, 10, 36, 37, 5, 7, 4, 22, 22, 21, 21, 21, 21,
            7, 7, 36, 21, 25, 4, 8, 15, 15, 15, 37, 37, 9, 1, 1, 10, 10, 3, 3, 3,
            29, 15, 36, 36, 37, 1, 36, 34, 20, 20, 8, 15, 9, 4, 5, 4, 20, 16, 16, 16,
            16, 16, 38, 7, 7, 34, 38, 36, 8, 27, 8, 8, 8, 10, 10, 13, 13, 6, 26, 10,
            1, 36, 0, 13, 16, 16, 22, 6, 5, 4, 0, 28, 28, 4, 2, 13, 13, 21, 21, 17,
            17, 14, 36, 8, 40, 35, 15, 23, 0, 0, 7, 10, 37, 27, 35, 35, 0, 0, 19, 19,
            36, 14, 37, 24, 17, 13, 36, 4, 4, 13, 13, 10, 4, 38, 32, 32, 4, 1, 0, 0,
            0, 7, 7, 4, 15, 16, 40, 15, 15, 15, 15, 0, 21, 21, 21, 21, 5, 4, 4, 4,
            4, 4, 4, 4, 5, 5, 4, 4, 22, 19, 19, 22, 34, 14, 0, 1, 17, 37, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 23, 0, 4, 19, 19, 19, 19, 19, 19, 19,
            19, 19, 19, 19, 19, 10, 14, 14, 1, 14, 7, 13, 20, 31, 40, 6, 4, 0, 8, 9,
            9, 10, 0, 10, 14, 14, 14, 14, 39, 17, 4, 28, 17, 17, 17, 4, 4, 0, 0, 23,
            4, 21, 36, 36, 0, 22, 21, 15, 37, 0, 4, 4, 4, 14, 4, 7, 7, 1, 15, 15,
            38, 26, 20, 20, 20, 21, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 19,
            7, 7, 17, 16, 14, 9, 9, 9, 8, 8, 13, 39, 14, 10, 17, 17, 13, 13, 13, 13,
            2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 27, 8, 8, 14, 14,
            14, 10, 14, 35, 37, 14, 36, 10, 7, 20, 10, 16, 36, 36, 14, 8, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4,
            9, 4, 0, 4, 16, 38, 14, 14, 21, 26, 27, 28, 21, 4, 1, 1,
            9, 10, 15, 4, 26, 14, 35, 10, 34, 4, 4, 12, 17, 17, 14, 37, 37, 37, 34, 6,
            13, 13, 13, 13, 4, 14, 10, 10, 10, 3, 17, 17, 17, 1, 4, 14, 14, 6, 27, 22,
            21, 4, 4, 1, 34, 17, 30, 30, 4, 23, 14, 15, 1, 22, 12, 31, 6, 15, 15, 8,
            15, 8, 8, 1, 15, 22, 2, 3, 4, 10, 4, 14, 14, 25, 6, 6, 40, 4, 36, 23,
            14, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 31, 15, 15, 14, 0, 23, 35, 8, 4,
            1, 1, 35, 23, 21, 2, 4, 4, 9, 14, 4, 10, 25, 14, 14, 3, 21, 35, 4, 9,
            15, 6, 9, 3, 15, 23, 4, 4, 4, 11, 35, 10, 6, 15, 15, 15, 22, 2, 2, 14,
            4, 3, 14, 27, 31, 34, 4, 4, 19, 14, 14, 4, 4, 14, 14, 21, 4, 14, 4, 0,
            4, 27, 27, 17, 3, 15, 2, 4, 4, 21, 21, 11, 23, 11, 23, 17, 5, 36, 15, 23,
            23, 2, 19, 4, 36, 14, 1, 22, 1, 21, 34, 14, 13, 6, 4, 37, 6, 24, 35, 6,
            17, 16, 6, 4, 0, 21, 4, 26, 21, 4, 15, 7, 1, 20, 19, 7, 21, 21, 21, 19,
            38, 19, 16, 23, 6, 37, 25, 1, 22, 6, 14, 1, 26, 8, 37, 4, 0, 17, 6, 17,
            14, 16, 4, 32, 14, 15, 0, 23, 21, 29, 14, 14, 1, 17, 26, 15, 0, 0, 0, 22,
            34, 21, 6, 16, 4, 15, 21, 0, 36, 4, 1, 1, 22, 14, 14, 30, 4, 9, 10, 4,
            4, 14, 16, 16, 15, 21, 0, 4, 15, 29, 24, 21, 14, 11, 11, 9, 13, 10, 31, 4,
            22, 14, 23, 1, 4, 9, 17, 27, 28, 22, 14, 20, 7, 23, 1, 6, 15, 15, 23, 4,
            20, 5, 36, 10, 21, 39, 41, 31, 17, 7, 21, 34, 1, 14, 2, 18, 16, 27, 16, 38,
            7, 38, 21, 1, 9, 15, 15, 15, 0, 6, 23, 28, 11, 23, 34, 24, 4, 4, 4, 24,
            23, 17, 10, 17, 1, 1, 15, 15, 4, 21, 14, 14, 20, 28, 20, 22, 26, 3, 32, 4,
            0, 21, 13, 4, 15, 17, 5, 4, 14, 0, 9, 21, 14, 38, 4, 14, 31, 21, 14, 6,
            4, 4, 6, 17, 0, 4, 7, 16, 4, 4, 21, 1, 10, 3, 21, 4, 0, 1, 7, 17,
            15, 14, 0, 9, 32, 13, 5, 2, 21, 28, 21, 22, 22, 7, 7, 33, 0, 1, 15, 4,
            31, 30, 15, 11, 19, 21, 9, 21, 13, 21, 9, 32, 9, 32, 38, 9, 38, 38, 14, 9,
            10, 38, 10, 22, 21, 13, 21, 4, 0, 1, 1, 23, 0, 5, 4, 4, 15, 14, 14, 13,
            11, 1, 5, 5, 10, 23, 21, 14, 9, 20, 10, 19, 19, 21, 17, 19, 19, 36, 17, 35,
            16, 4, 16, 4, 6, 4, 41, 6, 7, 23, 9, 23, 7, 6, 22, 36, 14, 15, 11, 35,
            5, 14, 14, 15, 4, 6, 4, 9, 19, 11, 4, 29, 14, 15, 15, 5, 32, 15, 14, 5,
            9, 10, 19, 13, 23, 12, 10, 21, 10,
            35, 7, 22, 22, 22, 8, 21, 32, 4, 21, 21, 6, 14, 11, 14, 15, 4, 21, 1, 6,
            22
        ]
    else:
        # Any other dataset carries its ground truth as the 'value' node
        # attribute of the graph; keep the attribute dict's iteration order.
        c_attributes = nx.get_node_attributes(G_data, 'value')
        c_groups = list(c_attributes.values())

    X_gt = np.array(c_groups)
    # print(X_ae)
    # print(X_gt)
    return metrics.adjusted_mutual_info_score(X_gt,
                                              X_ae,
                                              average_method='arithmetic')
def unSupervised(x_data, y_data, x, n):
    """Sweep 2..6 clusters with K-Means and a Gaussian mixture and plot scores.

    For every cluster count the K-Means labelling is scored (silhouette,
    adjusted Rand, adjusted mutual information, homogeneity, completeness,
    Fowlkes-Mallows) and the Gaussian mixture's BIC/AIC are recorded. Both
    labellings are also handed to ``clusterVisuals``. Two summary figures are
    saved as PNG files in the current working directory.

    Args:
        x_data: Feature matrix that is clustered.
        y_data: Reference labels used by the supervised scores.
        x: Index into ``['Heart', 'Credit Card']`` selecting the dataset name
            used in titles and file names.
        n: String prefix used in plot titles and output file names.
    """
    fNames = ['Heart', 'Credit Card']
    clusterValues = []
    silScores = []
    noComponents = []
    bic = []
    aic = []
    arScore = []
    amiScore = []
    homogeneityScore = []
    completenessScore = []
    fmScore = []

    for a in range(2, 7):
        # K-means. fit() already populates labels_, so the former extra
        # predict() call (whose result was discarded) is not needed.
        kmeans = KMeans(n_clusters=a)
        kmeans.fit(x_data)
        labels = kmeans.labels_
        silScore = silhouette_score(x_data, labels)
        clusterVisuals(x_data, labels, a, fNames[x] + ': ' + n + ': K-Means')
        clusterValues.append(a)
        silScores.append(silScore)
        arScore.append(adjusted_rand_score(y_data, labels))
        amiScore.append(adjusted_mutual_info_score(y_data, labels))
        homogeneityScore.append(homogeneity_score(y_data, labels))
        completenessScore.append(completeness_score(y_data, labels))
        fmScore.append(fowlkes_mallows_score(y_data, labels))

        # Expectation maximization (Gaussian mixture)
        em = GaussianMixture(n_components=a)
        em.fit(x_data)
        labels = em.predict(x_data)
        noComponents.append(a)
        bic.append(em.bic(x_data))
        aic.append(em.aic(x_data))
        clusterVisuals(x_data, labels, a, fNames[x] + ': ' + n + ': EM')

    # K-Means score summary.
    plt.plot(clusterValues, silScores, label='Silhouette')
    plt.plot(clusterValues, arScore, label='Adjusted Rand Index')
    plt.plot(clusterValues, amiScore, label='Adjusted Mutual Information')
    plt.plot(clusterValues, homogeneityScore, label='Homogeneity')
    plt.plot(clusterValues, completenessScore, label='Completeness')
    plt.plot(clusterValues, fmScore, label='Fowlkes-Mallows')
    plt.xlabel('No. of Clusters')
    plt.ylabel('Scores')
    plt.legend()
    plt.title(n + ': K-Means: ' + fNames[x])
    plt.savefig(n + ' K-Means ' + fNames[x] + '.png')

    # Gaussian-mixture model-selection summary on a fresh figure.
    plt.figure()
    plt.title(n + ': ' + 'Expectation Maximization: ' + fNames[x])
    plt.plot(noComponents, bic, label="BIC")
    plt.plot(noComponents, aic, label="AIC")
    plt.xlabel("No. of Components")
    plt.ylabel("BIC & AIC")
    plt.legend()
    # The file name (including its original spelling) is kept unchanged so
    # existing artifacts and any consumers of them keep working.
    plt.savefig(n + ' ' + 'Expected Maximzation: ' + fNames[x] + '.png')
    plt.figure()
kmeans = KMeans(n_clusters=2, random_state=0, n_init=20).fit(X_scaled_transformed) align_clusters_labels(kmeans.labels_) gmm = GaussianMixture(n_components=2).fit_predict(X_scaled_transformed) align_clusters_labels(gmm) metrics_report = {'kmeans': {}, 'gmm': {}} labels = {'kmeans': kmeans.labels_, 'gmm': gmm} for each in metrics_report.keys(): metrics_report[each]['ARI'] = round( metrics.adjusted_rand_score(y, labels[each]), 2) metrics_report[each]['AMI'] = round( metrics.adjusted_mutual_info_score(y, labels[each]), 2) metrics_report[each]['homogeneity'] = round( metrics.homogeneity_score(y, labels[each]), 2) metrics_report[each]['completeness'] = round( metrics.completeness_score(y, labels[each]), 2) metrics_report[each]['v_measure'] = round( metrics.v_measure_score(y, labels[each]), 2) metrics_report[each]['silhouette'] = round( metrics.silhouette_score(X, labels[each]), 2) metrics_report[each]['accuracy'] = round( metrics.accuracy_score(y, labels[each]) * 100, 2) print(metrics_report) #visualizing - k-means clustering of ICA transformed dataset plt.scatter(X_scaled_transformed[kmeans.labels_ == 1, 0],
# ---------------------------------------------------------------------- # stats # Number of clusters in labels, ignoring noise if present. n_clusters = len(set(labels)) - (1 if -1 in labels else 0) n_clusters_true = len(set(labels_true)) - (1 if -1 in labels else 0) printlog('\t Estimated number of clusters: {0}'.format(n_clusters)) # print stats args = [labels_true, labels] pargs = [ metrics.homogeneity_score(*args), metrics.completeness_score(*args), metrics.v_measure_score(*args), metrics.adjusted_rand_score(*args), metrics.adjusted_mutual_info_score(*args) ] printlog("\t Homogeneity: {0:.3f}\n\t Completeness: {1:.3f}" "\n\t V-measure: {2:.3f}\n\t Adjusted Rand Index: {3:.3f}" "\n\t Adjusted Mutual Information: {4:.3f}".format(*pargs)) # ---------------------------------------------------------------------- # comparing results printlog('Comparing results...') merged = compare_results(groups, labels_true, labels) # ---------------------------------------------------------------------- # Plot result printlog('Plotting graphs...') if PLOT_3D_ALL:
def train_test_model(run_id, hparams, X_train, y_train, X_test, y_test):
    """Build a ``model.Gmvae`` from ``hparams``, train it, and score it.

    Args:
        run_id: Passed to ``train`` as its ``tensorboard`` argument.
        hparams: Mapping indexed by the ``HP_*`` hyper-parameter keys.
        X_train, y_train: Training inputs and reference labels.
        X_test, y_test: Test inputs and reference labels.

    Returns:
        ``(ami_tr, ami_te, purity_train, purity_test)``: adjusted mutual
        information (arithmetic average) and purity of the arg-max cluster
        assignment on the training and test sets.
    """
    # hp.hparams(hparams)  # record the values used in this trial
    seed = hparams[HP_seed]
    tf.random.set_seed(seed)

    # SECURITY NOTE(review): eval() executes arbitrary code. It is kept because
    # the expected format of these hyper-parameter strings is not visible here;
    # if they are always plain literals, ast.literal_eval would be a safer
    # parser. Each string is evaluated once and indexed afterwards.
    encoder_dims = eval(hparams[HP_encoder_dims])
    mixture_dims = eval(hparams[HP_mixture_dims])
    batch_norm = hparams[HP_bn]

    params = {
        "components": hparams[HP_components],
        "input_dimension": X_train.shape[1],
        "embedding_dimensions": encoder_dims[0],
        "latent_dimensions": encoder_dims[1],
        "mixture_embedding_dimensions": mixture_dims[0],
        "mixture_latent_dimensions": mixture_dims[1],
        "embedding_activations": tf.nn.relu,
        "kind": "binary",
        "learning_rate": 1.0,
        "gradient_clip": None,
        "bn_before": batch_norm == "before",
        "bn_after": batch_norm == "after",
        "categorical_epsilon": 0.0,
        "reconstruction_epsilon": 0.0,
        "latent_epsilon": 0.0,
        "latent_prior_epsilon": 0.0,
        "z_kl_lambda": 1.0,
        "c_kl_lambda": 1.0,
        "cat_latent_bias_initializer": None,
        "connected_weights": hparams[HP_connected_weights],
        # "optimizer":tf.keras.optimizers.Adam(lr_schedule, epsilon=1e-16),
        "optimizer": tf.keras.optimizers.Adam(1e-3, epsilon=1e-16),
        "categorical_latent_embedding_dropout": 0.2,
        "mixture_latent_mu_embedding_dropout": 0.2,
        "mixture_latent_var_embedding_dropout": 0.2,
        "mixture_posterior_mu_dropout": 0.2,
        "mixture_posterior_var_dropout": 0.2,
        # NOTE(review): key spelling kept as-is; presumably it matches the
        # Gmvae keyword argument -- confirm against the model definition.
        "recon_dropouut": 0.2,
        #'latent_fixed_var': 0.01,
    }

    # Constant (no annealing) weights for the two beta schedules.
    z_cooling = lambda: 1.0
    y_cooling = lambda: 1.0

    m1 = model.Gmvae(**params)

    train(
        m1,
        X_train,
        y_train,
        X_test,
        y_test,
        num=100,
        samples=hparams[HP_samples],
        epochs=110,
        iter_train=1,
        num_inference=1000,
        save="model_w_5",
        batch=True,
        temperature_function=lambda x: exponential_multiplicative_cooling(
            x, 1.0, 0.5, 0.99),
        # temperature_function = lambda x: 0.1
        save_results="./gumble_results.txt",
        beta_z_method=z_cooling,
        beta_y_method=y_cooling,
        tensorboard=run_id,
    )

    # Hard cluster assignment: index of the largest predicted component.
    idx_tr = m1.predict(X_train).numpy().argmax(1)
    idx_te = m1.predict(X_test).numpy().argmax(1)

    ami_tr = adjusted_mutual_info_score(y_train,
                                        idx_tr,
                                        average_method="arithmetic")
    ami_te = adjusted_mutual_info_score(y_test,
                                        idx_te,
                                        average_method="arithmetic")

    purity_train = purity_score(y_train, idx_tr)
    purity_test = purity_score(y_test, idx_te)

    return ami_tr, ami_te, purity_train, purity_test
# Load the data file: every column but the last is a feature, the last column
# holds the reference labels.
raw_data = np.loadtxt('cluster.txt')
X = raw_data[:, :-1]  # data to cluster
y_true = raw_data[:, -1]

# Train the clustering model
n_clusters = 3  # number of clusters
model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # build the model object
model_kmeans.fit(X)  # fit the model
y_pre = model_kmeans.predict(X)  # cluster assignment for every sample

# Evaluate the model with a set of clustering metrics
n_samples, n_features = X.shape  # total samples, total features
inertias = model_kmeans.inertia_  # the fitted model's inertia_ value
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)  # adjusted Rand index
mutual_info_s = metrics.mutual_info_score(y_true, y_pre)  # mutual information
adjusted_mutual_info_s = metrics.adjusted_mutual_info_score(
    y_true, y_pre)  # adjusted mutual information
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)  # homogeneity score
completeness_s = metrics.completeness_score(y_true, y_pre)  # completeness score
v_measure_s = metrics.v_measure_score(y_true, y_pre)  # V-measure score
silhouette_s = metrics.silhouette_score(
    X, y_pre, metric='euclidean')  # mean silhouette coefficient
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and
# later dropped the old spelling; prefer the new name and fall back to the old
# one so the script runs on either side of the rename.
_calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                      or metrics.calinski_harabaz_score)
calinski_harabaz_s = _calinski_harabasz(X, y_pre)  # Calinski-Harabasz score

print('samples: %d \t features: %d' % (n_samples, n_features))  # sample and feature counts
print(70 * '-')  # separator line
print('ine\tARI\tMI\tAMI\thomo\tcomp\tv_m\tsilh\tc&h')  # metric column titles
print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d' %
      (inertias, adjusted_rand_s, mutual_info_s, adjusted_mutual_info_s,
       homogeneity_s, completeness_s, v_measure_s, silhouette_s,
       calinski_harabaz_s))  # metric values
print(70 * '-')  # separator line
print('short name \t full name')  # header for the abbreviation legend
def main(cvfold=0, alpha_T=1.0, alpha_E=1.0, lambda_TE=1.0, root_node='n88',
         start_i=0, stop_i=11000, embedding='zE', latent_dim=3, rand_seed=0,
         exp_name='LR_v2_bal'):
    """Fit one logistic-regression classifier per candidate classification.

    For each classification ``i`` in ``[start_i, min(stop_i, available))``
    listed under ``root_node``, the samples are relabelled with
    ``relabel_restrict_inputs``, a class-balanced multinomial logistic
    regression is fitted on the training split of the chosen ``embedding``,
    and accuracy / ARI / AMI / NMI / sample counts for the train, val and test
    splits are appended as one row to a CSV file in ``dir_pth['result']``.
    Classifications with a single class are skipped.

    Args:
        cvfold: Cross-validation fold index (selects the .mat file, and is
            recorded in every output row).
        alpha_T, alpha_E, lambda_TE, latent_dim: Values embedded in the input
            and output file names (``alpha_M`` is set equal to ``alpha_E``).
        root_node: Key into the classifications JSON.
        start_i, stop_i: Half-open range of classification indices to run.
        embedding: Key of the representation used as classifier input.
        rand_seed: ``random_state`` of the classifier.
        exp_name: Experiment name; ``'_ld<latent_dim>'`` is appended before it
            is passed to ``set_paths``.
    """
    exp_name = exp_name + '_ld' + str(latent_dim)
    alpha_M = alpha_E

    cvfold_fname = ('v2_aT_' + str(alpha_T) +
                    '_aE_' + str(alpha_E) +
                    '_aM_' + str(alpha_M) +
                    '_cs_' + str(lambda_TE) +
                    '_ld_' + str(latent_dim) +
                    '_bs_200_se_500_ne_1500_cv_' + str(cvfold) +
                    '_ri_0500_ft-summary')
    # Decimal points in the hyper-parameter values become '-' in file names.
    cvfold_fname = cvfold_fname.replace('.', '-') + '.mat'

    dir_pth = set_paths(exp_name=exp_name)

    # Load pruned tree, embeddings, and cell type annotations
    with open(dir_pth['data'] + "PS_v4_beta_0-4_matched_well-sampled_dend_RData_Tree_20181220_pruned_n88_n60_classifications.json") as f:
        all_classifications = json.load(f)

    O = sio.loadmat(dir_pth['data'] + 'PS_v4_beta_0-4_matched_well-sampled.mat', squeeze_me=True)
    CV = sio.loadmat(dir_pth['cvfolds'] + cvfold_fname, squeeze_me=True)

    htree_df = pd.read_csv(dir_pth['data'] + 'dend_RData_Tree_20181220_pruned.csv')
    htree = HTree(htree_df=htree_df)
    all_descendants = htree.get_all_descendants()

    result_fname = ('cv_classification_results_' +
                    embedding +
                    '_aT_' + str(alpha_T) +
                    '_aE_' + str(alpha_E) +
                    '_aM_' + str(alpha_M) +
                    '_csTE_' + str(lambda_TE) +
                    '_ld_' + str(latent_dim) +
                    '_randseed_' + str(rand_seed) +
                    '_start_' + str(start_i) +
                    '_stop_' + str(stop_i) +
                    '_cv_' + str(cvfold) +
                    '_rn_' + root_node)
    result_fname = result_fname.replace('.', '-') + '.csv'

    # One definition of the column order, shared by the header and every row:
    # metrics vary slowest, data splits fastest.
    splits = ['train', 'val', 'test']
    metric_columns = [ds + '_' + metric
                      for metric in ['acc', 'ari', 'ami', 'nmi', 'samples']
                      for ds in splits]
    header = metric_columns + ['cvfold', 'classification_id', 'n_classes']

    max_i = min(stop_i, len(all_classifications[root_node]))
    # NOTE(review): the header is written once per invocation; because the file
    # is opened in append mode, re-running with identical arguments adds a
    # second header row to the existing file.
    write_header = True
    for i in range(start_i, max_i, 1):
        print('Iter {:6d} in range {:6d} to {:6d}'.format(i, start_i, max_i))
        classification_id = root_node + '_' + str(i)
        this_classification = all_classifications[root_node][i]
        n_classes = len(this_classification)

        # Classifier only works for n_classes > 1
        if n_classes <= 1:
            continue

        X = relabel_restrict_inputs(CV=CV, O=O,
                                    this_classification=this_classification,
                                    descendant_dict=all_descendants)

        clf = LogisticRegression(penalty='none',
                                 random_state=rand_seed,
                                 solver='lbfgs',
                                 max_iter=10000,
                                 multi_class='multinomial',
                                 class_weight='balanced').fit(X['train'][embedding],
                                                              X['train']['cluster'])

        result = {}
        for ds in splits:
            true_label = X[ds]['cluster']
            pred_label = clf.predict(X[ds][embedding])
            result[ds + '_acc'] = np.sum(pred_label == true_label) / true_label.size
            result[ds + '_ari'] = adjusted_rand_score(true_label, pred_label)
            result[ds + '_ami'] = adjusted_mutual_info_score(true_label, pred_label)
            result[ds + '_nmi'] = normalized_mutual_info_score(true_label, pred_label)
            result[ds + '_samples'] = pred_label.size

        result_list = [result[column] for column in metric_columns]
        result_list += [cvfold, classification_id, n_classes]

        # newline='' lets the csv module control line endings; without it an
        # extra blank line follows every row on platforms that translate '\n'.
        with open(dir_pth['result'] + result_fname, 'a', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(header)
                write_header = False
            writer.writerow(result_list)
    return