def evalutate(self, memberships):
    """Compute per-graph and pooled ("overall") NMI, ARI, and MCR.

    `memberships` is a list of 1-d label arrays, one per graph, aligned with
    `self.groundTruth`.
    """
    groundTruth = self.groundTruth
    n_graphs = self.n_graphs
    individual_nmi = np.zeros([n_graphs])
    individual_ari = np.zeros([n_graphs])
    individual_mcr = np.zeros([n_graphs])
    for n in range(n_graphs):
        # print(n)
        individual_nmi[n] = nmi(memberships[n], groundTruth[n])
        individual_ari[n] = ari(memberships[n], groundTruth[n])
        individual_mcr[n] = mcr(memberships[n], groundTruth[n])
    # pooled metrics: concatenate the labels of all graphs and score them once
    trueMemberships_stacked = np.reshape(np.hstack(groundTruth), [-1])
    memberships_stacked = np.hstack(memberships)
    overall_nmi = nmi(memberships_stacked, trueMemberships_stacked)
    overall_ari = ari(memberships_stacked, trueMemberships_stacked)
    overall_mcr = mcr(memberships_stacked, trueMemberships_stacked)
    return {
        "NMI": {'nmi': np.mean(individual_nmi), 'overall_nmi': overall_nmi},
        "ARI": {'ari': np.mean(individual_ari), 'overall_ari': overall_ari},
        "MCR": {'mcr': np.mean(individual_mcr), 'overall_mcr': overall_mcr},
    }
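# A minimal, self-contained sketch (not taken from the class above) of how the
# per-graph vs. pooled ("overall") scores can differ, using sklearn's NMI on
# synthetic labels; MCR is omitted because its implementation is not shown here.
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

ground_truth = [np.array([0, 0, 1, 1]), np.array([0, 1, 1, 1])]
predicted = [np.array([1, 1, 0, 0]), np.array([0, 0, 1, 1])]

per_graph_nmi = [nmi(p, g) for p, g in zip(predicted, ground_truth)]
pooled_nmi = nmi(np.hstack(predicted), np.hstack(ground_truth))
print('mean per-graph NMI:', np.mean(per_graph_nmi))
print('overall (pooled) NMI:', pooled_nmi)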
def clust(data_path, label_path, pca_com, phate_com):
    """Cluster a cells-by-genes CSV with PCA -> PHATE -> biKmeans and print NMI/ARI/HOM/AMI."""
    input_path = data_path + ".csv"
    label_path = label_path + ".csv"
    X = pd.read_csv(input_path, header=None)
    X = X.drop(0)          # drop the first row (likely a header row)
    X = np.array(X)
    X = X.transpose()      # cells x genes
    pca = PCA(n_components=pca_com)
    b = pca.fit_transform(X)
    phate_op = phate.PHATE(n_components=phate_com)
    data_phate = phate_op.fit_transform(b)
    label = pd.read_csv(label_path)
    y = np.array(label)
    label = y.ravel()
    c = label.max()        # number of clusters (assumes labels run 1..c)
    centList, clusterAssment = biKmeans(data_phate, c)
    julei = clusterAssment[:, 0]   # julei: predicted cluster labels
    y = np.array(julei)
    julei = y.ravel()
    print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten()))
    print('ARI value is %f \n' % ari(julei.flatten(), label.flatten()))
    print('HOM value is %f \n' % metrics.homogeneity_score(julei, label))
    print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
    return julei
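# Hypothetical call; the paths match the yan dataset layout used in the k-means
# baseline script below, and the component counts are illustrative choices only.
pred_labels = clust('yan/yan', 'yan/yan_label', pca_com=20, phate_com=2)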
def training_and_testing():
    k = 40
    #X_train, y_train = data_collect_for_wefcm()
    X_train, X_test, y_train, y_test = data_collection_from_file()
    #print np.shape(X)
    #X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    start = timeit.default_timer()
    #U = ARWEFCM(X_train, k)
    print(np.shape(X_train))
    U, C = WEFCM(X_train, k)
    #X_test, y_test = data_collection_from_test_file()  # this is ofcm
    #print "c start"
    #C = calculateClusterCenter(U, X_train, k)
    #my_df = pd.DataFrame(C)
    #my_df.to_csv('out.csv', index=False, header=False)
    #print C[0:2]
    #print "c end"
    y1_train = label_cluster(X_train, y_train, C)
    pre_labels = getpredicted_labels(U, len(y_train))
    stop = timeit.default_timer()
    print('run time:= ', stop - start)
    r1 = nmi(y_train, pre_labels)
    r2 = ari(y_train, pre_labels)
    print('NMI:= ', r1)
    print('ARI:= ', r2)
    #print y1_train
    #print len(X_train)
    y1_test = test_data(X_test, C, y1_train)
    #print C
    accuracy = (float(np.sum(y1_test == y_test))) / len(y_test)
    print('accuracy:= ', accuracy)
    #print(classification_report(y_test, y1_test, target_names=y_test))
    '''f = open("result1.ods","a+")
""" import pandas as pd import numpy as np from sklearn.decomposition import PCA from sklearn.cluster import KMeans from sklearn import metrics from sklearn.metrics.cluster import adjusted_rand_score as ari from sklearn.metrics.cluster import normalized_mutual_info_score as nmi X = pd.read_csv('yan/yan.csv', header=None) X = np.array(X) X = X.transpose() label = pd.read_csv('yan/yan_label.csv') y = np.array(label) label = y.ravel() pca = PCA(n_components=2) A = pca.fit_transform(X) c = label.max() kk = KMeans(n_clusters=c) julei = kk.fit(A) julei = julei.labels_ print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten())) print('ARI value is %f \n' % ari(julei.flatten(), label.flatten())) print('HOM value is %f \n' % metrics.homogeneity_score(julei, label)) print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
# `sca` is assumed to come from an earlier step (apparently an UNCURL run on the
# SCH cerebellum data): `sca.cell_subset` selects the cells that were kept and
# `sca.w` is the clusters-x-cells weight matrix.
sca.cell_subset.shape
labels = sca.w.argmax(0)   # hard cluster assignment per cell

from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

labels_b = pd.read_csv('80k_cluster_numbers.csv')
labels_b = labels_b.iloc[:, 1]
labels_b = labels_b.values
labels_b_subset = labels_b[sca.cell_subset]
nmi(labels_b_subset, labels)

from sklearn.metrics.cluster import adjusted_rand_score as ari
ari(labels_b_subset, labels)

# contingency table of Seurat clusters (rows) vs. UNCURL clusters (columns);
# indexing assumes both labelings use 0-based consecutive integer ids
cluster_counts = np.zeros((len(set(labels_b)), len(set(labels))))
for i, j in zip(labels_b_subset, labels):
    cluster_counts[i, j] += 1

plt.figure(figsize=(10, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=sorted(list(set(labels_b_subset))),
            vmin=0, vmax=1, linewidths=0.5)
plt.xlabel('UNCURL clusters')
plt.ylabel('Seurat clusters')
plt.title('SCH Cerebellum Clusters')
plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200)
def run_experiment(methods, data, n_classes, true_labels, n_runs=10,
                   use_purity=True, use_nmi=False, use_ari=False,
                   use_nne=False, consensus=False):
    """
    Runs a pre-processing + clustering experiment...
    exactly one of use_purity, use_nmi, or use_ari can be true

    Args:
        methods: list of 2-tuples. The first element is either a single
            Preprocess object or a list of Preprocess objects, to be applied
            in sequence to the data. The second element is either a single
            Cluster object, a list of Cluster objects, or a list of lists,
            where each list is a sequence of Preprocess objects with the
            final element being a Cluster object.
        data: genes x cells array
        true_labels: 1d array of length cells
        consensus: if true, runs a consensus on cluster results for each
            method at the very end.
        use_purity, use_nmi, use_ari, use_nne: which error metric to use
            (at most one can be True)

    Returns:
        purities (list of lists)
        names (list of lists)
        other (dict): keys: timing, preprocessing, clusterings
    """
    results = []
    names = []
    clusterings = {}
    other_results = {}
    other_results['timing'] = {}
    other_results['preprocessing'] = {}
    if use_purity:
        purity_method = purity
    elif use_nmi:
        purity_method = nmi
    elif use_ari:
        purity_method = ari
    elif use_nne:
        purity_method = nne
    for i in range(n_runs):
        print('run {0}'.format(i))
        purities = []
        r = 0
        method_index = 0
        for preproc, cluster in methods:
            t0 = time.time()
            if isinstance(preproc, Preprocess):
                preprocessed, ll = preproc.run(data)
                output_names = preproc.output_names
            else:
                # if the input is a list, only use the first preproc result
                p1 = data
                output_names = ['']
                for p in preproc:
                    p1, ll = p.run(p1)
                    p1 = p1[0]
                    if output_names[0] != '':
                        output_names[0] = output_names[0] + '_' + p.output_names[0]
                    else:
                        output_names[0] = p.output_names[0]
                preprocessed = [p1]
            t1 = time.time() - t0
            for name, pre in zip(output_names, preprocessed):
                starting_index = method_index
                if isinstance(cluster, Cluster):
                    #try:
                    t0 = time.time()
                    labels = cluster.run(pre)
                    t2 = t1 + time.time() - t0
                    if use_nne:
                        purities.append(purity_method(pre, true_labels))
                    else:
                        purities.append(purity_method(labels, true_labels))
                    if i == 0:
                        names.append(name + '_' + cluster.name)
                        clusterings[names[-1]] = []
                        other_results['timing'][names[-1]] = []
                    print(names[r])
                    clusterings[names[r]].append(labels)
                    print('time: ' + str(t2))
                    other_results['timing'][names[r]].append(t2)
                    print(purities[-1])
                    r += 1
                    method_index += 1
                    #except:
                    #    print('failed to do clustering')
                elif type(cluster) == list:
                    for c in cluster:
                        if isinstance(c, list):
                            t2 = t1
                            name2 = name
                            sub_data = pre.copy()
                            for subproc in c[:-1]:
                                t0 = time.time()
                                subproc_out, ll = subproc.run(sub_data)
                                sub_data = subproc_out[0]
                                name2 = name2 + '_' + subproc.output_names[0]
                                t2 += time.time() - t0
                            t0 = time.time()
                            labels = c[-1].run(sub_data)
                            t2 += time.time() - t0
                            if use_nne:
                                purities.append(purity_method(sub_data, true_labels))
                            else:
                                purities.append(purity_method(labels, true_labels))
                            if i == 0:
                                names.append(name2 + '_' + c[-1].name)
                                clusterings[names[-1]] = []
                                other_results['timing'][names[-1]] = []
                            print(names[r])
                            clusterings[names[r]].append(labels)
                            other_results['timing'][names[r]].append(t2)
                            print('time: ' + str(t2))
                            print(purities[-1])
                            r += 1
                            method_index += 1
                        else:
                            try:
                                t0 = time.time()
                                labels = c.run(pre)
                                t2 = t1 + time.time() - t0
                                if i == 0:
                                    names.append(name + '_' + c.name)
                                    clusterings[names[-1]] = []
                                    other_results['timing'][names[-1]] = []
                                if use_nne:
                                    purities.append(purity_method(pre, true_labels))
                                else:
                                    purities.append(purity_method(labels, true_labels))
                                print(names[r])
                                clusterings[names[r]].append(labels)
                                other_results['timing'][names[r]].append(t2)
                                print('time: ' + str(t2))
                                print(purities[-1])
                                r += 1
                                method_index += 1
                            except:
                                print('failed to do clustering')
                # find the highest purity for the pre-processing method
                # save the preprocessing result with the highest NMI
                num_clustering_results = method_index - starting_index
                clustering_results = purities[-num_clustering_results:]
                if i > 0 and len(clustering_results) > 0:
                    old_clustering_results = results[-1][starting_index:method_index]
                    if max(old_clustering_results) < max(clustering_results):
                        other_results['preprocessing'][name] = pre
                else:
                    other_results['preprocessing'][name] = pre
        print('\t'.join(names))
        print('purities: ' + '\t'.join(map(str, purities)))
        results.append(purities)
    consensus_purities = []
    if consensus:
        other_results['consensus'] = {}
        k = len(np.unique(true_labels))
        for name, clusts in clusterings.items():
            print(name)
            clusts = np.vstack(clusts)
            consensus_clust = CE.cluster_ensembles(clusts, verbose=False,
                                                   N_clusters_max=k)
            other_results['consensus'][name] = consensus_clust
            if use_purity:
                consensus_purity = purity(consensus_clust.flatten(), true_labels)
                print('consensus purity: ' + str(consensus_purity))
                consensus_purities.append(consensus_purity)
            if use_nmi:
                consensus_nmi = nmi(true_labels, consensus_clust)
                print('consensus NMI: ' + str(consensus_nmi))
                consensus_purities.append(consensus_nmi)
            if use_ari:
                consensus_ari = ari(true_labels, consensus_clust)
                print('consensus ARI: ' + str(consensus_ari))
                consensus_purities.append(consensus_ari)
        print('consensus results: ' + '\t'.join(map(str, consensus_purities)))
    other_results['clusterings'] = clusterings
    return results, names, other_results
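# Hypothetical usage sketch for run_experiment(). PcaPreprocess and KmCluster
# below are placeholder names for whatever Preprocess/Cluster subclasses the
# surrounding package defines, and `data`/`true_labels` are assumed to be
# loaded already (genes x cells matrix and a 1-d label array).
methods = [
    (PcaPreprocess(), KmCluster()),                    # one preprocess, one clusterer
    (PcaPreprocess(), [KmCluster(), OtherCluster()]),  # one preprocess, several clusterers
]
results, names, other = run_experiment(
    methods, data, n_classes=len(set(true_labels)), true_labels=true_labels,
    n_runs=5, use_purity=False, use_nmi=True)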
import time
import numpy as np
import scipy.io as sio
import SIMLR
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as ari
from scipy.sparse import csr_matrix

filename = 'Zeisel.mat'
X = csr_matrix(sio.loadmat(filename)['X'])  # load the single-cell RNA-seq data
# Take the log transform of gene counts. This is very important since it makes
# the data more Gaussian.
X.data = np.log10(1 + X.data)
label = sio.loadmat(filename)['true_labs']  # ground-truth labels for validation
c = label.max()  # number of clusters

# If the number of genes is more than 500, we recommend performing PCA first.
print('Start to Run PCA on the RNA-seq data!\n')
start_main = time.time()
if X.shape[1] > 500:
    X = SIMLR.helper.fast_pca(X, 500)
else:
    X = X.todense()
print('Successfully Run PCA! PCA took %f seconds in total\n' % (time.time() - start_main))

print('Start to Run SIMLR!\n')
start_main = time.time()
# This is how we initialize an object for SIMLR: the first input is the number
# of rank (clusters) and the second is the number of neighbors. The third is a
# binary indicator of whether to use memory-saving mode; you can turn it on
# when the number of cells is extremely large to save memory, at the cost of
# efficiency.
simlr = SIMLR.SIMLR_LARGE(c, 30, 0)
S, F, val, ind = simlr.fit(X)
print('Successfully Run SIMLR! SIMLR took %f seconds in total\n' % (time.time() - start_main))
y_pred = simlr.fast_minibatch_kmeans(F, c)
print('NMI value is %f \n' % nmi(y_pred.flatten(), label.flatten()))
print('ARI value is %f \n' % ari(y_pred.flatten(), label.flatten()))
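# Optional baseline (an illustrative addition, not part of the SIMLR example):
# cluster the same PCA-reduced matrix X with plain k-means and report the same
# metrics, to see how much the learned SIMLR similarity improves over it.
from sklearn.cluster import KMeans

y_km = KMeans(n_clusters=int(c), n_init=10, random_state=0).fit_predict(np.asarray(X))
print('Baseline KMeans NMI is %f \n' % nmi(y_km.flatten(), label.flatten()))
print('Baseline KMeans ARI is %f \n' % ari(y_km.flatten(), label.flatten()))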