def measure(FC, GC, SL, TL):
    """Score a source and a target clustering against their reference labels.

    ``FC`` is scored against ``SL`` (source) and ``GC`` against ``TL``
    (target) using the ``NMI``, ``ARI``, ``purity`` and ``PRF1`` helpers.

    Returns:
        ``(perform_source, perform_target)``; each is the list
        ``[nmi, ari, purity, p, r, f]`` where ``p, r, f`` are the three values
        returned by ``PRF1``.
    """
    def _scores(clusters, reference):
        # NMI/ARI take (reference, clusters); purity/PRF1 take the reverse order.
        nmi = NMI(reference, clusters)
        ari = ARI(reference, clusters)
        pur = purity(clusters, reference)
        p, r, f = PRF1(clusters, reference)
        return [nmi, ari, pur, p, r, f]

    return _scores(FC, SL), _scores(GC, TL)
def get_performance(self, y_true, y_pred):
    """Return ``(purity, nmi, ari)`` for a predicted labelling.

    Purity comes from ``self.get_purity``; NMI and ARI are scikit-learn's
    ``normalized_mutual_info_score`` and ``adjusted_rand_score``.
    """
    purity = self.get_purity(y_true, y_pred)

    # Imported lazily so scikit-learn is only required when metrics are requested.
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import adjusted_rand_score as ARI

    return purity, NMI(y_true, y_pred), ARI(y_true, y_pred)
def cluster_scores(latent_space, K, labels_true):
    """Cluster ``latent_space`` with KMeans and score the result.

    Returns:
        ``[silhouette, nmi, ari]``. The silhouette is computed on
        ``latent_space`` using ``labels_true``; NMI and ARI compare
        ``labels_true`` with the KMeans assignment for ``K`` clusters.
    """
    kmeans = KMeans(K)
    labels_pred = kmeans.fit_predict(latent_space)

    silhouette = silhouette_score(latent_space, labels_true)
    nmi = NMI(labels_true, labels_pred)
    ari = ARI(labels_true, labels_pred)
    return [silhouette, nmi, ari]
def test(self, embed):
    """Cluster ``embed`` ten times and print mean/std of ACC, NMI, AMI and ARI.

    KMeans is used when ``self.clustering == 'kmeans'``; any other value
    selects SpectralClustering. The number of clusters is
    ``self.data.num_classes`` and the reference labels are
    ``self.data.labels`` moved to the CPU.
    """
    true_labels = self.data.labels.cpu()
    scores = {"ACC": [], "NMI": [], "AMI": [], "ARI": []}

    for _ in range(10):
        algorithm = KMeans if self.clustering == 'kmeans' else SpectralClustering
        pred = algorithm(n_clusters=self.data.num_classes).fit_predict(embed)

        scores["ACC"].append(self.accuracy(pred))
        scores["NMI"].append(
            NMI(true_labels, pred, average_method='arithmetic'))
        scores["AMI"].append(
            AMI(true_labels, pred, average_method='arithmetic'))
        scores["ARI"].append(ARI(true_labels, pred))

    for metric in ("ACC", "NMI", "AMI", "ARI"):
        print(metric, mean(scores[metric]), std(scores[metric]))
def compute_dist():
    """Pickle the module-level distance matrix ``D`` under ``path``.

    Reads the globals ``path`` and ``D``; both must be defined before the
    function is called.
    """
    with open(path + 'Trapnell_TCC_pairwise_distance_dge.dat', 'wb') as outfile:
        pickle.dump(D, outfile)


# NOTE(review): the function/script boundary was reconstructed from a
# whitespace-collapsed source; everything below is treated as top-level script.
path = '/home/zgy_ucla_cs/Research/singleCell/TCC_old_pipeline/scRNA-Clustering/Trapnell_pipeline/'

# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open(path + "Trapnell_TCC_pairwise_distance_21.dat", 'rb') as f:
    D = pickle.load(f, encoding='latin1')
# The "_31" matrix replaces the "_21" one loaded just above.
with open(path + "Trapnell_TCC_pairwise_distance_31.dat", 'rb') as f:
    D = pickle.load(f, encoding='latin1')

# Labels on disk are shifted down by one before use.
cluster_labels = np.loadtxt(path + 'Trapnells_data/Trapnell_labels.txt',
                            dtype=str).astype(int) - 1

num_of_clusters = 3
# Turn distances into similarities for spectral clustering.
similarity_mat = D.max() - D
labels_spectral = spectral(num_of_clusters, similarity_mat)
print(NMI(cluster_labels, labels_spectral),
      ARI(cluster_labels, labels_spectral))

# ===================== scVI =====================
# expression_train, expression_test, cluster_labels, c_test = train_test_split(X, X_type, random_state=0)
batch_size = 128
learning_rate = 0.001
epsilon = 0.01
latent_dimension = 10

tf.reset_default_graph()
expression = tf.placeholder(tf.float32, (None, X.shape[1]), name='x')
kl_scalar = tf.placeholder(tf.float32, (), name='kl_scalar')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                   epsilon=epsilon)
training_phase = tf.placeholder(tf.bool, (), name='training_phase')

# getting priors
log_library_size = np.log(np.sum(X, axis=1))
mean, var = np.mean(log_library_size), np.var(log_library_size)

# loading data
model = scVI.scVIModel(expression=expression, kl_scale=kl_scalar,
                       optimize_algo=optimizer, phase=training_phase,
                       library_size_mean=mean, library_size_var=var,
                       n_latent=latent_dimension)

# starting computing session
sess = tf.Session()

# Initialize the graph and fit the training set
# this takes less than a minute on a Tesla K80
sess.run(tf.global_variables_initializer())
result = train_model(model, (X, X), sess, 250, batch_size=batch_size)

dic_full = {expression: X, training_phase: False}
latent = sess.run(model.z, feed_dict=dic_full)

# NOTE(review): cluster_labels was shifted down by one above, so np.max()
# may be one less than the number of clusters -- confirm the intended K.
clustering_score = cluster_scores(latent, np.max(cluster_labels),
                                  cluster_labels)
# cluster_scores returns [silhouette, NMI, ARI]; pair each label with the
# matching index (the labels were previously swapped).
print("Silhouette", clustering_score[0],
      "\nAdjusted Rand Index", clustering_score[2],
      "\nNormalized Mutual Information", clustering_score[1])
def evaluate(net, loader):
    """Evaluate the clustering produced by ``net`` on ``loader``.

    The last head of the network is taken as the main one; its argmax is the
    predicted cluster. Predicted clusters are matched one-to-one to ground
    truth classes with ``linear_sum_assignment`` on the confusion counts.

    Returns:
        ``(acc, nmi, ari)``: matched accuracy, then ``NMI`` and ``ARI`` of the
        ground-truth labels against the raw predictions.
    """
    net.eval()
    num_samples = len(loader.dataset)
    predicts = np.zeros(num_samples, dtype=np.int32)
    labels = np.zeros(num_samples, dtype=np.int32)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(loader):
            logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            inputs = inputs.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(inputs)[-1]
            start = batch_idx * loader.batch_size
            # The final batch may be shorter than loader.batch_size.
            end = min(start + loader.batch_size, num_samples)
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()

    # compute accuracy
    # Size the matrix from both arrays so a predicted index above the largest
    # observed label does not raise IndexError.
    num_classes = int(max(labels.max(), predicts.max())) + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    # Unbuffered accumulate: repeated (predict, label) pairs are all counted.
    # Replaces a Python 2-only ``xrange`` loop.
    np.add.at(count_matrix, (predicts, labels), 1)
    # Maximise matched counts by minimising (max - count).
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    return acc, NMI(labels, predicts), ARI(labels, predicts)
def clustering_scores(self, name, verbose=True, prediction_algorithm='knn'):
    """Cluster the latent space of data loader ``name`` and score it.

    Args:
        name: key into ``self.data_loaders``.
        verbose: when true, print the four scores.
        prediction_algorithm: ``'knn'`` (which runs KMeans) or ``'gmm'``.

    Returns:
        ``(asw_score, nmi_score, ari_score, uca_score)``, or ``None`` when the
        dataset does not have more than one label.

    Raises:
        ValueError: if ``prediction_algorithm`` is not ``'knn'`` or ``'gmm'``.
    """
    if not self.gene_dataset.n_labels > 1:
        return None
    # Fail early with a clear message; previously an unknown algorithm left
    # ``labels_pred`` unbound and failed only after the latent extraction.
    if prediction_algorithm not in ('knn', 'gmm'):
        raise ValueError(
            "prediction_algorithm must be 'knn' or 'gmm', got %r"
            % (prediction_algorithm,))

    latent, _, labels = get_latent(self.model, self.data_loaders[name])
    if prediction_algorithm == 'knn':
        labels_pred = KMeans(self.gene_dataset.n_labels,
                             n_init=200).fit_predict(latent)  # n_jobs>1 ?
    else:
        gmm = GMM(self.gene_dataset.n_labels)
        gmm.fit(latent)
        labels_pred = gmm.predict(latent)

    # The silhouette uses the true labels, not the predicted ones.
    asw_score = silhouette_score(latent, labels)
    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
    if verbose:
        print(
            "Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
            % (name, asw_score, nmi_score, ari_score, uca_score))
    return asw_score, nmi_score, ari_score, uca_score
def clustering_scores(n_labels, labels, latent, prediction_algorithm="knn"):
    """Cluster ``latent`` into ``n_labels`` groups and return the ARI.

    Args:
        n_labels: number of clusters to fit.
        labels: reference labels.
        latent: data to cluster.
        prediction_algorithm: ``"knn"`` (which runs KMeans) or ``"gmm"``.

    Returns:
        The ``ARI`` between ``labels`` and the predicted clusters, or ``None``
        when ``n_labels`` is not greater than one.

    Raises:
        ValueError: if ``prediction_algorithm`` is not ``"knn"`` or ``"gmm"``.
    """
    if not n_labels > 1:
        return None
    # Previously an unknown algorithm left ``labels_pred`` unbound and raised
    # an opaque UnboundLocalError at the ARI call.
    if prediction_algorithm not in ("knn", "gmm"):
        raise ValueError(
            "prediction_algorithm must be 'knn' or 'gmm', got %r"
            % (prediction_algorithm,))

    if prediction_algorithm == "knn":
        labels_pred = KMeans(n_labels, n_init=200).fit_predict(latent)  # n_jobs>1 ?
    else:
        gmm = GMM(n_labels)
        gmm.fit(latent)
        labels_pred = gmm.predict(latent)
    return ARI(labels, labels_pred)
def analysis():
    """Print NMI and ARI between two saved cluster assignments.

    Both CSV files are read from ``result/`` and compared on their
    ``'predicted label'`` column; the LSI file supplies the reference labels
    and the other file the predictions.
    """
    gan_frame = pd.read_csv('result/pbmc_two_batch-cluster_result.csv',
                            delimiter=',')
    lsi_frame = pd.read_csv('result/pbmc_two_batch-LSI-cluster_result.csv',
                            delimiter=',')

    reference = lsi_frame['predicted label'].values
    predicted = gan_frame['predicted label'].values

    scores = (NMI(reference, predicted), ARI(reference, predicted))
    print("Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" % scores)
def clustering():
    """Spectral-cluster the strings in ``./ts2str`` and print two ARI values.

    Each line of ``./ts2str`` is vectorised with character n-gram TF-IDF and
    clustered into 113 groups. The first printed value is the ARI against the
    labels in ``./ts_type``; the second is the ARI of those labels against a
    shuffled copy of themselves, as a chance baseline.
    """
    # Context managers close the handles; previously they were left open.
    with open('./ts2str', 'r') as handle:
        str_input = [line.strip() for line in handle]

    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       analyzer='char',
                                       ngram_range=(2, 5))
    str2tfidf = tfidf_vectorizer.fit_transform(str_input)

    sc = SpectralClustering(
        n_clusters=113,
        eigen_solver='arpack',
        affinity="nearest_neighbors",
        # assign_labels="discretize"
    )
    y_pred = sc.fit_predict(str2tfidf)

    with open('./ts_type', 'r') as handle:
        y_true = [line.strip() for line in handle]
    # The first line is dropped (presumably a header -- TODO confirm).
    y_true = y_true[1:]

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(ARI(y_true, y_pred))

    y_shuffle = list(y_true)
    random.shuffle(y_shuffle)
    print(ARI(y_true, y_shuffle))
def clustering_scores(self, name, verbose=True):
    """Cluster the latent space of data loader ``name`` with KMeans and score it.

    Returns:
        ``(asw_score, nmi_score, ari_score)``, or ``None`` when the dataset
        does not have more than one label. The silhouette is computed with
        the true labels; NMI and ARI compare true and predicted labels.
    """
    if not self.gene_dataset.n_labels > 1:
        return None

    latent, _, labels = get_latent(self.model, self.data_loaders[name])
    kmeans = KMeans(self.gene_dataset.n_labels, n_init=200)
    labels_pred = kmeans.fit_predict(latent)

    scores = (
        silhouette_score(latent, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
    )
    if verbose:
        print(
            "Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f"
            % ((name,) + scores))
    return scores
def get_performance(y_true, y_pred, n_cluster):
    """Get the evaluation metrics for the current round.

    :param y_true: reference labels
    :param y_pred: predicted labels
    :param n_cluster: passed through to ``get_purity``
    :return: ``(purity, nmi, ari)``
    """
    purity = get_purity(y_true, y_pred, n_cluster)

    # Imported lazily, after purity has been computed.
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import adjusted_rand_score as ARI

    return purity, NMI(y_true, y_pred), ARI(y_true, y_pred)
def score(self, z, true_labels):
    """Compute and store clustering scores for the fitted labels.

    Sets ``self.silhouette_score``, ``self.nmi`` and ``self.ari``.
    ``self.accuracy`` is set only when the fitted number of clusters
    ``self.k`` equals the number of distinct true labels; otherwise a message
    is printed and ``self.accuracy`` is left untouched.
    """
    assert self.labels is not None, "Cannot compute clustering scores before fitting the data."

    self.silhouette_score = silhouette_score(z, self.labels)
    # Same average as original paper
    self.nmi = NMI(true_labels, self.labels, average_method="geometric")
    self.ari = ARI(true_labels, self.labels)

    true_k = len(np.unique(true_labels))
    if self.k != true_k:
        message = ("Fitted number of labels ({}) is not equal to given true "
                   "number of labels ({}). Cannot compute accuracy.")
        print(message.format(self.k, true_k))
        return
    self.accuracy = accuracy(true_labels, self.labels)
def clusterize_search(word, vecs, gold_sense_ids=None,
                      ncs=list(range(1, 5, 1)) + list(range(5, 12, 2)),
                      affinities=('cosine',), linkages=('average',)):
    """Grid-search agglomerative clusterings of ``vecs`` and score each one.

    Every combination of affinity, cluster count and linkage is fitted on a
    precomputed distance matrix ('ward' is skipped unless the affinity is
    'euclidean'). Each fit is scored with cosine and euclidean silhouettes,
    and with ARI when ``gold_sense_ids`` is given.

    Returns:
        ``(best_clids, sdf, None, distances)``: the clustering with the
        highest cosine silhouette above 0 (``None`` if none exceeds 0), a
        DataFrame with one row per fit, and the distance matrices, one per
        affinity.
    """
    if linkages is None:
        linkages = sklearn.cluster.hierarchical._TREE_BUILDERS.keys()
    if affinities is None:
        affinities = ('cosine', 'euclidean', 'manhattan')

    cache = Memory('maxari_cache', verbose=0)

    # All-zero vectors get an extra indicator dimension appended.
    is_zero = (vecs ** 2).sum(axis=-1) == 0
    if is_zero.sum() > 0:
        indicator = is_zero[:, np.newaxis].astype(vecs.dtype)
        vecs = np.concatenate((vecs, indicator), axis=-1)

    has_gold = gold_sense_ids is not None
    rows = []
    distances = []
    best_clids, best_silhouette = None, 0

    for affinity in affinities:
        distance_matrix = cdist(vecs, vecs, metric=affinity)
        distances.append(distance_matrix)
        for nc in ncs:
            for linkage in linkages:
                if linkage == 'ward' and affinity != 'euclidean':
                    continue

                clusterer = AgglomerativeClustering(affinity='precomputed',
                                                    linkage=linkage,
                                                    n_clusters=nc,
                                                    memory=cache)
                if nc > 1:
                    clids = clusterer.fit_predict(distance_matrix)
                else:
                    clids = np.zeros(len(vecs))

                ari = ARI(gold_sense_ids, clids) if has_gold else np.nan

                # Silhouette is undefined for fewer than two clusters.
                if len(np.unique(clids)) < 2:
                    sil_cosine = -1.
                    sil_euclidean = -1.
                else:
                    sil_cosine = silhouette_score(vecs, clids,
                                                  metric='cosine')
                    sil_euclidean = silhouette_score(vecs, clids,
                                                     metric='euclidean')

                if has_gold:
                    # Gold sense sizes, largest first, joined with '/'.
                    counts = pd.value_counts(gold_sense_ids).values
                    vc = '/'.join(np.sort(counts)[::-1].astype(str))
                else:
                    vc = ''

                if sil_cosine > best_silhouette:
                    best_silhouette = sil_cosine
                    best_clids = clids

                rows.append(pd.DataFrame({'ari': ari,
                                          'word': word,
                                          'nc': nc,
                                          'sil_cosine': sil_cosine,
                                          'sil_euclidean': sil_euclidean,
                                          'vc': vc,
                                          'affinity': affinity,
                                          'linkage': linkage},
                                         index=[0]))

    sdf = pd.concat(rows, ignore_index=True)
    return best_clids, sdf, None, distances
def clustering_scores(self, prediction_algorithm="knn"):
    """Cluster the latent space returned by ``self.get_latent()`` and score it.

    Args:
        prediction_algorithm: ``"knn"`` (which runs KMeans) or ``"gmm"``.

    Returns:
        ``(asw_score, nmi_score, ari_score, uca_score)``, or ``None`` when the
        dataset does not have more than one label.

    Raises:
        ValueError: if ``prediction_algorithm`` is not ``"knn"`` or ``"gmm"``.
    """
    if not self.gene_dataset.n_labels > 1:
        return None
    # Fail early with a clear message; previously an unknown algorithm left
    # ``labels_pred`` unbound and failed only after the latent extraction.
    if prediction_algorithm not in ("knn", "gmm"):
        raise ValueError(
            "prediction_algorithm must be 'knn' or 'gmm', got %r"
            % (prediction_algorithm,))

    latent, _, labels = self.get_latent()
    if prediction_algorithm == "knn":
        labels_pred = KMeans(self.gene_dataset.n_labels,
                             n_init=200).fit_predict(latent)  # n_jobs>1 ?
    else:
        gmm = GMM(self.gene_dataset.n_labels)
        gmm.fit(latent)
        labels_pred = gmm.predict(latent)

    # The silhouette uses the true labels, not the predicted ones.
    asw_score = silhouette_score(latent, labels)
    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
    logger.debug("Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f" %
                 (asw_score, nmi_score, ari_score, uca_score))
    return asw_score, nmi_score, ari_score, uca_score
def main():
    """Run the CHIMERA command on the bundled test data and report the ARI.

    The tool is invoked through the shell, its tab-delimited output file is
    read back, and the ``Cluster`` column is compared with the expected labels
    (250 samples of class 1 followed by 250 of class 2). A success line is
    written when the adjusted Rand index is at least 0.9.
    """
    cwd = os.getcwd() + '/' + sys.argv[0].replace('test.py', '')
    sys.stdout.write("Testing... this may take a few minutes.\n")
    sys.stdout.flush()

    # NOTE(review): the command is a shell string built from the working
    # directory, so paths containing spaces or shell metacharacters break it.
    process = Popen("CHIMERA -i " + cwd + "/test_data.csv -r " + cwd + "/output.txt " +
                    "-k 2 -m 20 -N 3 -e 0.01", shell=True)
    process.communicate()

    with open(cwd + '/output.txt') as f:
        out_label = numpy.asarray(list(csv.reader(f, delimiter='\t')))

    idx = numpy.nonzero(out_label[0] == "Cluster")[0]
    # ``numpy.int`` was removed in NumPy 1.24; the builtin ``int`` is the same dtype.
    out_label = out_label[1:, idx].flatten().astype(int)

    true_label = numpy.append(numpy.ones(250), numpy.ones(250) * 2)
    measure = ARI(true_label, out_label)

    sys.stdout.write("Test Complete, output labels in test/ folder.\n")
    sys.stdout.write(
        "Clustering test samples yields an adjusted rand index of %.3f with ground truth labels.\n"
        % measure)
    if measure >= 0.9:
        sys.stdout.write("Test is successful.\n")
def clustering_scores(X, y, prediction_algorithm='knn'):
    """Cluster ``X`` into as many groups as ``y`` has distinct values and score it.

    Args:
        X: data to cluster.
        y: reference labels; the number of clusters is its distinct-value count.
        prediction_algorithm: ``'knn'`` (which runs KMeans) or ``'gmm'``.

    Returns:
        ``(asw_score, nmi_score, ari_score, uca_score)``. The silhouette is
        computed with the reference labels.

    Raises:
        ValueError: if ``prediction_algorithm`` is not ``'knn'`` or ``'gmm'``.
    """
    # Validate first; previously an unknown algorithm left ``labels_pred``
    # unbound and raised an opaque UnboundLocalError later on.
    if prediction_algorithm not in ('knn', 'gmm'):
        raise ValueError(
            "prediction_algorithm must be 'knn' or 'gmm', got %r"
            % (prediction_algorithm,))

    from sklearn.metrics import adjusted_rand_score as ARI
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import silhouette_score
    from sklearn.mixture import GaussianMixture as GMM
    from sklearn.cluster import KMeans

    cluster_num = np.unique(y).shape[0]
    if prediction_algorithm == 'knn':
        labels_pred = KMeans(cluster_num, n_init=200).fit_predict(X)
    else:
        gmm = GMM(cluster_num)
        gmm.fit(X)
        labels_pred = gmm.predict(X)

    labels = y
    asw_score = silhouette_score(X, labels)
    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    # The accuracy helper is given integer-encoded labels.
    labels_int = convert_label_to_int(labels)
    uca_score = unsupervised_clustering_accuracy(labels_int, labels_pred)[0]
    return asw_score, nmi_score, ari_score, uca_score
def test_casc_cca():
    """CCA-mode covariate-assisted embedding recovers a two-block SBM exactly.

    A 20-node stochastic block model (two blocks of 10) is sampled with a
    fixed seed and clustered together with two binary covariates; the
    predicted clusters must have an ARI of exactly 1 against the block
    membership.
    """
    block_sizes = [10, 10]
    block_probs = [[0.8, 0.2], [0.2, 0.8]]

    np.random.seed(105)
    graph = sbm(n=block_sizes, p=block_probs)

    # One row per node, two binary covariates each.
    covariates = np.array([
        [1.0, 0.0], [1.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0],
        [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0],
        [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 0.0], [0.0, 0.0],
        [0.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 0.0], [0.0, 1.0],
    ])

    embedder = CovariateAssistedSpectralEmbed(n_components=2,
                                              assortative=True,
                                              cca=True,
                                              check_lcc=False)
    predicted = embedder.fit_predict(np.array(graph),
                                     covariates,
                                     y=None,
                                     return_full=False)

    expected = [1] * 10 + [0] * 10
    assert ARI(predicted, expected) == 1
# ============================== spectral ============================== # Load Z TCC 31 path = '/home/zgy_ucla_cs/Research/singleCell/TCC_old_pipeline/scRNA-Clustering/Zeisel_pipeline/mat_31/' with open(path + "pwise_dist_L1.dat", 'rb') as f: D = pickle.load(f) #, encoding='latin1') path = '/home/zgy_ucla_cs/Research/singleCell/scRNA-Seq-TCC-prep/Zeisel/' with open(path + "pwise_dist_l1.dat", 'rb') as f: D_l1 = pickle.load(f) #, encoding='latin1') D = D_l1 num_of_clusters = 9 similarity_mat = D.max() - D labels_spectral = spectral(num_of_clusters, similarity_mat) print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels, labels_spectral)) # pwise_dist_latent = latent_dist(X, 10) # similarity_mat=pwise_dist_latent.max()-pwise_dist_latent # labels_spectral = spectral(num_of_clusters,similarity_mat) # print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels, labels_spectral)) # ===================== scVI ===================== # expression_train, expression_test, c_train, c_test = train_test_split(X, X_type, random_state=0) expression_train, expression_test, c_train, c_test = train_test_split( expression_data, X_type, random_state=0) log_library_size = np.log(np.sum(expression_train, axis=1)) mean, var = np.mean(log_library_size), np.var(log_library_size) batch_size = 128
# Running averages of the per-dataset metrics for the vbmog baseline.
# NOTE(review): loop nesting reconstructed from a whitespace-collapsed source.
accm = Accumulator('model ll', 'oracle ll', 'ARI', 'NMI', 'k-MAE', 'et')
for dataset in tqdm(benchmark):
    # The label array is reduced with argmax over its last axis.
    true_labels = to_numpy(dataset['labels'].argmax(-1))
    X = to_numpy(dataset['X'])
    ll = 0
    ari = 0
    nmi = 0
    mae = 0
    et = 0
    for b in range(len(X)):
        # Only the fit itself is timed.
        tick = time.time()
        vbmog.run(X[b], verbose=False)
        et += time.time() - tick
        ll += vbmog.loglikel(X[b])
        labels = vbmog.labels()
        ari += ARI(true_labels[b], labels)
        nmi += NMI(true_labels[b], labels, average_method='arithmetic')
        # Absolute error in the number of distinct clusters.
        mae += abs(len(np.unique(true_labels[b])) - len(np.unique(labels)))
    # Average every metric over the items of this dataset.
    ll /= len(X)
    ari /= len(X)
    nmi /= len(X)
    mae /= len(X)
    et /= len(X)
    accm.update([ll.item(), dataset['ll'], ari, nmi, mae, et])

save_dir = os.path.join(results_path, 'baselines', 'vbmog')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('vbmog_baseline', os.path.join(save_dir, args.filename))
random_state=random_state).fit_predict(dataset) #ejecucion de HAC averange linkage labels_HAC_averange = AgglomerativeClustering( n_clusters=K, linkage="average").fit_predict(dataset) #ejecucion de HAC single linkage links = linkage(dataset, "single") labels_HAC_single = fcluster(links, K, criterion="maxclust") #ejecucion de HAC complete linkage labels_HAC_complete = AgglomerativeClustering( n_clusters=K, linkage="complete").fit_predict(dataset) #calculo de la metrica ARI para los algoritmos de clustering result_ARI[j][0] = ARI(labels_true, labels_kmeans) result_ARI[j][1] = ARI(labels_true, labels_HAC_averange) result_ARI[j][2] = ARI(labels_true, labels_HAC_single) result_ARI[j][3] = ARI(labels_true, labels_HAC_complete) #validacion de las dimenciones del dataset para saber si se crearan los scatter plots de los algoritmos if (D == 2): #creamos la figura para los scatterplots figure = plot.figure(figsize=(12, 18)) #creamos el scatter plot del Ground Truth plot.subplot(321) plot.scatter(dataset[:, 0], dataset[:, 1], c=labels_true, linewidth=1) plot.title("Ground Truth", fontsize=18, fontweight="bold")
def clustering_metrics(labels_pred, labels_true):
    """Return NMI, ARI and weighted F1 for a predicted labelling.

    Note the argument order: predictions first, reference labels second.

    Returns:
        A dict with the keys ``"NMI"``, ``"ARI"`` and ``"F1"``.
    """
    metrics = {}
    metrics["NMI"] = NMI(labels_true, labels_pred)
    metrics["ARI"] = ARI(labels_true, labels_pred)
    metrics["F1"] = f1_score(labels_true, labels_pred, average='weighted')
    return metrics
accPRcut = (accPRcut + 0.0) / (s0 * s1) arr_scale.append(scale) arr_processtime.append(process_time) arr_Rcut_time.append(Rcut_time) arr_accRcut.append(accRcut) arr_PRcut_time.append(PRcut_time) arr_accPRcut.append(accPRcut) print '***********************************' print ' Scale = ', img.shape print 'Constructing graph time = ', process_time print 'Ratio cut time = ', Rcut_time print ' Accuracy = ', AMI(img_gt.flatten(), labelsRcut), ARI( img_gt.flatten(), labelsRcut), accRcut print 'PowerRatio cut time = ', PRcut_time print ' Accuracy = ', AMI(img_gt.flatten(), labelsPRcut), ARI( img_gt.flatten(), labelsPRcut), accPRcut print '***********************************' # Writing the results to a csv file f = open('results_8c.csv', "w+") for i in range(len(arr_scale)): l = "" l += str(arr_scale[i]) + "," + str(arr_processtime[i]) + "," l += str(arr_Rcut_time[i]) + "," + str(arr_accRcut[i]) + "," + str( AMI(img_gt.flatten(), labelsRcut)) + "," + str(
print("The number of dimensions is: " + str(data.shape[1])) return data if __name__ == "__main__": from FlowGrid import * import numpy as np from time import time import argparse import os file, bin_n, MinDenC, eps, output, label_file = setting_arg() data = check_file_valid(file) t1 = time() if MinDenC: fg = FlowGrid(data, bin_n=bin_n, eps=eps, MinDenC=MinDenC) else: fg = FlowGrid(data, bin_n=bin_n, eps=eps) label = fg.clustering() print("runing time: " + str(round(time() - t1, 3))) if output: np.savetxt(output, label, delimiter=',', fmt="%d") else: np.savetxt(file[:-4] + "_FlowGrid_label.csv", label, delimiter=',', fmt="%d") if label_file: from sklearn.metrics import adjusted_rand_score as ARI true_label = np.genfromtxt(label_file, delimiter=',', skip_header=1) print("ARI:" + str(round(ARI(true_label, label), 4)))
# Run select_s n_runs times on every dataset and print the ARI of each run.
# NOTE(review): loop nesting reconstructed from a whitespace-collapsed source.
m = 1.2
n_runs = 5

for adataset in range(len(datasets)):
    print('Dataset: ', datasets[adataset])
    # Each dataset file provides the arrays 'X' and 'y'.
    tmp = np.load('datasets/'+datasets[adataset])
    X = tmp['X']
    y = tmp['y']
    # Number of clusters = number of distinct values in y.
    k = len(np.unique(y))
    print('Data size =', X.shape)
    print('# clusters =', k)
    for i_runs in range(n_runs):
        u, w, centers, cost, svals, BIC, selected_idx = select_s(
            X, m=m, num_s=n_s, n_clusters=k, max_iter=max_iter,
            n_init=n_init, tol=tol, n_jobs=10
        )
        #print('svals', svals)
        #print('selected s:', svals[selected_idx])
        #print('w:', w[selected_idx])
        #print('cost:', cost[selected_idx])
        #print('BIC:', BIC)
        #print(BIC.argmax())
        from sklearn.metrics import adjusted_rand_score as ARI
        # Predicted labels are the argmax over axis 0 of the selected entry of u.
        ari = ARI(y, u[selected_idx].argmax(axis=0))
        print('ARI =', ari)
        #break
def evaluate(net, loader, writer, epoch):
    """Evaluate ``net`` on ``loader`` and log the results at ``epoch``.

    Predicted clusters (argmax of ``net(batch, -1)``) are matched one-to-one
    to ground-truth classes with ``linear_sum_assignment``. Accuracy, NMI and
    ARI are logged, and on ``cfg.local_rank == 0`` they are written to
    ``writer`` together with per-class precision/recall/F1. Embeddings are
    written when ``epoch`` is a multiple of ``cfg.embedding_freq`` and that
    frequency is not -1.

    Returns:
        The matched clustering accuracy.
    """
    net.eval()
    num_samples = len(loader.dataset)
    predicts = np.zeros(num_samples, dtype=np.int32)
    labels = np.zeros(num_samples, dtype=np.int32)

    # The final add_embedding call is skipped when embedding_freq == -1, so
    # use the same condition here. Previously ``epoch % -1 == 0`` was always
    # true and every batch paid for an extra forward pass and interpolation.
    log_embeddings = (cfg.embedding_freq != -1
                      and epoch % cfg.embedding_freq == 0)
    if log_embeddings:
        # Allocated only when needed; the image buffer is large.
        intermediates = np.zeros((num_samples, 2048), dtype=np.float32)
        images = np.zeros((num_samples, 3, 64, 64), dtype=np.float32)

    print(f"Evaluating on {len(loader.dataset)} samples")
    with torch.no_grad():
        for batch_idx, (batch, targets) in enumerate(loader):
            # logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            batch = batch.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(batch, -1)
            start = batch_idx * loader.batch_size
            # The final batch may be shorter than loader.batch_size.
            end = min(start + loader.batch_size, num_samples)
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()

            if log_embeddings:
                intermediates[start:end] = net(batch, -1, True).cpu().numpy()
                if not cfg.tfm_adaptive_thresholding:
                    # Undo the per-channel normalisation before storing images.
                    for i in range(3):
                        batch[:, i] = (batch[:, i] *
                                       cfg.tfm_stds[i]) + cfg.tfm_means[i]
                images[start:end] = torch.nn.functional.interpolate(
                    batch, size=(64, 64), mode='bicubic',
                    align_corners=False).cpu().numpy()

    # TODO: Gather labels and predicts
    # compute accuracy
    # Size the matrix from both arrays so a predicted index above the largest
    # observed label does not raise IndexError.
    num_classes = int(max(labels.max(), predicts.max())) + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    # Unbuffered accumulate: repeated (predict, label) pairs are all counted.
    np.add.at(count_matrix, (predicts, labels), 1)
    # Maximise matched counts by minimising (max - count).
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    nmi = NMI(labels, predicts)
    ari = ARI(labels, predicts)

    # compute f1 scores per class
    # Map each predicted cluster to its matched ground-truth class.
    predicts_reassigned = reassignment[predicts, 1]
    precision = precision_score(labels, predicts_reassigned, average=None,
                                zero_division=0)
    recall = recall_score(labels, predicts_reassigned, average=None,
                          zero_division=0)
    f1 = f1_score(labels, predicts_reassigned, average=None, zero_division=0)

    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    if cfg.local_rank == 0:
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)
        for i in range(len(f1)):
            writer.add_scalar(f'Evaluate/f1_{i}', f1[i], epoch)
            writer.add_scalar(f'Evaluate/precision_{i}', precision[i], epoch)
            writer.add_scalar(f'Evaluate/recall_{i}', recall[i], epoch)
        if log_embeddings:
            writer.add_embedding(intermediates, labels, images, epoch,
                                 cfg.session)
    return acc
# Spectral-clustering baseline: score every item of every benchmark batch.
# NOTE(review): loop nesting reconstructed from a whitespace-collapsed source.
args, _ = parser.parse_known_args()
print(str(args))

benchmark = torch.load(os.path.join(benchmarks_path, args.benchmarkfile))
accm = Accumulator('ari', 'nmi', 'et')
for batch in tqdm(benchmark):
    B = batch['X'].shape[0]
    for b in range(B):
        X = to_numpy(batch['X'][b])
        # The label array is reduced with argmax over its last axis.
        true_labels = to_numpy(batch['labels'][b].argmax(-1))
        # The clusterer is given the true number of clusters.
        true_K = len(np.unique(true_labels))
        tick = time.time()
        spec = SpectralClustering(n_clusters=true_K,
                                  affinity='nearest_neighbors',
                                  n_neighbors=10).fit(X)
        labels = spec.labels_
        # The elapsed time also covers the ARI and NMI computation.
        accm.update([
            ARI(true_labels, labels),
            NMI(true_labels, labels, average_method='arithmetic'),
            time.time() - tick
        ])

save_dir = os.path.join(results_path, 'baselines', 'mmaf_spectral')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('spectral_baseline', os.path.join(save_dir, args.filename))
logger.info(accm.info())
plt.yticks(()) plt.text(.99, .01, ('Comp time ' '%.2fs' % (t1 - t0)).lstrip('0'), transform=plt.gca().transAxes, size=10, horizontalalignment='right') plt.text(.99, .06, ('Purity score ' '%.3f' % (homogeneity_score(y, y_pred))).lstrip('0'), transform=plt.gca().transAxes, size=10, horizontalalignment='right') plt.text(.99, .11, ('Rand Index ' '%.3f' % (ARI(y, y_pred))).lstrip('0'), transform=plt.gca().transAxes, size=10, horizontalalignment='right') plt.text( .99, .16, ('Silhouette score ' '%.3f' % (silhouette(X, y_pred, metric='euclidean', sample_size=n_samples))).lstrip('0'), transform=plt.gca().transAxes, size=10, horizontalalignment='right') plot1_num += 1
def clustering(dataFile, outFile, config):
    """Core function of CHIMERA, performs:
        1) read and preprocess data
        2) clustering
        3) save results

    Args:
        dataFile: CSV file whose header repeats field names per column;
            ``Group`` and ``IMG`` are required, ``COVAR``, ``ID`` and ``Set``
            are optional.
        outFile: path of the text file the cluster labels are written to.
        config: dict of settings. This function writes ``config['r']`` and
            ``config['rs']`` when covariates or set ids are absent (or when
            ``r`` is -1.0 and must be derived).

    Exits the process with status 1 when a required header field is missing.
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    # Defined up front so saving the model cannot hit an unbound name when
    # there is no "Set" column or normalisation is disabled.
    unique_ID = None
    # NOTE(review): with config['norm'] == 0 the saved model carries no
    # normalisation parameters -- confirm that consumers accept this.
    model = {}

    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'Group' not in header:
            sys.stdout.write(
                'Error: group information not found. Please check csv header line for field "Group".\n'
            )
            sys.exit(1)
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n'
            )
            sys.exit(1)
        data = np.asarray(data[1:])

        group = (data[:, np.nonzero(header == 'Group')[0]].flatten()).astype(
            np.int8)
        # ``np.float`` was removed in NumPy 1.24; builtin ``float`` is the same dtype.
        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        if 'COVAR' in header:
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(
                float)
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]
            # Labels are produced for group 1 only, so keep only those IDs.
            ID = ID[group == 1]
        if 'Set' in header:
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()

    #================================= Normalizing Data ======================================================
    if config['norm'] != 0:
        model, feat_img, feat_cov = data_normalization(feat_img, feat_cov,
                                                       config)

    #================================= Prepare Dataset ID ======================================================
    if feat_set is None:
        config['rs'] = 0
    else:
        # One-hot encode the dataset id of every sample.
        unique_ID = np.unique(feat_set)
        datasetID = np.copy(feat_set)
        feat_set = np.zeros((len(datasetID), len(unique_ID)))
        for i in range(len(unique_ID)):
            feat_set[np.nonzero(datasetID == unique_ID[i])[0], i] = 1

    #================================= Calculate auto weight ==================================================
    if feat_cov is None:
        config['r'] = 0
    elif config['r'] == -1.0:
        # -1.0 requests an automatic ratio: total covariate variance over
        # total image-feature variance.
        config['r'] = np.sum(np.var(feat_cov, axis=0)) / np.sum(
            np.var(feat_img, axis=0))

    #================================= Verbose information ==================================================
    if config['verbose']:
        sys.stdout.write(
            '\t\t================= data summary ==================\n')
        sys.stdout.write('\t\tnumber of patients: %d\n' % sum(group == 1))
        sys.stdout.write('\t\tnumber of normal controls: %d\n' %
                         sum(group == 0))
        sys.stdout.write('\t\timaging feature dimension: %d\n' %
                         feat_img.shape[1])
        if feat_cov is not None:
            sys.stdout.write('\t\tcovariates dimension: %d\n' %
                             feat_cov.shape[1])
        if feat_set is not None:
            sys.stdout.write('\t\tunique data set id: %d\n' % len(unique_ID))
        sys.stdout.write(
            '\t\t================ configurations =================\n')
        sys.stdout.write('\t\tnumber of clusters: %d\n' % config['K'])
        sys.stdout.write('\t\tnumber of runs: %d\n' % config['numRun'])
        sys.stdout.write('\t\tmax number of iterations: %d\n' %
                         config['max_iter'])
        sys.stdout.write('\t\tdistance ratio covar/img = %.4f\n' % config['r'])
        sys.stdout.write('\t\tdistance ratio set/img = %.4f\n' % config['rs'])
        sys.stdout.write('\t\tlambda1 = %.2f\tlambda2 = %.2f\n' %
                         (config['lambda1'], config['lambda2']))
        sys.stdout.write('\t\ttransformation chosen: %s\n' %
                         config['transform'])
        sys.stdout.write(
            '\t\t=================================================\n')

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    feat_img = np.transpose(feat_img)
    x = feat_img[:, group == 0]  # normal controls
    y = feat_img[:, group == 1]  # patients
    xd = []
    yd = []
    xs = []
    ys = []
    if feat_cov is not None:
        feat_cov = np.transpose(feat_cov)
        xd = feat_cov[:, group == 0]
        yd = feat_cov[:, group == 1]
    if feat_set is not None:
        feat_set = np.transpose(feat_set)
        xs = feat_set[:, group == 0]
        ys = feat_set[:, group == 1]

    #================================Perform Clustering (2 modes available)=================================
    sys.stdout.write('\tclustering...\n')
    if config['mode'] == 2:
        # save result yields minimal energy
        obj = float('inf')
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            cur_obj = cur_result[2].min()
            if config['verbose']:
                sys.stdout.write('\t\tRun id %d, obj = %f\n' % (i, cur_obj))
            else:
                time_bar(i, config['numRun'])
            if cur_obj < obj:
                result = cur_result
                obj = cur_obj
        sys.stdout.write('\n')
        membership = np.dot(result[1], Tr(result[0]['delta']))
        label = np.argmax(membership, axis=1)
    else:
        # save result most reproducible
        label_mat = []
        results = []
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            membership = np.dot(cur_result[1], Tr(cur_result[0]['delta']))
            label = np.argmax(membership, axis=1)
            label_mat.append(label)
            results.append(cur_result)
            time_bar(i, config['numRun'])
        sys.stdout.write('\n')
        label_mat = np.asarray(label_mat)

        # Pairwise ARI between runs; keep the run that agrees best on
        # average with all the others.
        ari_mat = np.zeros((config['numRun'], config['numRun']))
        for i in range(config['numRun']):
            for j in range(i + 1, config['numRun']):
                ari_mat[i, j] = ARI(label_mat[i, :], label_mat[j, :])
                ari_mat[j, i] = ari_mat[i, j]
        ave_ari = np.sum(ari_mat, axis=0) / (config['numRun'] - 1)
        idx = np.argmax(ave_ari)
        if config['verbose']:
            sys.stdout.write('\t\tBest average ARI is %f\n' % (max(ave_ari)))
        label = label_mat[idx, :]
        result = results[idx]

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    # Cluster labels are written 1-based.
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))

    if config['modelFile'] != "":
        trainData = {'x': x, 'xd': xd, 'xs': xs, 'datasetID': unique_ID}
        model.update({'trainData': trainData})
        model.update({'model': result})
        model.update({'config': config})
        with open(config['modelFile'], 'wb') as f:
            cPickle.dump(model, f, 2)
def clustering(Xsvd,
               cells,
               dataset,
               suffix,
               labels=None,
               tlabels=None,
               method='knn',
               istsne=True,
               name='',
               batch_labels=None,
               seed=42):
    """Cluster a t-SNE embedding of ``Xsvd`` and save plots and a label CSV.

    Args:
        Xsvd: input features; embedded with t-SNE before clustering (the
            louvain method builds its neighbour graph on ``Xsvd`` itself).
        cells: cell identifiers, written to the result CSV.
        dataset, suffix, name: used to build output file names under ``result/``.
        labels: optional reference labels; when given, NMI and ARI are printed.
        tlabels: forwarded to ``show_tsne`` for the reference-label plot.
        method: ``'gmm'``, ``'knn'`` (which runs KMeans), ``'dbscan'``,
            ``'spectral'`` or ``'louvain'``.
        istsne: when true, write the plots and the result CSV.
        batch_labels: optional labels for an additional plot.
        seed: random state for the louvain partition.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Fail before the expensive t-SNE; previously an unknown method left
    # ``labels_pred`` unbound.
    supported = ('gmm', 'knn', 'dbscan', 'spectral', 'louvain')
    if method not in supported:
        raise ValueError("method must be one of %s, got %r" %
                         (supported, method))

    tsne = TSNE(n_jobs=24).fit_transform(Xsvd)

    for n_components in [15]:
        if method == 'gmm':
            # Fit on the same embedding that is predicted on; the previous
            # code fitted on ``mat``, which is not defined in this branch.
            clf = mixture.GaussianMixture(n_components=n_components).fit(tsne)
            labels_pred = clf.predict(tsne)
        elif method == 'knn':
            labels_pred = KMeans(n_components,
                                 n_init=200).fit_predict(tsne)  # n_jobs>1 ?
        elif method == 'dbscan':
            labels_pred = DBSCAN(eps=0.3, min_samples=10).fit(tsne).labels_
        elif method == 'spectral':
            spectral = cluster.SpectralClustering(n_clusters=n_components,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            labels_pred = spectral.fit_predict(tsne)
        else:  # 'louvain'
            for louvain in [30]:
                print('****', louvain)
                mat = kneighbors_graph(Xsvd,
                                       louvain,
                                       mode='distance',
                                       include_self=True).todense()
                G = nx.from_numpy_matrix(mat)
                partition = community.best_partition(G, random_state=seed)
                labels_pred = np.array(
                    [partition[i] for i in range(mat.shape[0])])
                # ``labels`` is optional, so do not call len() on None.
                n_true = None if labels is None else len(labels)
                print('louvain', louvain, tsne[:5], n_true, len(labels_pred))

        if labels is not None:
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            print(
                n_components, method,
                "Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" %
                (nmi_score, ari_score))

    # NOTE(review): the source was whitespace-collapsed; the CSV write and the
    # batch plot are assumed to sit inside ``if istsne`` -- confirm.
    if istsne:
        show_tsne(tsne,
                  labels,
                  'result/%s/%s-%s-LSI-true.png' % (dataset, name, suffix),
                  tlabels=tlabels)
        show_tsne(tsne, labels_pred,
                  'result/%s/%s-%s-LSI-pred.png' % (dataset, name, suffix))
        with open('result/%s-LSI-cluster_result.csv' % (dataset), 'w') as f:
            f.write('cell,predicted label,tsne-1,tsne-2\n')
            for cell, pred, t in zip(cells, labels_pred, tsne):
                f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))
        if batch_labels is not None:
            show_tsne(
                tsne, batch_labels, 'result/%s/%s-GMVAE-%s-%s-batch.png' %
                (dataset, dataset, suffix, name))