def measure(FC, GC, SL, TL):
    """Score the source (FC vs. SL) and target (GC vs. TL) clusterings.

    Returns two lists, one per side, each holding
    [NMI, ARI, purity, precision, recall, F1].
    """
    # print(len(set(FC)), len(set(GC)))
    perform_source = [
        NMI(SL, FC),
        ARI(SL, FC),
        purity(FC, SL),
        *PRF1(FC, SL),  # precision, recall, F1 in order
    ]
    perform_target = [
        NMI(TL, GC),
        ARI(TL, GC),
        purity(GC, TL),
        *PRF1(GC, TL),
    ]
    return perform_source, perform_target
def cluster_scores(latent_space, K, labels_true):
    """K-means the latent space into K clusters and report quality.

    Returns [silhouette (computed against the TRUE labels), NMI, ARI]
    for the predicted partition versus ``labels_true``.
    """
    labels_pred = KMeans(K).fit_predict(latent_space)
    silhouette = silhouette_score(latent_space, labels_true)
    nmi = NMI(labels_true, labels_pred)
    ari = ARI(labels_true, labels_pred)
    return [silhouette, nmi, ari]
def calc_NMI(G):
    """Compute NMI for both node sets of a bipartite graph.

    Nodes with ``bipartite == 0`` form the top set, ``bipartite == 1``
    the bottom set; each node carries a predicted 'label' and a
    ground-truth 'community'.  Returns (top_nmi, bottom_nmi).
    """
    nodes = list(G.nodes(data=True))
    pred_top = [d['label'] for _, d in nodes if d['bipartite'] == 0]
    truth_top = [d['community'] for _, d in nodes if d['bipartite'] == 0]
    pred_bottom = [d['label'] for _, d in nodes if d['bipartite'] == 1]
    truth_bottom = [d['community'] for _, d in nodes if d['bipartite'] == 1]
    nmi_top = NMI(truth_top, pred_top, average_method='arithmetic')
    nmi_bottom = NMI(truth_bottom, pred_bottom, average_method='arithmetic')
    return nmi_top, nmi_bottom
def test(self, embed):
    """Cluster ``embed`` ten times and print mean/std of ACC, NMI,
    AMI and ARI against the dataset's ground-truth labels."""
    true_labels = self.data.labels.cpu()
    scores = {'ACC': [], 'NMI': [], 'AMI': [], 'ARI': []}
    for _ in range(10):
        # pick the configured clustering algorithm
        if self.clustering == 'kmeans':
            algo = KMeans(n_clusters=self.data.num_classes)
        else:
            algo = SpectralClustering(n_clusters=self.data.num_classes)
        pred = algo.fit_predict(embed)
        scores['ACC'].append(self.accuracy(pred))
        scores['NMI'].append(NMI(true_labels, pred, average_method='arithmetic'))
        scores['AMI'].append(AMI(true_labels, pred, average_method='arithmetic'))
        scores['ARI'].append(ARI(true_labels, pred))
    for metric in ('ACC', 'NMI', 'AMI', 'ARI'):
        print(metric, mean(scores[metric]), std(scores[metric]))
def clustering_scores(self, name, verbose=True, prediction_algorithm='knn'):
    """Cluster the latent space of data loader ``name`` and score it.

    Returns (silhouette, NMI, ARI, UCA) against the ground-truth
    labels, or falls through (returning None) when the dataset has a
    single label class.
    """
    if self.gene_dataset.n_labels > 1:
        latent, _, labels = get_latent(self.model, self.data_loaders[name])
        if prediction_algorithm == 'knn':
            # NOTE(review): despite the 'knn' name this runs K-means.
            labels_pred = KMeans(self.gene_dataset.n_labels, n_init=200).fit_predict(
                latent)  # n_jobs>1 ?
        elif prediction_algorithm == 'gmm':
            gmm = GMM(self.gene_dataset.n_labels)
            gmm.fit(latent)
            labels_pred = gmm.predict(latent)
        asw_score = silhouette_score(latent, labels)
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        # UCA = unsupervised clustering accuracy (optimal label matching)
        uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
        if verbose:
            print(
                "Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                % (name, asw_score, nmi_score, ari_score, uca_score))
        return asw_score, nmi_score, ari_score, uca_score
def get_performance(self, y_true, y_pred):
    """Return (purity, NMI, ARI) for a predicted clustering."""
    purity = self.get_purity(y_true, y_pred)
    from sklearn.metrics import adjusted_rand_score as ARI
    from sklearn.metrics import normalized_mutual_info_score as NMI

    return purity, NMI(y_true, y_pred), ARI(y_true, y_pred)
def compute_dist():
    """Pickle the module-level pairwise-distance matrix D to disk."""
    with open(path + 'Trapnell_TCC_pairwise_distance_dge.dat', 'wb') as outfile:
        pickle.dump(D, outfile)


path = '/home/zgy_ucla_cs/Research/singleCell/TCC_old_pipeline/scRNA-Clustering/Trapnell_pipeline/'

# NOTE(review): the first load is immediately overwritten by the second;
# only the "_31" distance matrix is actually used — confirm intent.
with open(path + "Trapnell_TCC_pairwise_distance_21.dat", 'rb') as f:
    D = pickle.load(f, encoding='latin1')
with open(path + "Trapnell_TCC_pairwise_distance_31.dat", 'rb') as f:
    D = pickle.load(f, encoding='latin1')

cluster_labels = np.loadtxt(path + 'Trapnells_data/Trapnell_labels.txt',
                            dtype=str).astype(int) - 1
num_of_clusters = 3
similarity_mat = D.max() - D
labels_spectral = spectral(num_of_clusters, similarity_mat)
print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels, labels_spectral))

# ===================== scVI ===================== #
expression_train, expression_test, cluster_labels, c_test = train_test_split(X, X_type, random_state=0)
batch_size = 128
learning_rate = 0.001
epsilon = 0.01
latent_dimension = 10

tf.reset_default_graph()
expression = tf.placeholder(tf.float32, (None, X.shape[1]), name='x')
kl_scalar = tf.placeholder(tf.float32, (), name='kl_scalar')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon)
training_phase = tf.placeholder(tf.bool, (), name='training_phase')

# getting priors: library-size statistics in log space
log_library_size = np.log(np.sum(X, axis=1))
mean, var = np.mean(log_library_size), np.var(log_library_size)

# loading data / building the scVI model graph
model = scVI.scVIModel(expression=expression, kl_scale=kl_scalar,
                       optimize_algo=optimizer, phase=training_phase,
                       library_size_mean=mean, library_size_var=var,
                       n_latent=latent_dimension)

# starting computing session
sess = tf.Session()

# Initialize the graph and fit the training set
# this takes less than a minute on a Tesla K80
sess.run(tf.global_variables_initializer())
result = train_model(model, (X, X), sess, 250, batch_size=batch_size)

dic_full = {expression: X, training_phase: False}
latent = sess.run(model.z, feed_dict=dic_full)
# clustering_score = cluster_scores(latent, len(cell_types), cluster_labels)
clustering_score = cluster_scores(latent, np.max(cluster_labels), cluster_labels)
# BUGFIX: cluster_scores returns [silhouette, NMI, ARI]; the original
# print swapped the "Adjusted Rand Index" and "Normalized Mutual
# Information" captions for indices [1] and [2].
print("Silhouette", clustering_score[0],
      "\nNormalized Mutual Information", clustering_score[1],
      "\nAdjusted Rand Index", clustering_score[2])
def evaluate(net, loader):
    """Evaluate clustering quality of ``net`` on ``loader``.

    Runs the network over the loader, takes the arg-max of the last
    head's logits as the predicted cluster, and returns
    (accuracy, NMI, ARI) against the ground-truth targets.  Accuracy
    uses the optimal cluster-to-label matching found by the Hungarian
    algorithm (scipy's linear_sum_assignment).
    """
    net.eval()
    predicts = np.zeros(len(loader.dataset), dtype=np.int32)
    labels = np.zeros(len(loader.dataset), dtype=np.int32)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(loader):
            logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            inputs = inputs.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(inputs)[-1]
            start = batch_idx * loader.batch_size
            end = min(start + loader.batch_size, len(loader.dataset))
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()

    # compute accuracy via optimal assignment of clusters to labels
    num_classes = labels.max().item() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    # BUGFIX: `xrange` is Python 2 only and raises NameError on
    # Python 3 (this code path uses print() and torch elsewhere).
    for i in range(predicts.shape[0]):
        count_matrix[predicts[i], labels[i]] += 1
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    return acc, NMI(labels, predicts), ARI(labels, predicts)
def score_function(self, output):
    """Return loss / adjacency-reconstruction MSE / NMI for ``output``."""
    model = self.model
    data = self.data
    return {
        'loss': model.loss_function(data, output),
        'error': self.mse(output['adj_recon'], data.adjmat),
        'nmi': NMI(data.labels.cpu(), model.classify(data),
                   average_method='arithmetic'),
    }
def test_NMI():
    """statistics.NMI must match sklearn's NMI on simple label pairs."""
    cases = [
        ([0, 1, 2, 3], [0, 1, 2, 3]),  # perfect agreement
        ([0, 0, 0, 3], [0, 2, 2, 2]),  # partial agreement
    ]
    for true_vals, pred_vals in cases:
        label_true = np.asarray(true_vals)
        label_pred = np.asarray(pred_vals)
        expected = NMI(label_true, label_pred)
        actual = statistics.NMI(label_true, label_pred)
        assert expected == actual
def eval_output(self):
    """Print the final cost reduction and NMI score; return the NMI.

    Uses self.label (per-iteration matrix when self.location ==
    'python', otherwise a flat assignment) against the ground-truth
    partition self.gt_par.
    """
    # ROBUSTNESS: a single recorded cost has no previous value to
    # compare against — the original indexed self.cost[-2]
    # unconditionally and raised IndexError in that case.
    if self.cost and len(self.cost) >= 2:
        reduction = (self.cost[-2] - self.cost[-1]) / self.cost[-2] * 100
        print(f"The final cost reduction is {round(reduction, 2)}%")
    if self.location == 'python':
        final_assign = self.label[-1, :]
    else:
        final_assign = self.label
    nmi = NMI(self.gt_par, final_assign)
    print(f"The final NMI score is {round(nmi, 4)}")
    return nmi
def Lda_topic_model(docs, dictionary, nb_topics, true_labels):
    """Train an LDA topic model, plot the top words per topic, and score
    the hardest-topic assignment of each document against ``true_labels``
    (accuracy, micro/macro F1, NMI).

    NOTE(review): the model is trained with a hard-coded k = 5 topics
    while ``nb_topics`` is only used for the plotting loop, and the
    cluster-to-class mapping also iterates range(5) — calling with
    nb_topics != 5 mixes the two; confirm intent.
    """
    k = 5
    lda = LdaModel(docs, num_topics=k, id2word=dictionary, passes=10)
    # word[::-1] reverses each word — presumably for right-to-left
    # display in matplotlib; TODO confirm.
    top_words = [[word[::-1] for word, _ in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _, beta in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    nb_words = 12
    f, ax = plt.subplots(3, 2, figsize=(20, 15))
    for i in range(nb_topics):
        # ax = plt.subplot(gs[i])
        m, n = np.unravel_index(i, shape=(3, 2))[0], np.unravel_index(i, shape=(3, 2))[1]
        ax[m, n].barh(range(nb_words),
                      top_betas[i][:nb_words],
                      align='center',
                      color='green',
                      ecolor='black')
        ax[m, n].invert_yaxis()
        ax[m, n].set_yticks(range(nb_words))
        ax[m, n].set_yticklabels(top_words[i][:nb_words])
        ax[m, n].set_title("Topic " + str(i))
    plt.show()
    # get distribution of docs on topics.
    dist_on_topics = lda.get_document_topics(docs)
    topic_predict = []
    for d in dist_on_topics:
        # pick the most probable topic for this document
        p = 0
        win_topic = 0
        print(d)
        for i, t in enumerate(d):
            if t[1] > p:
                p = t[1]
                win_topic = t[0]
        print(win_topic)
        topic_predict.append(win_topic)
    mat = confusion_matrix(true_labels, topic_predict)
    print(mat)
    # map each predicted topic to the true class it most often contains
    cluster_to_class = {}
    for i in range(5):
        cluster_to_class[i] = np.argmax(mat[:, i])
    custom_labels = [cluster_to_class[c] for c in topic_predict]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ", f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro", f1_score(true_labels, custom_labels, average='macro'))
    print("NMI", NMI(true_labels, custom_labels))
def analysis():
    """Compare GAN vs. LSI cluster assignments on the PBMC two-batch
    data and print their NMI/ARI agreement."""
    gan_df = pd.read_csv('result/pbmc_two_batch-cluster_result.csv', delimiter=',')
    lsi_df = pd.read_csv('result/pbmc_two_batch-LSI-cluster_result.csv', delimiter=',')
    reference = lsi_df['predicted label'].values
    predicted = gan_df['predicted label'].values
    scores = (NMI(reference, predicted), ARI(reference, predicted))
    print("Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" % scores)
def clustering_scores(self, name, verbose=True):
    """Run K-means on the latent space of data loader ``name`` and
    return (silhouette, NMI, ARI) against the true labels.

    Returns None when the dataset has fewer than two label classes.
    """
    if self.gene_dataset.n_labels <= 1:
        return None
    latent, _, labels = get_latent(self.model, self.data_loaders[name])
    labels_pred = KMeans(self.gene_dataset.n_labels,
                         n_init=200).fit_predict(latent)  # n_jobs>1 ?
    scores = (silhouette_score(latent, labels),
              NMI(labels, labels_pred),
              ARI(labels, labels_pred))
    if verbose:
        print("Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f"
              % ((name,) + scores))
    return scores
def get_performance(y_true, y_pred, n_cluster):
    """Return the evaluation metrics for the current round.

    :param y_true: ground-truth labels
    :param y_pred: predicted cluster assignments
    :param n_cluster: number of clusters (forwarded to get_purity)
    :return: (purity, NMI, ARI)
    """
    from sklearn.metrics import adjusted_rand_score as ARI
    from sklearn.metrics import normalized_mutual_info_score as NMI

    purity = get_purity(y_true, y_pred, n_cluster)
    return purity, NMI(y_true, y_pred), ARI(y_true, y_pred)
def log_NMI_AC(cp, ground_truth, dataset, log_flag, SEED=1234):
    """Cluster ``dataset`` with agglomerative clustering (AC) and log
    its NMI against ``ground_truth``.

    ``dataset`` is either a path to a .npy file or an in-memory array.
    NOTE(review): ``basestring`` exists only in Python 2 — this breaks
    on Python 3; confirm the target interpreter.
    """
    # Init clustering hyperparameters
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    # cluster_init is only consumed by the commented-out KMeans variant
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # KMeans model
    # km = KMeans(n_clusters=n_clusters, n_init=cluster_init, n_jobs=-1,
    #             random_state=SEED)
    km = AC(n_clusters=n_clusters)
    if isinstance(dataset, basestring):
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    log('--------------- {} {} ------------------------'.format(
        log_flag, NMI(ground_truth, pred)))
def eval_adv_efficiency(self, Y, Yadv):
    """Score the adversarial labelling ``Yadv`` against ``Y`` under the
    configured ``self.objective``; unknown objectives yield +inf.
    """
    objective = self.objective
    if objective == "frobenius":
        # compare the pairwise co-assignment matrices of the two labelings
        y = self.one_hot(Y)
        yadv = self.one_hot(Yadv)
        diff = y @ torch.t(y) - yadv @ torch.t(yadv)
        return -torch.norm(diff, p=2) / len(Y)
    if objective == "AMI":
        return max(AMI(Y.cpu(), Yadv.cpu()), 0.0)
    if objective == "NMI":
        return max(NMI(Y.cpu(), Yadv.cpu()), 0.0)
    if objective == "accuracy":
        return AC(Y.cpu(), Yadv.cpu())
    return float("inf")
def kmean_clustering(doc_vectors, true_labels):
    """K-means (k=5) the document vectors, align clusters to classes via
    the confusion matrix, and print accuracy / F1 / NMI."""
    model = KMeans(n_clusters=5).fit(doc_vectors)
    predict_label = model.labels_
    mat = confusion_matrix(true_labels, predict_label)
    print(mat)
    # map each cluster to the true class it most often contains
    cluster_to_class = {i: np.argmax(mat[:, i]) for i in range(5)}
    custom_labels = [cluster_to_class[c] for c in predict_label]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ", f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro", f1_score(true_labels, custom_labels, average='macro'))
    # NMI is permutation invariant, so the raw cluster ids suffice here
    print("NMI", NMI(true_labels, predict_label))
def score(self, z, true_labels):
    """Compute silhouette / NMI / ARI for the fitted labels, plus
    accuracy when the fitted k matches the true label count."""
    assert self.labels is not None, "Cannot compute clustering scores before fitting the data."
    self.silhouette_score = silhouette_score(z, self.labels)
    # geometric averaging matches the original paper's NMI definition
    self.nmi = NMI(true_labels, self.labels, average_method="geometric")
    self.ari = ARI(true_labels, self.labels)
    true_k = len(np.unique(true_labels))
    if self.k != true_k:
        print(
            "Fitted number of labels ({}) is not equal to given true number of labels ({}). Cannot "
            "compute accuracy.".format(self.k, true_k))
        return
    self.accuracy = accuracy(true_labels, self.labels)
def clustering_scores(self, prediction_algorithm="knn"):
    """Cluster the latent space and return (ASW, NMI, ARI, UCA).

    prediction_algorithm: "knn" runs K-means (historical name in this
    codebase); "gmm" fits a Gaussian mixture.  Returns None when the
    dataset has fewer than two label classes.
    """
    if self.gene_dataset.n_labels <= 1:
        return None
    latent, _, labels = self.get_latent()
    if prediction_algorithm == "knn":
        labels_pred = KMeans(self.gene_dataset.n_labels,
                             n_init=200).fit_predict(latent)  # n_jobs>1 ?
    elif prediction_algorithm == "gmm":
        gmm = GMM(self.gene_dataset.n_labels)
        labels_pred = gmm.fit(latent).predict(latent)
    asw_score = silhouette_score(latent, labels)
    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
    logger.debug("Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                 % (asw_score, nmi_score, ari_score, uca_score))
    return asw_score, nmi_score, ari_score, uca_score
def main():
    """Run online topic modelling (OTDA) over the corpus in batches and
    report the NMI of the arg-max topic assignment against the "Tag1"
    column, plus detected emerging topics.
    """
    data_df = readdata(dataFolder / corpus)
    data_df = shuffle(data_df, random_state=0)
    texts = data_df['text']
    print(len(texts))
    texts = preprocessortext(texts)
    texts = filter_text(texts)
    vectorizer, vocab = getvocabulary(texts)
    print('vocab', len(vocab))
    doctopic = None
    model = OTDA(len(vocab), num_topics, alpha=0.1, beta=0, max_iter=100,
                 max_err=0.0001, fix_seed=True)
    topic_term_hist = []
    for batch_texts in generate_batches(texts, batch_size):
        docterm_m = get_docterm_matrix(batch_texts, vocab, method='tf')
        print(docterm_m.shape)
        # NOTE(review): prevS is reset to 0 for every batch, so the
        # state returned by the previous otda() call is discarded —
        # confirm this is intended.
        prevS = 0
        W, H, prevS = otda(docterm_m, model, prevS)
        print(W.shape, H.shape)
        # accumulate per-document topic distributions across batches
        if isinstance(doctopic, type(None)) == True:
            doctopic = H
        else:
            doctopic = np.vstack((doctopic, H))
        topic_term_hist.append(W)
    cluster = np.argmax(doctopic, axis=1)
    nmi = NMI(data_df["Tag1"], cluster, average_method='arithmetic')
    print('NMI', nmi)
    emerging_topics = detect_emergingtopics(topic_term_hist, 90)
    print('emerging', emerging_topics)
def test_clustering(data, Y, coltype):
    """Run ensemble-similarity agglomerative clustering 20 times and
    print the mean/std NMI and the mean/std per-run build time.

    data / coltype: forwarded to build_ensemble_inc.
    Y: ground-truth labels; also fixes the number of clusters.
    """
    nmis = []
    times = []
    n_clusters = len(set(Y))
    for _ in range(20):
        cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                          affinity="precomputed",
                                          linkage="average")
        start = time.time()
        sim = build_ensemble_inc(data, coltypes=coltype)
        duration = time.time() - start
        times.append(duration)
        distance = sim_to_dist(sim)
        predicted = cluster.fit_predict(distance)
        nmis.append(NMI(Y, predicted))
    print("Summary of all the runs.\n")
    # BUGFIX: the summary previously aggregated the scalar `duration`
    # (only the last run; std always 0) instead of the `times` list.
    print(
        "Mean nmi : {0} (standard deviation : {1}), mean duration : {2} (standard deviation : {3}) \n"
        .format(np.mean(nmis), np.std(nmis), np.mean(times), np.std(times)))
    print("----------\n")
def clustering_scores(X, y, prediction_algorithm='knn'):
    """Cluster X into as many groups as there are unique labels in y and
    return (silhouette, NMI, ARI, UCA) against y.

    prediction_algorithm: 'knn' runs K-means (historical name);
    'gmm' fits a Gaussian mixture.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import adjusted_rand_score as ARI
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import silhouette_score
    from sklearn.mixture import GaussianMixture as GMM

    cluster_num = np.unique(y).shape[0]
    if prediction_algorithm == 'knn':
        labels_pred = KMeans(cluster_num, n_init=200).fit_predict(X)
    elif prediction_algorithm == 'gmm':
        labels_pred = GMM(cluster_num).fit(X).predict(X)
    asw_score = silhouette_score(X, y)
    nmi_score = NMI(y, labels_pred)
    ari_score = ARI(y, labels_pred)
    # UCA needs integer labels for the optimal matching step
    uca_score = unsupervised_clustering_accuracy(convert_label_to_int(y),
                                                 labels_pred)[0]
    return asw_score, nmi_score, ari_score, uca_score
# Load dataset embeddings and labels. obj = np.load('mit67_embeddings.npz') labels = obj['labels'] embeddings = obj['embeddings'] # Set up the subset of labels to visualize. n_labels = 5 _uniq = np.unique(labels)[:n_labels] print('The {n} labels selected to clusterize:'.format(n=n_labels), _uniq) # Create the embeddings matrix (and its corresponding labels list) # containing instances belonging to labels in the previous subset. subsets_emb = np.zeros((0, 4096)) true_labels = [] for idx, _lab in enumerate(_uniq): part_emb = embeddings[np.where(labels == _lab)] subsets_emb = np.concatenate((subsets_emb, part_emb)) true_labels += [_lab for _ in range(part_emb.shape[0])] true_labels = np.array(true_labels) # Use a dimensionality reduction technique (e.g., PCA). reduced_emb = PCA(n_components=100).fit_transform(subsets_emb) # Apply a clustering technique (e.g., KMeans) and evaluate its performance with # a metric (e.g., Normalized Mutual Information). pred_labels = KMeans(n_clusters=n_labels, random_state=0).fit_predict(reduced_emb) nmi = NMI(true_labels, pred_labels) print('NMI score:', nmi)
args, _ = parser.parse_known_args()
print(str(args))

# Benchmark plain spectral clustering on every batch of every dataset,
# accumulating ARI, NMI and elapsed time per instance.
benchmark = torch.load(os.path.join(benchmarks_path, args.benchmarkfile))
accm = Accumulator('ari', 'nmi', 'et')
for batch in tqdm(benchmark):
    B = batch['X'].shape[0]
    for b in range(B):
        X = to_numpy(batch['X'][b])
        # labels are one-hot on the last axis; argmax recovers class ids
        true_labels = to_numpy(batch['labels'][b].argmax(-1))
        true_K = len(np.unique(true_labels))
        tick = time.time()
        spec = SpectralClustering(n_clusters=true_K,
                                  affinity='nearest_neighbors',
                                  n_neighbors=10).fit(X)
        labels = spec.labels_
        accm.update([
            ARI(true_labels, labels),
            NMI(true_labels, labels, average_method='arithmetic'),
            time.time() - tick
        ])

save_dir = os.path.join(results_path, 'baselines', 'mmaf_spectral')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('spectral_baseline', os.path.join(save_dir, args.filename))
logger.info(accm.info())
def optimize(self):
    """Run the bat-algorithm community detector 20 times and print, for
    each run, the wall-clock time, best modularity (fitness), NMI
    against the ground-truth partition (self.pred) and the number of
    communities found; finally draw the graph coloured by the last
    run's best position.

    NOTE(review): formatting reconstructed — the original indentation
    was lost, so the exact loop nesting of increase_r/decrease_a and
    fmax should be confirmed against the upstream source.
    """
    it = 0
    tm = []    # wall-clock seconds per run
    md = []    # best modularity (max fitness) per run
    nmi = []   # NMI vs. ground truth per run
    ittr = []
    com = []   # number of communities per run
    for itr in range(20):
        # fresh state for every independent run
        self.__init__()
        startTime = time.time()
        self.Input_Graph()
        self.bats_init()
        #for i in self.bats:
        #    print(i.node)
        t = self.G.number_of_nodes()
        vel = []
        for ll in range(t):
            vel.append(0)
        self.velocity = vel
        #print(self.velocity)
        #self.best_positions=vel;
        #self.best_p=vel
        #print('bf',self.velocity)
        self.best_bats()
        #print(self.velocity)
        #print(self.best_positions)
        #print('af',self.velocity)
        print("iteration : ", (itr + 1))
        for i in range(self.iteration):
            #print('af',self.velocity)
            #print("Iteration : %d"%(i+1),end='\r')
            #print("iteration : ",(i+1))
            inc = 0
            for p in self.bats:
                #self.velocity=vel
                #print(p.node)
                #print('bf',self.best_positions)
                self.updatevelocity(p)
                #print('af',self.best_positions)
                t1 = self.updatepos(p)
                y = self.rearrange(t1)
                # with probability tied to the pulse rate R, replace the
                # candidate with a local random-walk solution (eq. 4)
                if (round(np.random.uniform(0, 1), 2) > self.R):
                    #t2=self.eq_4()
                    #y=self.rearrange(t2)
                    y = self.eq_4()
                rd = round(np.random.uniform(0, 1), 2)
                fnew = self.fitness(y)
                #print(fnew)
                fi = self.bats_fitness_value[inc]
                bp = []
                # accept the new solution when loudness A allows and the
                # fitness improved over this bat's previous value
                if (rd < self.A and fi < fnew):
                    #print("p ",p.node)
                    #print("y ",y.node)
                    p = y.copy()
                    self.bats_fitness_value[inc] = fnew
                    #self.best_p =
                    #self.bats.bats_fitness_values[(self.bats_fitness_values.keys())[inc]]=fnew
                    it = i
                    #print(fnew)
                    self.best_positions = [y.node[nd]['pos'] for nd in y]
                    self.best_p[inc] = [y.node[nd]['pos'] for nd in y]  # this is an array of positions
                    #self.best_p[inc] = []
                    #if (self.pos_modularity<fnew):  # here comparison with old fitness if good then position vector and
                    #    self.pos_modularity=fnew    # modularity is update
                    #fit = self.fitness(y)
                    #print()
                    #    self.best_p = [y.node[nd]['pos'] for nd in y]
                    #print(self.best_p)
                    #print(self.best_positions)
                #print("communites : ",len(set(self.best_positions)))
                #print(self.best_positions)
                self.increase_r(i)
                self.decrease_a()
                inc += 1
        fmax = max(self.bats_fitness_value)
        #print(' max ',self.pos_modularity)
        #print(self.best_p)
        #fmax=max(self.bats_fitness_values.keys())
        #print(self.bats_fitness_value)
        #print(self.best_positions)
        pos = self.best_p[self.bats_fitness_value.index(fmax)]
        self.positions = pos
        #print("communites : ",len(set(pos)))
        #print("Position : ",self.best_positions)
        print(fmax)
        n = NMI(list(self.pred.values()), pos)
        tm.append(float(format(np.round((time.time() - startTime), 2))))
        md.append(fmax)
        nmi.append(n)
        #ittr.append(it)
        com.append(len(set(pos)))
    print("\n\n**********************************************************")
    print('\nThe script take {0} second ', tm)
    print("\nModularity is : ", md)
    print("\nNMI : ", nmi)
    print("\nNumber of Communites : ", com)
    #print("\nGlobal Best Position : ",pos)
    #nx.draw(self.G,node_color=[self.G.node[i]['pos'] for i in self.G])
    nx.draw(self.G, node_color=pos)
    plt.show()
def evaluate(net, loader, writer, epoch):
    """Evaluate ``net`` on ``loader``: cluster accuracy via Hungarian
    matching, NMI, ARI and per-class precision/recall/F1, logged and
    written to the TensorBoard ``writer``.  Returns the accuracy.

    NOTE(review): formatting reconstructed — the nesting of the
    embedding-collection and image-denormalisation branches should be
    confirmed against the upstream source.
    """
    net.eval()
    predicts = np.zeros(len(loader.dataset), dtype=np.int32)
    labels = np.zeros(len(loader.dataset), dtype=np.int32)
    intermediates = np.zeros((len(loader.dataset), 2048), dtype=np.float32)
    images = np.zeros((len(loader.dataset), 3, 64, 64), dtype=np.float32)
    print(f"Evaluating on {len(loader.dataset)} samples")
    with torch.no_grad():
        for batch_idx, (batch, targets) in enumerate(loader):
            # logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            batch = batch.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(batch, -1)
            start = batch_idx * loader.batch_size
            end = start + loader.batch_size
            end = min(end, len(loader.dataset))
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()
            if epoch % cfg.embedding_freq == 0:
                # collect penultimate features for the embedding projector
                intermediates[start:end] = net(batch, -1, True).cpu().numpy()
                if not cfg.tfm_adaptive_thresholding:
                    # undo per-channel normalisation so logged images
                    # render naturally
                    for i in range(3):
                        batch[:, i] = (batch[:, i] * cfg.tfm_stds[i]) + cfg.tfm_means[i]
                images[start:end] = torch.nn.functional.interpolate(
                    batch, size=(64, 64), mode='bicubic',
                    align_corners=False).cpu().numpy()
    # TODO: Gather labels and predicts
    # compute accuracy
    num_classes = labels.max().item() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    for i in range(predicts.shape[0]):
        count_matrix[predicts[i], labels[i]] += 1
    # optimal cluster-to-label assignment (Hungarian algorithm)
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    nmi = NMI(labels, predicts)
    ari = ARI(labels, predicts)
    # compute f1 scores per class
    predicts_reassigned = reassignment[predicts, 1]
    precision = precision_score(labels, predicts_reassigned, average=None,
                                zero_division=0)
    recall = recall_score(labels, predicts_reassigned, average=None,
                          zero_division=0)
    f1 = f1_score(labels, predicts_reassigned, average=None, zero_division=0)
    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    if cfg.local_rank == 0:
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)
        for i in range(len(f1)):
            writer.add_scalar(f'Evaluate/f1_{i}', f1[i], epoch)
            writer.add_scalar(f'Evaluate/precision_{i}', precision[i], epoch)
            writer.add_scalar(f'Evaluate/recall_{i}', recall[i], epoch)
        if epoch % cfg.embedding_freq == 0 and cfg.embedding_freq != -1:
            writer.add_embedding(intermediates, labels, images, epoch, cfg.session)
    return acc
# Benchmark VB-MoG on every dataset in the benchmark: per-dataset mean
# log-likelihood, ARI, NMI, cluster-count MAE and elapsed time.
for dataset in tqdm(benchmark):
    true_labels = to_numpy(dataset['labels'].argmax(-1))
    X = to_numpy(dataset['X'])
    ll = 0
    ari = 0
    nmi = 0
    mae = 0
    et = 0
    for b in range(len(X)):
        tick = time.time()
        vbmog.run(X[b], verbose=False)
        et += time.time() - tick
        ll += vbmog.loglikel(X[b])
        labels = vbmog.labels()
        ari += ARI(true_labels[b], labels)
        nmi += NMI(true_labels[b], labels, average_method='arithmetic')
        # absolute error in the number of discovered clusters
        mae += abs(len(np.unique(true_labels[b])) - len(np.unique(labels)))
    # average the accumulated metrics over the batches
    ll /= len(X)
    ari /= len(X)
    nmi /= len(X)
    mae /= len(X)
    et /= len(X)
    accm.update([ll.item(), dataset['ll'], ari, nmi, mae, et])

save_dir = os.path.join(results_path, 'baselines', 'vbmog')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('vbmog_baseline', os.path.join(save_dir, args.filename))
logger.info(accm.info())
def main():
    # Cluster trajectory features (Histograms of Oriented Tracklets)
    # extracted from the LOST surveillance dataset, then train and test
    # a logistic-regression classifier on the cluster labels (NMI
    # reported at the end).
    # NOTE(review): Python 2 syntax (print statements) — do not run
    # under Python 3.  Indentation reconstructed; the placement of the
    # hid1_space/weights lines relative to the load_deep_AE() guard
    # should be confirmed against the upstream source.

    ##divide image into patches(polygons) and get the positions of each one
    scene = cv2.imread('C:/Users/dario.dotti/Documents/LOST_dataset/camera001.jpg')
    list_poly = my_img_proc.divide_image(scene)
    with open('C:/Users/dario.dotti/Documents/LOST_dataset/8_2013-12_2012_camera001/raw_traj_organized_by_id.txt', 'rb') as handle:
        slices = pickle.load(handle)
    print 'data extracted'

    ##Feature Extraction METHOD: compute histogram of Oriented Tracklets for all the tracklets in every patch
    normalized_data_HOT,orig_list_id = main_camera017.histograms_of_oriented_trajectories(list_poly,slices)

    ##Feature Extraction METHOD: Compute the features velocity-acceleration-curvature to distinguish between pedestrian and cars
    #normalized_data_HOT = velo_acc_curva_feature(list_poly,slices)

    ####Experinment AUTOENCODER
    # ##3)load the model of the trained AE
    # if my_AE.load_training():
    #     print 'model loaded'
    #     weights = my_AE.get_weights()
    #     weights= weights.T

    ###)3a)load the model of the trained deep AE
    if my_AE.load_deep_AE():
        print 'model loaded'
        list_weights = my_AE.get_stacked_weigths([1,2])

    # project the features into the first hidden space of the deep AE
    hid1_space = np.dot(normalized_data_HOT,list_weights[0])
    #hid2_space = np.dot(hid1_space,list_weights[1])
    weights= list_weights[1].T

    ####COMPUTE THE MOST SIMILAR FEATURE FOR EVERY WEIGHT
    #mostSimilar_feature_sorted,seeds_index = my_exp.get_most_similar_feature_from_AEweights(hid1_space,weights,orig_list_id,scene,slices)

    ###CLUSTERING
    ####seed
    # seeds = []
    #
    # ##get the traj with strongest weight and use it to initialize mean shift
    # map(lambda s:seeds.append(normalized_data_HOT[s]),seeds_index)
    #
    # cluster_pred_AE,cluster_centers_seeds = my_exp.cluster_fit_predict(normalized_data_HOT,seeds)
    # freq_counter_seedsAE = Counter(cluster_pred_AE)
    #
    # print 'seeds from AE'
    # print freq_counter_seedsAE
    #
    # main_camera017.color_traj_based_on_clusters(slices,cluster_pred_AE,freq_counter_seedsAE,scene)
    #############

    ##raw features cluster
    cluster_pred_raw,cluster_centers_unseeds = my_exp.cluster_fit_predict(normalized_data_HOT,[])
    freq_counter_unseed = Counter(cluster_pred_raw)
    print freq_counter_unseed

    # ##getting same number of cluster with raw features
    # temp_seeds = []
    # for trj_id in freq_counter_unseed.keys()[:len(freq_counter_seedsAE.keys())] :
    #     ##get index of every class
    #     index = np.where(cluster_pred_raw == trj_id)[0][0]
    #     temp_seeds.append(normalized_data_HOT[index])
    #
    # cluster_pred_raw,cluster_centers_unseeds = my_exp.cluster_fit_predict(normalized_data_HOT,temp_seeds)
    # freq_counter_seeds_raw = Counter(cluster_pred_raw)
    # print 'seeds from raw features'
    # print freq_counter_seeds_raw

    main_camera017.color_traj_based_on_clusters(slices,cluster_pred_raw,freq_counter_unseed,scene)

    ####SUPERVISED CLASSIFIER
    test_samples = []
    test_labels= []
    training_samples = []
    training_labels= []
    test_pos = []

    ##get test samples from every class
    for trj_c,trj_id in enumerate(freq_counter_unseed):
        ##get index of every class
        index = np.where(cluster_pred_raw == trj_id)[0]
        #take 10% of every class as test samples
        #print w[trj_c]
        random_index = random.sample(index,int((len(index)*0.15)))
        for i,c in enumerate(cluster_pred_raw):
            if i in random_index:
                test_samples.append(normalized_data_HOT[i])
                test_labels.append(trj_id)
                test_pos.append(i)
    print len(test_samples)

    # everything not held out for testing becomes training data
    for i,sample in enumerate(normalized_data_HOT):
        if i not in test_pos:
            training_samples.append(sample)
            training_labels.append(cluster_pred_raw[i])
    print np.array(training_samples).shape

    # #train Logistic regression classifier
    my_exp.logistic_regression_train(training_samples,training_labels)
    # #test
    pred = my_exp.logistic_regression_predict(test_samples)
    print NMI(test_labels,pred)
def clustering(Xsvd, cells, dataset, suffix, labels=None, tlabels=None,
               method='knn', istsne=True, name='', batch_labels=None, seed=42):
    """Embed Xsvd with t-SNE, cluster the embedding with the chosen
    method, score against ``labels`` (NMI/ARI) when available, save
    t-SNE scatter plots and a CSV of predicted labels per cell.

    NOTE(review): the 'gmm' branch fits on a variable ``mat`` that is
    only assigned inside the 'louvain' branch — calling with
    method='gmm' raises NameError; confirm the intended input.
    """
    tsne = TSNE(n_jobs=24).fit_transform(Xsvd)
    for n_components in [15]:
        if method == 'gmm':
            clf = mixture.GaussianMixture(n_components=n_components).fit(mat)
            labels_pred = clf.predict(tsne)
        elif method == 'knn':
            # K-means despite the 'knn' name
            labels_pred = KMeans(n_components, n_init=200).fit_predict(tsne)  # n_jobs>1 ?
        elif method == 'dbscan':
            labels_pred = DBSCAN(eps=0.3, min_samples=10).fit(tsne).labels_
        elif method == 'spectral':
            spectral = cluster.SpectralClustering(n_clusters=n_components,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            labels_pred = spectral.fit_predict(tsne)
        elif method == 'louvain':
            from scipy.spatial import distance
            for louvain in [30]:
                print('****', louvain)
                # build a k-NN distance graph on the SVD space and run
                # Louvain community detection on it
                mat = kneighbors_graph(Xsvd, louvain, mode='distance',
                                       include_self=True).todense()
                G = nx.from_numpy_matrix(mat)
                partition = community.best_partition(G, random_state=seed)
                labels_pred = []
                for i in range(mat.shape[0]):
                    labels_pred.append(partition[i])
                labels_pred = np.array(labels_pred)
                print('louvain', louvain, tsne[:5], len(labels), len(labels_pred))
        #print(np.unique(labels_pred))
        if labels is not None:
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            print(
                n_components, method,
                "Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" % (nmi_score, ari_score))
        if istsne:
            n_components = len(np.unique(labels_pred))
            vis_x = tsne[:, 0]
            vis_y = tsne[:, 1]
            colors = [
                'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink',
                'yellow', 'black', 'teal', 'plum', 'tan', 'bisque', 'beige',
                'slategray', 'brown', 'darkred', 'salmon', 'coral', 'olive',
                'lightpink', 'teal', 'darkcyan', 'BlueViolet',
                'CornflowerBlue', 'DarkKhaki', 'DarkTurquoise'
            ]
            show_tsne(tsne, labels,
                      'result/%s/%s-%s-LSI-true.png' % (dataset, name, suffix),
                      tlabels=tlabels)
            show_tsne(tsne, labels_pred,
                      'result/%s/%s-%s-LSI-pred.png' % (dataset, name, suffix))
            # dump the per-cell predictions alongside their t-SNE coords
            with open('result/%s-LSI-cluster_result.csv' % (dataset), 'w') as f:
                f.write('cell,predicted label,tsne-1,tsne-2\n')
                for cell, pred, t in zip(cells, labels_pred, tsne):
                    f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))
        if batch_labels is not None:
            show_tsne(
                tsne, batch_labels,
                'result/%s/%s-GMVAE-%s-%s-batch.png' % (dataset, dataset, suffix, name))