def entropy_batch_mixing(self, verbose=False, **kwargs):
    """Score batch mixing of the latent space for a two-batch dataset.

    The score is only computed when ``self.gene_dataset.n_batches`` equals 2;
    for any other number of batches the function returns ``None``.

    :param verbose: when true, print the score before returning it.
    :param kwargs: forwarded unchanged to the ``entropy_batch_mixing``
        function that is called on the sub-sampled latent space.
    :return: the value returned by that call, or ``None``.
    """
    if self.gene_dataset.n_batches != 2:
        return None

    latent, batch_indices, _ = self.get_latent()
    # The score is evaluated on a sub-sample chosen by
    # select_indices_evenly(2000, batch_indices), not on every cell.
    subset = select_indices_evenly(2000, batch_indices)
    # NOTE(review): this def takes ``self``, so it is presumably a method and
    # the bare name below resolves to a module-level function of the same
    # name rather than to this def -- confirm against the enclosing class.
    score = entropy_batch_mixing(latent[subset, :], batch_indices[subset], **kwargs)
    if verbose:
        print("Entropy batch mixing :", score)
    return score
def _save_latent_plots(latent, labels, batch_indices, keys, plotname):
    """Save the label-coloured and batch-coloured scatter plots of ``latent``."""
    sample = select_indices_evenly(2000, batch_indices)
    colors = sns.color_palette('bright') + \
        sns.color_palette('muted') + \
        sns.color_palette('dark') + \
        sns.color_palette('pastel') + \
        sns.color_palette('colorblind')
    latent_s = latent[sample, :]
    label_s = labels[sample]
    batch_s = batch_indices[sample]
    # Anything that is not already two-dimensional is embedded with UMAP.
    if latent_s.shape[1] != 2:
        latent_s = UMAP(spread=2).fit_transform(latent_s)

    fig, ax = plt.subplots(figsize=(18, 18))
    for i, k in enumerate(np.argsort(keys)):
        mask = label_s == k
        # Cycle over however many colours were actually built: the palette
        # length depends on the installed seaborn version.
        ax.scatter(latent_s[mask, 0], latent_s[mask, 1], c=colors[i % len(colors)],
                   label=keys[k], edgecolors='none')
    fig.patch.set_visible(False)
    ax.axis('off')
    fig.tight_layout()
    fig.savefig('../' + plotname + '.labels.pdf')
    # Close explicitly: pyplot keeps every figure alive until it is closed.
    plt.close(fig)

    fig, ax = plt.subplots(figsize=(18, 18))
    ax.scatter(latent_s[:, 0], latent_s[:, 1], c=batch_s, alpha=0.8)
    ax.axis('off')
    fig.savefig('../' + plotname + '.batchid.pdf')
    plt.close(fig)


def eval_latent(batch_indices, labels, latent, keys, labelled_idx=None, unlabelled_idx=None,
                plotname=None, plotting=False, partial_only=True):
    """Compute clustering scores of a latent space and optionally plot it.

    ``clustering_scores`` is always called for 'knn' and 'KMeans' with the
    ``labelled_idx`` / ``unlabelled_idx`` split ("partial" results). When
    ``partial_only`` is false it is additionally called on the whole latent
    space without a split.

    :param batch_indices: per-cell batch assignment; used for sub-sampling and
        to colour the batch plot.
    :param labels: per-cell labels; indexed like ``latent``.
    :param latent: 2-D array of latent coordinates (rows are cells).
    :param keys: label names, indexed by label value; used for the plot legend
        labels and to order the plotted groups.
    :param labelled_idx: passed through to ``clustering_scores``.
    :param unlabelled_idx: passed through to ``clustering_scores``.
    :param plotname: prefix of the output files ``../<plotname>.labels.pdf``
        and ``../<plotname>.batchid.pdf``. Plotting is skipped when ``None``.
    :param plotting: when true (and ``plotname`` is given and the labels PDF
        does not exist yet) the two scatter plots are written.
    :param partial_only: when true only the partial scores are computed.
    :return: ``(res_knn, res_knn_partial, res_kmeans, res_kmeans_partial)``;
        ``res_knn`` and ``res_kmeans`` are ``0`` when ``partial_only`` is true.
    """
    res_knn_partial = clustering_scores(latent, labels, 'knn', True, labelled_idx, unlabelled_idx)
    res_kmeans_partial = clustering_scores(latent, labels, 'KMeans', True, labelled_idx, unlabelled_idx)
    if not partial_only:
        res_knn = clustering_scores(np.asarray(latent), labels, 'knn')
        res_kmeans = clustering_scores(np.asarray(latent), labels, 'KMeans')

    # ``plotname`` must be checked before it is concatenated into a path,
    # otherwise plotting=True with the default plotname=None raises TypeError.
    if plotting and plotname is not None \
            and not os.path.isfile('../' + plotname + '.labels.pdf'):
        _save_latent_plots(latent, labels, batch_indices, keys, plotname)

    if not partial_only:
        return res_knn, res_knn_partial, res_kmeans, res_kmeans_partial
    return 0, res_knn_partial, 0, res_kmeans_partial
def VAEstats(full):
    """Collect the latent space and summary statistics of a posterior.

    :param full: object exposing ``ll(verbose=...)`` and
        ``sequential().get_latent()``; the latter must return
        ``(latent, batch_indices, labels)``.
    :return: ``(latent, batch_indices, labels, stats)`` where ``batch_indices``
        and ``labels`` are flattened and ``stats`` is the list
        ``[ll, batch_entropy, -1, -1, np.arange(len(labels))]``.
        ``batch_entropy`` is ``-1`` unless exactly two distinct batch values
        are present.
    """
    ll_value = full.ll(verbose=True)
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    labels = labels.ravel()

    batch_values, batch_sizes = np.unique(batch_indices, return_counts=True)
    if len(batch_values) != 2:
        batch_entropy = -1
    else:
        # The requested sample size is the size of the smaller batch.
        chosen = select_indices_evenly(np.min(batch_sizes), batch_indices)
        batch_entropy = entropy_batch_mixing(latent[chosen, :], batch_indices[chosen])

    stats = [ll_value, batch_entropy, -1, -1, np.arange(0, len(labels))]
    return latent, batch_indices, labels, stats
def SCANVIstats(trainer_scanvi, gene_dataset):
    """Collect the latent space and summary statistics of a SCANVI trainer.

    A posterior over every cell of ``gene_dataset`` is created through
    ``trainer_scanvi.create_posterior`` and queried for its latent space.

    Side effect: ``trainer_scanvi.unlabelled_set`` is replaced by a new
    posterior built on ``gene_dataset`` with the previous unlabelled indices.

    :param trainer_scanvi: trainer exposing ``model``, ``create_posterior``,
        ``labelled_set`` and ``unlabelled_set``.
    :param gene_dataset: dataset whose ``len`` gives the number of cells.
    :return: ``(latent, batch_indices, labels, stats)`` where ``batch_indices``
        is flattened and ``stats`` is
        ``[ll, batch_entropy, acc, labelled_idx, unlabelled_idx]``.
        ``batch_entropy`` is ``-1`` unless exactly two distinct batch values
        are present.
    """
    every_cell = np.arange(len(gene_dataset))
    full = trainer_scanvi.create_posterior(trainer_scanvi.model, gene_dataset, indices=every_cell)
    ll_value = full.ll(verbose=True)
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()

    batch_values, batch_sizes = np.unique(batch_indices, return_counts=True)
    if len(batch_values) != 2:
        batch_entropy = -1
    else:
        # The requested sample size is the size of the smaller batch.
        chosen = select_indices_evenly(np.min(batch_sizes), batch_indices)
        batch_entropy = entropy_batch_mixing(latent[chosen, :], batch_indices[chosen])

    # Both index sets are read before the unlabelled posterior is rebuilt.
    labelled_idx = trainer_scanvi.labelled_set.indices
    unlabelled_idx = trainer_scanvi.unlabelled_set.indices
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, gene_dataset, indices=unlabelled_idx)
    acc = trainer_scanvi.unlabelled_set.accuracy()

    stats = [ll_value, batch_entropy, acc, labelled_idx, unlabelled_idx]
    return latent, batch_indices, labels, stats
"correlation between the cell-type composition of the subsampled dataset is %.3f" % correlation) sub_dataset = deepcopy(gene_dataset) sub_dataset.update_cells(np.concatenate(cells)) vae = VAE(sub_dataset.nb_genes, n_batch=sub_dataset.n_batches, n_labels=sub_dataset.n_labels, n_hidden=128, dispersion='gene') infer = VariationalInference(vae, sub_dataset, use_cuda=use_cuda) infer.train(n_epochs=250) latent, batch_indices, labels = infer.get_latent('sequential') keys = sub_dataset.cell_types batch_entropy = entropy_batch_mixing(latent, batch_indices) print("Entropy batch mixing :", batch_entropy) sample = select_indices_evenly(1000, labels) res = knn_purity_avg(latent[sample, :], labels[sample].astype('int'), keys=keys, acc=True) print('average classification accuracy per cluster') for x in res: print(x) knn_acc = np.mean([x[1] for x in res]) print("average KNN accuracy:", knn_acc) res = clustering_scores( np.asarray(latent)[sample, :], labels[sample], 'knn', len(np.unique(labels[sample]))) for x in res: print(x, res[x])
# latent, batch_indices,labels,keys = SEURAT.get_cca() latent = np.genfromtxt('../macosko_regev.CCA.txt') label = np.genfromtxt('../macosko_regev.CCA.label.txt',dtype='str') keys = gene_dataset.cell_types batch_indices = np.genfromtxt('../macosko_regev.CCA.batch.txt') elif model_type == 'Combat': COMBAT = COMBAT() latent = COMBAT.combat_pca(gene_dataset) latent = latent.T batch_indices = np.concatenate(gene_dataset.batch_indices) labels = np.concatenate(gene_dataset.labels) keys = gene_dataset.cell_types sample = select_indices_evenly(2000,batch_indices) batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample]) print("Entropy batch mixing :", batch_entropy) sample = select_indices_evenly(1000,labels) res = knn_purity_avg( latent[sample, :], labels[sample], keys=keys[np.unique(labels)], acc=True ) print('average classification accuracy per cluster',np.mean([x[1] for x in res])) for x in res: print(x) res = clustering_scores(np.asarray(latent)[sample,:],labels[sample],'knn',len(np.unique(labels[sample])))
# Keys read from the result dictionaries returned by ``eval_latent``.
_KNN_METRIC_KEYS = ('asw', 'nmi', 'ari', 'ca', 'weighted ca')
_KMEANS_METRIC_KEYS = ('asw', 'nmi', 'ari', 'uca', 'weighted uca')
# Keys read from the dictionaries returned by ``scmap_eval``.
_SCMAP_METRIC_KEYS = ('nmi', 'ari', 'ca', 'weighted ca')
# Number of numeric columns in every row of the ``.res.txt`` file.
_N_RESULT_COLUMNS = 61


def _write_result_row(out, model_type, values):
    """Write one space-separated row of ``_N_RESULT_COLUMNS`` values."""
    out.write(model_type + (" %.4f" * _N_RESULT_COLUMNS + "\n") % tuple(values))


def _write_cluster_row(out, model_type, annotation, result, n_cell_types):
    """Write one tab-separated row holding ``result['clusteracc']``."""
    out.write("%s\t" % (model_type) + annotation + "\t" +
              ("%.4f\t" * n_cell_types % tuple(result['clusteracc']) + "\n"))


def _score_latent_space(latent1, latent2, latent, batch_indices, labels, keys,
                        labelled_idx, unlabelled_idx, plotname, neighbors, weights):
    """Return the first 58 result columns and the four kNN result dicts.

    The columns are, in order: 4 x 5 kNN metrics, 4 x 5 KMeans metrics, one
    ``KNNJaccardIndex`` value per entry of ``neighbors`` and their weighted
    sum. The four evaluations are: whole dataset + labelled/unlabelled split,
    batch 0 -> batch 1, and batch 1 -> batch 0.
    """
    res_jaccard = [
        KNNJaccardIndex(latent1, latent2, latent, batch_indices, k)[0]
        for k in neighbors
    ]
    res_jaccard_score = np.sum(np.asarray(res_jaccard) * weights)

    res_knn, res_knn_partial, res_kmeans, res_kmeans_partial = eval_latent(
        batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
        labelled_idx=labelled_idx, unlabelled_idx=unlabelled_idx,
        plotname=plotname, plotting=False, partial_only=False)

    in_batch0 = batch_indices == 0
    in_batch1 = batch_indices == 1
    _, res_knn_partial1, _, res_kmeans_partial1 = eval_latent(
        batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
        labelled_idx=in_batch0, unlabelled_idx=in_batch1,
        plotname=plotname, plotting=False)
    _, res_knn_partial2, _, res_kmeans_partial2 = eval_latent(
        batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
        labelled_idx=in_batch1, unlabelled_idx=in_batch0,
        plotname=plotname, plotting=False)

    knn_results = [res_knn, res_knn_partial, res_knn_partial1, res_knn_partial2]
    kmeans_results = [res_kmeans, res_kmeans_partial, res_kmeans_partial1, res_kmeans_partial2]
    columns = [result[key] for result in knn_results for key in _KNN_METRIC_KEYS] + \
        [result[key] for result in kmeans_results for key in _KMEANS_METRIC_KEYS] + \
        res_jaccard + \
        [res_jaccard_score]
    return columns, knn_results


def CompareModels(gene_dataset, dataset1, dataset2, plotname, models):
    """Run a family of models and write their evaluation metrics to disk.

    Two files are (re)created, each starting with a header row:

    * ``../<plotname>/<models>.res.txt`` -- one space-separated row of 61
      metrics per model;
    * ``../<plotname>/<models>.percluster.res.txt`` -- tab-separated
      per-cluster accuracies, one row per model and annotation.

    The directory ``../<plotname>/`` must already exist. Both files are closed
    even when a model or an evaluation step raises.

    :param gene_dataset: the combined dataset handed to ``run_model``.
    :param dataset1: first individual dataset.
    :param dataset2: second individual dataset.
    :param plotname: name used for the output directory and forwarded to
        ``run_model`` / ``SubsetGenes`` as file name.
    :param models: which family to run:
        ``'others'`` (scmap, readSeurat, coral, Combat, MNN, PCA),
        ``'scvi'`` or ``'scvi_nb'`` (vae / scanvi variants), or
        ``'writedata'`` (only calls ``run_model('writedata', ...)``).
        Any other value leaves the two files with their headers only.
    """
    neighbors = np.concatenate(
        [np.arange(10, 100, 10), np.arange(100, 500, 50)])
    weights = np.concatenate([np.repeat(10, 10), np.repeat(50, 7)])
    annotations = ('all', 'p', 'p1', 'p2')
    out_prefix = '../' + plotname + '/' + models

    # NOTE(review): the Jaccard column names below are derived from the
    # weights (10 x 'res_jaccard10', 7 x 'res_jaccard50'), so they repeat.
    # Kept byte-identical because downstream readers may rely on this header.
    header = (
        "model_type "
        "knn_asw knn_nmi knn_ari knn_uca knn_wuca "
        "p_knn_asw p_knn_nmi p_knn_ari p_knn_uca p_knn_wuca "
        "p1_knn_asw p1_knn_nmi p1_knn_ari p1_knn_uca p1_knn_wuca "
        "p2_knn_asw p2_knn_nmi p2_knn_ari p2_knn_uca p2_knn_wuca "
        "kmeans_asw kmeans_nmi kmeans_ari kmeans_uca kmeans_wuca "
        "p_kmeans_asw p_kmeans_nmi p_kmeans_ari p_kmeans_uca p_kmeans_wuca "
        "p1_kmeans_asw p1_kmeans_nmi p1_kmeans_ari p1_kmeans_uca p1_kmeans_wuca "
        "p2_kmeans_asw p2_kmeans_nmi p2_kmeans_ari p2_kmeans_uca p2_kmeans_wuca "
        + " ".join(['res_jaccard' + x for x in weights.astype('str')]) + " "
        + 'jaccard_score likelihood BE classifier_acc\n'
    )

    with open(out_prefix + '.res.txt', "w+") as f, \
            open(out_prefix + '.percluster.res.txt', "w+") as g:
        f.write(header)
        g.write("model_type\tannotation\t" + "\t".join(gene_dataset.cell_types) + "\n")

        # The trainer is only built to obtain its labelled / unlabelled split.
        scanvi = SCANVI(gene_dataset.nb_genes, gene_dataset.n_batches, gene_dataset.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, gene_dataset, classification_ratio=1,
                                               n_epochs_classifier=1, lr_classification=5 * 1e-3)
        labelled_idx = trainer_scanvi.labelled_set.indices
        unlabelled_idx = trainer_scanvi.unlabelled_set.indices

        if models == 'others':
            latent1 = np.genfromtxt('../harmonization/Seurat_data/' + plotname + '.1.CCA.txt')
            latent2 = np.genfromtxt('../harmonization/Seurat_data/' + plotname + '.2.CCA.txt')
            for model_type in ['scmap', 'readSeurat', 'coral', 'Combat', 'MNN', 'PCA']:
                print(model_type)
                if model_type in ('scmap', 'coral'):
                    # For these two models the first and last values returned
                    # by run_model are passed to scmap_eval as predictions.
                    pred1, batch_indices, labels, keys, pred2 = run_model(
                        model_type, gene_dataset, dataset1, dataset2, filename=plotname)
                    res1 = scmap_eval(pred1, labels[batch_indices == 1], labels)
                    res2 = scmap_eval(pred2, labels[batch_indices == 0], labels)
                    n_cell_types = len(gene_dataset.cell_types)
                    _write_cluster_row(g, model_type, 'p1', res1, n_cell_types)
                    _write_cluster_row(g, model_type, 'p2', res2, n_cell_types)
                    # Only the p1/p2 kNN nmi/ari/ca/weighted-ca columns are
                    # filled in; every other column is -1.
                    res = [-1] * 10 + \
                        [-1] + [res1[x] for x in _SCMAP_METRIC_KEYS] + \
                        [-1] + [res2[x] for x in _SCMAP_METRIC_KEYS] + \
                        [-1] * 41
                    _write_result_row(f, model_type, res)
                else:
                    if model_type == 'readSeurat':
                        # From here on all remaining models use the subset.
                        dataset1, dataset2, gene_dataset = SubsetGenes(
                            dataset1, dataset2, gene_dataset, plotname)
                    latent, batch_indices, labels, keys, stats = run_model(
                        model_type, gene_dataset, dataset1, dataset2, filename=plotname)
                    columns, knn_results = _score_latent_space(
                        latent1, latent2, latent, batch_indices, labels, keys,
                        labelled_idx, unlabelled_idx, plotname + '.' + model_type,
                        neighbors, weights)
                    sample = select_indices_evenly(
                        np.min(np.unique(batch_indices, return_counts=True)[1]), batch_indices)
                    batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample])
                    # Likelihood and classifier accuracy columns are written as -1.
                    _write_result_row(f, model_type, columns + [-1, batch_entropy, -1])
                    n_cell_types = len(gene_dataset.cell_types)
                    for annotation, result in zip(annotations, knn_results):
                        _write_cluster_row(g, model_type, annotation, result, n_cell_types)

        elif models in ('scvi', 'scvi_nb'):
            dataset1, dataset2, gene_dataset = SubsetGenes(dataset1, dataset2, gene_dataset, plotname)
            if models == 'scvi_nb':
                latent1, _, _, _, _ = run_model('vae_nb', dataset1, 0, 0, filename=plotname, rep='vae1_nb')
                latent2, _, _, _, _ = run_model('vae_nb', dataset2, 0, 0, filename=plotname, rep='vae2_nb')
            else:
                latent1, _, _, _, _ = run_model('vae', dataset1, 0, 0, filename=plotname, rep='vae1')
                latent2, _, _, _, _ = run_model('vae', dataset2, 0, 0, filename=plotname, rep='vae2')

            for model_type in ['vae', 'scanvi1', 'scanvi2', 'vae_nb', 'scanvi1_nb', 'scanvi2_nb']:
                print(model_type)
                latent, batch_indices, labels, keys, stats = run_model(
                    model_type, gene_dataset, dataset1, dataset2, filename=plotname, rep='0')
                columns, knn_results = _score_latent_space(
                    latent1, latent2, latent, batch_indices, labels, keys,
                    labelled_idx, unlabelled_idx, plotname + '.' + model_type,
                    neighbors, weights)
                # The last three columns come from the first three entries of
                # the model's own stats.
                _write_result_row(f, model_type, columns + [stats[0], stats[1], stats[2]])
                n_cell_types = len(gene_dataset.cell_types)
                for annotation, result in zip(annotations, knn_results):
                    _write_cluster_row(g, model_type, annotation, result, n_cell_types)

        elif models == 'writedata':
            run_model('writedata', gene_dataset, dataset1, dataset2, filename=plotname)