def correct_scvi(Xs, genes):
    import numpy as np
    import torch
    from anndata import AnnData
    use_cuda = True
    torch.cuda.set_device(1)
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import SCANVI, VAE
    from scvi.dataset.anndata import AnnDataset

    all_ann = [AnnDataset(AnnData(X, var=genes)) for X in Xs]
    all_dataset = GeneExpressionDataset.concat_datasets(*all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128, n_latent=30, n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(vae, all_dataset, train_size=0.99999)

    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model, all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    return latent
def trainVAE(gene_dataset, filename, rep, nlayers=2, n_hidden=128,
             reconstruction_loss: str = 'zinb'):
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=n_hidden, n_latent=10, n_layers=nlayers,
              dispersion='gene', reconstruction_loss=reconstruction_loss)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    filename = '../' + filename + '/' + 'vae' + '.' + reconstruction_loss + \
        '.rep' + str(rep) + '.pkl'
    if os.path.isfile(filename):
        trainer.model.load_state_dict(torch.load(filename))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=250)
        torch.save(trainer.model.state_dict(), filename)
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    return full
def scVI_latent(csv_file, csv_path, vae_model=VAE, train_size=1.0,
                n_labels=0, seed=1234, n_cores=1, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae, dat,
        train_size=train_size,  # defaults to 0.8; documentation recommends 1
        use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, dat,
                                    indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low memory configurations
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reordering for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads
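# A minimal usage sketch for scVI_latent above, assuming a hypothetical raw
# count file "counts.csv" under "data/", and that set_seed, CsvDataset, VAE,
# UnsupervisedTrainer, torch, numpy and anndata are already imported as the
# snippet requires.
latent_df = scVI_latent("counts.csv", "data/", n_cores=4, use_cuda=False)
print(latent_df.head())  # one column per latent dimension Z_i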
def test_gamma_de():
    cortex_dataset = CortexDataset()
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(cortex_vae, cortex_dataset,
                                             train_size=0.5,
                                             use_cuda=use_cuda)
    trainer_cortex_vae.train(n_epochs=2)
    full = trainer_cortex_vae.create_posterior(
        trainer_cortex_vae.model, cortex_dataset,
        indices=np.arange(len(cortex_dataset)))

    n_samples = 10
    M_permutation = 100
    cell_idx1 = cortex_dataset.labels.ravel() == 0
    cell_idx2 = cortex_dataset.labels.ravel() == 1
    full.differential_expression_score(cell_idx1, cell_idx2,
                                       n_samples=n_samples,
                                       M_permutation=M_permutation)
    full.differential_expression_gamma(cell_idx1, cell_idx2,
                                       n_samples=n_samples,
                                       M_permutation=M_permutation)
def scVI_norm(csv_file, csv_path, vae_model=VAE, train_size=1.0,
              n_labels=0, seed=1234, n_cores=1, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    dat.subsample_genes(1000, mode="variance")
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae, dat,
        train_size=train_size,  # defaults to 0.8; documentation recommends 1
        use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, dat,
                                    indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low memory configurations
    normalized_values = full.sequential().get_sample_scale()
    return [normalized_values, dat.gene_names]
def scVI_ld(csv_file, csv_path, ndims, vae_model=VAE, n_labels=0,
            n_cores=1, seed=1234, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in linear_decoder.ipynb
    n_epochs = 250
    # trainer and model
    ldvae = LDVAE(dat.nb_genes, n_batch=dat.n_batches,
                  n_latent=ndims, n_labels=n_labels)
    trainerLD = UnsupervisedTrainer(ldvae, dat, use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainerLD.train(n_epochs=n_epochs, lr=lr)
    # extract mean value for the ld
    full = trainerLD.create_posterior(trainerLD.model, dat,
                                      indices=np.arange(len(dat)))
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reordering for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads
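# A hedged extension sketch for scVI_ld: before returning, the linear
# decoder's per-gene weights could also be exported via LDVAE.get_loadings()
# (the same call the scanpy wrapper later in this file uses);
# "ld_loadings.csv" is a hypothetical output path, and ldvae/dat refer to the
# locals of scVI_ld above.
import pandas as pd

loadings = ldvae.get_loadings()  # genes x ndims weight matrix
pd.DataFrame(loadings, index=dat.gene_names).to_csv("ld_loadings.csv")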
def run(self):
    n_epochs = 100
    n_latent = 10
    n_hidden = 128
    n_layers = 2

    net_data = self.data.copy()
    net_data.X = self.data.layers['counts']
    del net_data.layers['counts']
    net_data.raw = None  # Ensure that the raw counts are not accidentally used

    # Define batch indices
    le = LabelEncoder()
    net_data.obs['batch_indices'] = le.fit_transform(
        net_data.obs[self.batch].values)
    net_data = AnnDatasetFromAnnData(net_data)

    vae = VAE(net_data.nb_genes,
              reconstruction_loss='nb',
              n_batch=net_data.n_batches,
              n_layers=n_layers,
              n_latent=n_latent,
              n_hidden=n_hidden)
    trainer = UnsupervisedTrainer(vae, net_data, train_size=1, use_cuda=False)
    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model, net_data,
                                    indices=np.arange(len(net_data)))
    latent, _, _ = full.sequential().get_latent()
    self.data.obsm['X_emb'] = latent
    self.dump_to_h5ad("scvi")
def test_encoder_only():
    # torch.autograd.set_detect_anomaly(mode=True)
    # Successive assignments: only the last dataset takes effect
    dataset = LatentLogPoissonDataset(n_genes=5, n_latent=2, n_cells=300,
                                      n_comps=1)
    dataset = LatentLogPoissonDataset(n_genes=3, n_latent=2, n_cells=15,
                                      n_comps=2)
    dataset = LatentLogPoissonDataset(n_genes=5, n_latent=2, n_cells=150,
                                      n_comps=1, learn_prior_scale=True)
    # _, _, marginals = dataset.compute_posteriors(
    #     x_obs=torch.randint(0, 150, size=(1, 5), dtype=torch.float),
    #     mcmc_kwargs={"num_samples": 20, "warmup_steps": 20, "num_chains": 1}
    # )
    # stats = marginals.diagnostics()
    # print(stats)
    dataset.cuda()

    vae_mdl = LogNormalPoissonVAE(
        dataset.nb_genes,
        dataset.n_batches,
        autoregressive=False,
        full_cov=True,
        n_latent=2,
        gt_decoder=dataset.nn_model,
    )
    params = vae_mdl.encoder_params
    trainer = UnsupervisedTrainer(
        model=vae_mdl,
        gene_dataset=dataset,
        use_cuda=True,
        train_size=0.7,
        n_epochs_kl_warmup=1,
        ratio_loss=True,
    )
    trainer.train(
        n_epochs=2,
        lr=1e-3,
        params=params,
    )
    full = trainer.create_posterior(trainer.model, dataset,
                                    indices=np.arange(len(dataset)))
    lkl_estimate = vae_mdl.marginal_ll(full, n_samples_mc=50)
def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5,
                                  use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False,
                                    indices=all_indices)

    # Sample scale example
    px_scales = post.scale_sampler(n_samples_per_cell=4, n_samples=None,
                                   selection=all_indices)["scale"]
    assert (px_scales.shape[1] == dataset.nb_genes
            ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
    print(de_dataframe.keys())
    assert (de_dataframe["confidence_interval_0.5_min"]
            <= de_dataframe["confidence_interval_0.5_max"]).all()
    assert (de_dataframe["confidence_interval_0.95_min"]
            <= de_dataframe["confidence_interval_0.95_max"]).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()
def trainVAE(gene_dataset, rmCellTypes, rep):
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    if os.path.isfile('../NoOverlap/vae.%s%s.pkl' % (rmCellTypes, rep)):
        trainer.model.load_state_dict(
            torch.load('../NoOverlap/vae.%s%s.pkl' % (rmCellTypes, rep)))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=150)
        torch.save(trainer.model.state_dict(),
                   '../NoOverlap/vae.%s%s.pkl' % (rmCellTypes, rep))
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    return latent, batch_indices, labels, trainer
def compute_scvi_latent(
    adata: sc.AnnData,
    n_latent: int = 5,
    n_epochs: int = 100,
    lr: float = 1e-3,
    use_batches: bool = False,
    use_cuda: bool = True,
) -> Tuple[scvi.inference.Posterior, np.ndarray]:
    """Train and return a scVI model and sample a latent space

    :param adata: sc.AnnData object non-normalized
    :param n_latent: dimension of the latent space
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param use_batches: whether to use batch information
    :param use_cuda: whether to train on the GPU
    :return: (scvi.Posterior, latent_space)
    """
    # Convert easily to scvi dataset
    scviDataset = AnnDataset(adata)

    # Train a model
    vae = VAE(
        scviDataset.nb_genes,
        n_batch=scviDataset.n_batches * use_batches,
        n_latent=n_latent,
    )
    trainer = UnsupervisedTrainer(vae, scviDataset, train_size=1.0,
                                  use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs, lr=lr)

    # Extract latent space
    posterior = trainer.create_posterior(
        trainer.model, scviDataset,
        indices=np.arange(len(scviDataset))
    ).sequential()
    latent, _, _ = posterior.get_latent()

    return posterior, latent
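# A minimal usage sketch for compute_scvi_latent, assuming a hypothetical
# raw count file "counts.h5ad" and that scanpy is imported as sc (as the
# type hints above already assume).
adata = sc.read("counts.h5ad")
posterior, latent = compute_scvi_latent(adata, n_latent=5, n_epochs=100,
                                        use_cuda=False)
adata.obsm["X_scvi"] = latent  # store for downstream scanpy neighbors/UMAP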
def correct_scvi(Xs, genes):
    import numpy as np
    import torch
    from anndata import AnnData
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]
    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128, n_latent=30, n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )

    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model, all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    return latent
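# A hedged usage sketch for correct_scvi, assuming two hypothetical count
# matrices X1, X2 (cells x genes, shared gene set), a genes DataFrame indexed
# by gene name, and that 'data/harmonization.vae.pkl' already holds trained
# weights as the snippet expects.
latent = correct_scvi([X1, X2], genes)
print(latent.shape)  # (total cells across batches, 30 latent dimensions)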
def scvi_impute() -> None:
    # n_epochs, lr and use_cuda are assumed to be module-level settings
    fnm: str = "sc_10x_5cl_forimput_cnt.csv"
    save_path: PosixPath = here('./10xGenomics/scRNAseq')
    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)
    vae = VAE(symsim_dataset.nb_genes)
    trainer = UnsupervisedTrainer(vae, symsim_dataset, train_size=1.0,
                                  use_cuda=use_cuda, frequency=5)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()
    outfnm: str = "scvi_impt.csv"
    out_path = here("./10xGenomics/impt/").joinpath(outfnm)
    np.savetxt(out_path, impute_values, delimiter=",")
def scvi_impute(seed: int = 1, platform: str = "umi") -> None:
    fnm: str = f"sim_{ncell}_{ngene}_{seed}_{platform}_.csv"
    save_path: PosixPath = here('./scVI/data/symsim')
    # fullpath: PosixPath = here('./scVI/data/symsim').joinpath(fnm)
    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)
    vae = VAE(symsim_dataset.nb_genes)
    trainer = UnsupervisedTrainer(vae, symsim_dataset, train_size=1.0,
                                  use_cuda=use_cuda, frequency=5)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()
    out_path = here("./simutool/jobs/scvi_result").joinpath(fnm)
    np.savetxt(out_path, impute_values, delimiter=",")
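# A minimal usage sketch, assuming the module-level ncell, ngene, n_epochs,
# lr and use_cuda globals are set as this script expects; writes the imputed
# matrix for one simulated replicate.
scvi_impute(seed=1, platform="umi")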
def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5,
                                  use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False,
                                    indices=all_indices)

    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        new_vae = VAE(dataset.nb_genes, dataset.n_batches)
        new_post = load_posterior(posterior_save_path, model=new_vae,
                                  use_cuda=False)
    assert np.array_equal(new_post.indices, post.indices)
    assert np.array_equal(new_post.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
        cred_interval_lvls=[0.5, 0.95],
    )
    print(de_dataframe.keys())
    assert (
        de_dataframe["confidence_interval_0.5_min"]
        <= de_dataframe["confidence_interval_0.5_max"]
    ).all()
    assert (
        de_dataframe["confidence_interval_0.95_min"]
        <= de_dataframe["confidence_interval_0.95_max"]
    ).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

    # Test totalVI DE
    sp = os.path.join(save_path, "10X")
    dataset = Dataset10X(dataset_name="pbmc_10k_protein_v3", save_path=sp)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda,
        early_stopping_kwargs=None
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae, dataset, shuffle=False, indices=all_indices,
        type_class=TotalPosterior
    )

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
def test_cortex(save_path):
    cortex_dataset = CortexDataset(save_path=save_path)
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(
        vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.train_set.reconstruction_error()
    trainer_cortex_vae.train_set.differential_expression_stats()
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="pearson"
    )
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="spearman"
    )
    trainer_cortex_vae.train_set.imputation(n_samples=1)
    trainer_cortex_vae.test_set.imputation(n_samples=5)

    trainer_cortex_vae.corrupt_posteriors(corruption="binomial")
    trainer_cortex_vae.corrupt_posteriors()
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.uncorrupt_posteriors()

    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation",
        save_path=save_path
    )
    trainer_cortex_vae.train_set.generate_parameters()

    n_cells, n_genes = (
        len(trainer_cortex_vae.train_set.indices),
        cortex_dataset.nb_genes,
    )
    n_samples = 3
    (dropout, means, dispersions) = trainer_cortex_vae.train_set.generate_parameters()
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)
    assert dispersions.shape == (n_cells, n_genes)
    (dropout, means, dispersions) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples
    )
    assert dropout.shape == (n_samples, n_cells, n_genes)
    assert means.shape == (n_samples, n_cells, n_genes)
    (dropout, means, dispersions) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples, give_mean=True
    )
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)

    full = trainer_cortex_vae.create_posterior(
        vae, cortex_dataset, indices=np.arange(len(cortex_dataset))
    )
    x_new, x_old = full.generate(n_samples=10)
    assert x_new.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes, 10)
    assert x_old.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes)

    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation",
        save_path=save_path
    )

    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = JointSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1)
    trainer_cortex_svaec.labelled_set.accuracy()
    trainer_cortex_svaec.full_dataset.reconstruction_error()

    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = AlternateSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1, lr=1e-2)
    trainer_cortex_svaec.unlabelled_set.accuracy()
    data_train, labels_train = trainer_cortex_svaec.labelled_set.raw_data()
    data_test, labels_test = trainer_cortex_svaec.unlabelled_set.raw_data()
    compute_accuracy_svc(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"C": [1], "kernel": ["linear"]}],
    )
    compute_accuracy_rf(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"max_depth": [3], "n_estimators": [10]}],
    )

    cls = Classifier(cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels)
    cls_trainer = ClassifierTrainer(cls, cortex_dataset)
    cls_trainer.train(n_epochs=1)
    cls_trainer.train_set.accuracy()
# Train the scVI model
# Depending on the size of your data and whether you have an NVIDIA GPU, this
# could take 10 minutes to 1+ hours. If you'd like to make some tea or coffee,
# now would be an appropriate time to do so.
trainer = UnsupervisedTrainer(vae, dataset, train_size=train_size,
                              use_cuda=use_cuda, frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)
print("Model training finished!")

# Create the posterior representation of the data, and extract the latent
# space and imputed data
downsampled_gene_names = dataset.gene_names
full_posterior = trainer.create_posterior(vae, dataset,
                                          indices=np.arange(len(dataset)))
scVI_latent = full_posterior.sequential().get_latent()[0]
scVI_imputed = full_posterior.sequential().imputation()

# Save the relevant output files
np.savetxt(latent_save_file, scVI_latent, fmt='%s', delimiter=",")
np.savetxt(imputation_save_file, scVI_imputed, fmt='%s', delimiter=",")
np.savetxt(gene_names_save_file, downsampled_gene_names, fmt='%s',
           delimiter=",")
torch.save(trainer.model.state_dict(), scVI_model_save_file)
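# A hedged companion sketch: reloading the state dict saved above into a
# fresh trainer, following the load pattern used elsewhere in this file
# (scVI_model_save_file is the same path passed to torch.save above).
vae2 = VAE(dataset.nb_genes)
trainer2 = UnsupervisedTrainer(vae2, dataset, train_size=train_size,
                               use_cuda=use_cuda)
trainer2.model.load_state_dict(torch.load(scVI_model_save_file))
trainer2.model.eval()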
trainer.train(n_epochs=n_epochs, lr=0.001)
torch.save(trainer.model.state_dict(), file_name)

# write training info
ll_train_set = trainer.history["ll_train_set"][1:]
ll_test_set = trainer.history["ll_test_set"][1:]
x = np.linspace(1, n_epochs, len(ll_train_set))
plt.plot(x, ll_train_set)
plt.plot(x, ll_test_set)
plt.title("training ll")
plt.savefig("figures/simulations_scRNA/loss_training.png")
plt.clf()

# get latent space
full = trainer.create_posterior(trainer.model, gene_dataset,
                                indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
if plot:
    n_samples_tsne = 4000
    full.show_t_sne(n_samples=n_samples_tsne, color_by='labels',
                    save_name="figures/simulations_scRNA/tSNE.png")

# prepare for differential expression
cell_types = gene_dataset.cell_types
print(gene_dataset.cell_types)
couple_celltypes_list = [(0, 1), (1, 2), (1, 3), (3, 4)]
for key in theoretical_FC.columns:
    print(key)
dataset = "path/to/UMI_count_table.csv.gz" dataset_dir = "path/to/" outdir = "path/to/output/directory/" # Read count matrix with all genes (from https://github.com/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb) local_csv_dataset = CsvDataset(dataset, save_path=dataset_dir, compression='gzip', new_n_genes=False) # Process data (from https://github.com/YosefLab/scVI/blob/master/tests/notebooks/basic_tutorial.ipynb) use_batches = False use_cuda = True vae = VAE(local_csv_dataset.nb_genes, n_batch=local_csv_dataset.n_batches * use_batches) trainer = UnsupervisedTrainer(vae, local_csv_dataset, train_size=0.75, use_cuda=use_cuda) trainer.train() full = trainer.create_posterior(trainer.model, local_csv_dataset, indices=np.arange(len(local_csv_dataset))) imputed_values = full.sequential().imputation() # Write output matrix np.savetxt(outdir + '/scvi_normalization.txt', imputed_values.T, fmt='%.6e', delimiter='\t')
trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=0.9,
                              use_cuda=use_cuda, frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)

ll_train = trainer.history["ll_train_set"]
ll_test = trainer.history["ll_test_set"]
x = np.linspace(0, 50, len(ll_train))
plt.plot(x, ll_train)
plt.plot(x, ll_test)
plt.ylim(min(ll_train) - 50, 1000)
plt.show()

full = trainer.create_posterior(trainer.model, gene_dataset,
                                indices=np.arange(len(gene_dataset)))
print("Entropy batch mixing :", full.entropy_batch_mixing())
full.clustering_scores(prediction_algorithm="gmm")
full.show_t_sne()
xx = full.one_vs_all_degenes()

# ========
from scvi.inference import Trainer
from scvi.inference.posterior import Posterior
from sklearn.model_selection._split import _validate_shuffle_split

trainerr = Trainer(vae, gene_dataset)
def test_model_fit(model_fit: bool):
    """
    Test checking that scVI's inferred distributions make sense on a
    non-trivial synthetic dataset.

    We define technical zeros of the synthetic dataset as the zeros that
    result from highly expressed genes (relative to the considered cell)
    and the biological zeros as the rest of the zeros.

    :return: None
    """
    print('model_fit set to : ', model_fit)
    folder = '/tmp/scVI_zeros_test'
    print('Saving graphs in : {}'.format(folder))
    if not os.path.exists(folder):
        os.makedirs(folder)

    n_epochs = 150 if model_fit else 1
    n_mc_sim_total = 100 if model_fit else 1
    n_cells_cluster = 1000 if model_fit else 100

    torch.manual_seed(seed=42)
    synth_data = ZISyntheticDatasetCorr(n_clusters=8, n_genes_high=15,
                                        n_overlap=8, lam_0=320,
                                        n_cells_cluster=n_cells_cluster,
                                        weight_high=1.714286, weight_low=1,
                                        dropout_coef_low=0.08,
                                        dropout_coef_high=0.05)
    is_high = synth_data.is_highly_exp.squeeze()
    poisson_params_gt = synth_data.exprs_param.squeeze()

    # Step 2: Training scVI model
    mdl = VAE(n_input=synth_data.nb_genes, n_batch=synth_data.n_batches,
              reconstruction_loss='zinb', n_latent=5)
    trainer = UnsupervisedTrainer(model=mdl, gene_dataset=synth_data,
                                  use_cuda=True, train_size=1.0)
    trainer.train(n_epochs=n_epochs, lr=1e-3)
    full = trainer.create_posterior(trainer.model, synth_data,
                                    indices=np.arange(len(synth_data)))

    # Step 3: Inference
    poisson_params = []
    p_dropout_infered = []
    latent_reps = []
    bio_zero_p = []
    tech_zero_p = []
    with torch.no_grad():
        for tensors in full.sequential():
            # TODO: Properly sample posterior
            sample_batch, _, _, batch_index, labels = tensors
            px_scale, px_dispersion, px_rate, px_dropout, qz_m, qz_v, z, \
                ql_m, ql_v, library = mdl.inference(sample_batch, batch_index)
            p_zero = 1.0 / (1.0 + torch.exp(-px_dropout))
            p_dropout_infered.append(p_zero.cpu().numpy())

            l_train_batch = torch.zeros(
                (sample_batch.size(0), sample_batch.size(1), n_mc_sim_total),
                device=sample_batch.device)

            for n_mc_sim in range(n_mc_sim_total):
                p = px_rate / (px_rate + px_dispersion)
                r = px_dispersion
                l_train = torch.distributions.Gamma(
                    concentration=r, rate=(1 - p) / p).sample()
                l_train = torch.clamp(l_train, max=1e18)
                X = torch.distributions.Poisson(l_train).sample()
                l_train_batch[:, :, n_mc_sim] = l_train
                p_zero = 1.0 / (1.0 + torch.exp(-px_dropout))
                random_prob = torch.rand_like(p_zero)
                X[random_prob <= p_zero] = 0

            l_train_batch = torch.mean(l_train_batch, dim=(-1))
            bio_zero_prob_batch = torch.exp(-l_train_batch)
            tech_zero_prob_batch = p_zero

            bio_zero_p.append(bio_zero_prob_batch.cpu().numpy())
            tech_zero_p.append(tech_zero_prob_batch.cpu().numpy())
            latent_reps.append(z.cpu().numpy())
            poisson_params.append(l_train_batch.cpu().numpy())

    latent_reps = np.concatenate(latent_reps)
    bio_zero_p = np.concatenate(bio_zero_p)
    tech_zero_p = np.concatenate(tech_zero_p)
    bio_zero_tech_no = bio_zero_p * (1.0 - tech_zero_p)
    tech_zero_bio_no = (1.0 - bio_zero_p) * tech_zero_p

    # Final Step: Checking predictions
    # Dropout checks
    p_dropout_infered_all = np.concatenate(p_dropout_infered)
    p_dropout_gt = synth_data.p_dropout.squeeze()
    vmin = 0.0
    vmax = 2.0 * p_dropout_gt.max()
    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
    sns.heatmap(p_dropout_infered_all, vmin=vmin, vmax=vmax, ax=axes[0, 1])
    axes[0, 1].set_title('Dropout Rate Predicted')
    sns.heatmap(p_dropout_gt, vmin=vmin, vmax=vmax, ax=axes[0, 0])
    axes[0, 0].set_title('Dropout Rate GT')

    # Poisson Params checks
    poisson_params = np.concatenate(poisson_params)
    vmin = min(poisson_params_gt.min(), poisson_params.min())
    vmax = max(poisson_params_gt.max(), poisson_params.max())
    sns.heatmap(poisson_params, vmin=vmin, vmax=vmax, ax=axes[1, 1])
    axes[1, 1].set_title('Poisson Distribution Parameter Predicted')
    sns.heatmap(poisson_params_gt, vmin=vmin, vmax=vmax, ax=axes[1, 0])
    axes[1, 0].set_title('Poisson Distribution Parameter GT')
    plt.savefig(os.path.join(folder, 'params_comparison.png'))
    plt.close()

    # TODO: Decrease test tolerances
    l1_poisson = np.abs(poisson_params - poisson_params_gt).mean()
    if model_fit:
        print('Average Poisson L1 error: ', l1_poisson)
        assert l1_poisson <= 0.75, \
            'High Error on Poisson parameter inference'
        l1_dropout = np.abs(p_dropout_infered_all - synth_data.p_dropout).mean()
        print('Average Dropout L1 error: ', l1_dropout)
        assert l1_dropout <= 5e-2, \
            'High Error on Dropout parameter inference'

    # tSNE plot
    print("Computing tSNE rep ...")
    x_rep = TSNE(n_components=2).fit_transform(latent_reps)
    print("Done!")
    pos = np.random.permutation(len(x_rep))[:1000]
    labels = ['c_{}'.format(idx) for idx in synth_data.labels[pos].squeeze()]
    sns.scatterplot(x=x_rep[pos, 0], y=x_rep[pos, 1], hue=labels,
                    palette='Set2')
    plt.title('Synthetic Dataset latent space')
    plt.savefig(os.path.join(folder, 't_sne.png'))
    plt.close()

    # Tech/Bio Classif checks
    # -- For highly expressed genes
    # --- Poisson null and ZI non-null
    print(bio_zero_tech_no[is_high].mean(),
          synth_data.probas_zero_bio_tech_high[1, 0])
    # --- Poisson non-null and ZI null
    print(tech_zero_bio_no[is_high].mean(),
          synth_data.probas_zero_bio_tech_high[0, 1])
    # -- For lowly expressed genes
    # --- Poisson null and ZI non-null
    print(bio_zero_tech_no[~is_high].mean(),
          synth_data.probas_zero_bio_tech_low[1, 0])
    # --- Poisson non-null and ZI null
    print(tech_zero_bio_no[~is_high].mean(),
          synth_data.probas_zero_bio_tech_low[0, 1])

    diff1 = np.abs(bio_zero_tech_no[is_high].mean()
                   - synth_data.probas_zero_bio_tech_high[1, 0])
    diff2 = np.abs(tech_zero_bio_no[is_high].mean()
                   - synth_data.probas_zero_bio_tech_high[0, 1])
    diff3 = np.abs(bio_zero_tech_no[~is_high].mean()
                   - synth_data.probas_zero_bio_tech_low[1, 0])
    diff4 = np.abs(tech_zero_bio_no[~is_high].mean()
                   - synth_data.probas_zero_bio_tech_low[0, 1])
    if model_fit:
        assert diff1 <= 2e-2
        assert diff2 <= 2e-2
        assert diff3 <= 2e-2
        assert diff4 <= 2e-2
def solo(X, gene_names, doublet_depth=2.0, gpu=False, out_dir='solo_out',
         doublet_ratio=2.0, seed=None, known_doublets=None,
         doublet_type='multinomial', expected_number_of_doublets=None,
         plot=False, normal_logging=False, n_hidden=128, n_latent=16,
         cl_hidden=64, cl_layers=1, dropout_rate=0.1, learning_rate=0.001,
         valid_pct=0.1):
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    import json
    import os
    import shutil

    import anndata
    import numpy as np
    from anndata import AnnData
    from sklearn.metrics import roc_auc_score, roc_curve
    from scipy.sparse import issparse
    from collections import defaultdict

    import scvi
    from scvi.dataset import AnnDatasetFromAnnData, LoomDataset, \
        GeneExpressionDataset
    from scvi.models import Classifier, VAE
    from scvi.inference import UnsupervisedTrainer, ClassifierTrainer
    import torch

    from solo.utils import create_average_doublet, create_summed_doublet, \
        create_multinomial_doublet, make_gene_expression_dataset

    if not normal_logging:
        scvi._settings.set_verbosity(10)

    if gpu and not torch.cuda.is_available():
        gpu = torch.cuda.is_available()
        print('Cuda is not available, switching to cpu running!')

    # if not os.path.isdir(out_dir):
    #     os.mkdir(out_dir)
    # data_ext = os.path.splitext(data_file)[-1]
    # if data_ext == '.loom':
    #     scvi_data = LoomDataset(data_file)
    # elif data_ext == '.h5ad':
    #     scvi_data = AnnDatasetFromAnnData(anndata.read(data_file))
    # else:
    #     msg = f'{data_ext} is not a recognized format.\n'
    #     msg += 'must be one of {h5ad, loom}'
    #     raise TypeError(msg)
    # if issparse(scvi_data.X):
    #     scvi_data.X = scvi_data.X.todense()
    scvi_data = make_gene_expression_dataset(X, gene_names)

    num_cells, num_genes = scvi_data.X.shape

    if known_doublets is not None:
        print('Removing known doublets for in silico doublet generation')
        print('Make sure known doublets are in the same order as your data')
        known_doublets = np.loadtxt(known_doublets, dtype=str) == 'True'
        assert len(known_doublets) == scvi_data.X.shape[0]
        known_doublet_data = make_gene_expression_dataset(
            scvi_data.X[known_doublets], scvi_data.gene_names)
        known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0])
        singlet_scvi_data = make_gene_expression_dataset(
            scvi_data.X[~known_doublets], scvi_data.gene_names)
        singlet_num_cells, _ = singlet_scvi_data.X.shape
    else:
        known_doublet_data = None
        singlet_num_cells = num_cells
        known_doublets = np.zeros(num_cells, dtype=bool)
        singlet_scvi_data = scvi_data
    singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0])
    scvi_data.labels = known_doublets.astype(int)

    params = {
        "n_hidden": n_hidden,
        "n_latent": n_latent,
        "cl_hidden": cl_hidden,
        "cl_layers": cl_layers,
        "dropout_rate": dropout_rate,
        "learning_rate": learning_rate,
        "valid_pct": valid_pct
    }

    # set VAE params
    vae_params = {}
    for par in ['n_hidden', 'n_latent', 'n_layers', 'dropout_rate',
                'ignore_batch']:
        if par in params:
            vae_params[par] = params[par]
    vae_params['n_batch'] = 0 if params.get('ignore_batch', False) \
        else scvi_data.n_batches

    # training parameters
    valid_pct = params.get('valid_pct', 0.1)
    learning_rate = params.get('learning_rate', 1e-3)
    stopping_params = {'patience': params.get('patience', 10), 'threshold': 0}

    ##################################################
    # VAE
    vae = VAE(n_input=singlet_scvi_data.nb_genes, n_labels=2,
              reconstruction_loss='nb', log_variational=True, **vae_params)

    if seed:
        if gpu:
            device = torch.device('cuda')
            vae.load_state_dict(torch.load(os.path.join(seed, 'vae.pt')))
            vae.to(device)
        else:
            map_loc = 'cpu'
            vae.load_state_dict(torch.load(os.path.join(seed, 'vae.pt'),
                                           map_location=map_loc))

        # copy latent representation
        latent_file = os.path.join(seed, 'latent.npy')
        if os.path.isfile(latent_file):
            shutil.copy(latent_file, os.path.join(out_dir, 'latent.npy'))
    else:
        stopping_params['early_stopping_metric'] = 'reconstruction_error'
        stopping_params['save_best_state_metric'] = 'reconstruction_error'

        # initialize unsupervised trainer
        utrainer = UnsupervisedTrainer(
            vae, singlet_scvi_data,
            train_size=(1. - valid_pct),
            frequency=2,
            metrics_to_monitor=['reconstruction_error'],
            use_cuda=gpu,
            early_stopping_kwargs=stopping_params)
        utrainer.history['reconstruction_error_test_set'].append(0)
        # initial epoch
        utrainer.train(n_epochs=2000, lr=learning_rate)
        # drop learning rate and continue
        utrainer.early_stopping.wait = 0
        utrainer.train(n_epochs=500, lr=0.5 * learning_rate)

        # save VAE
        torch.save(vae.state_dict(), os.path.join(out_dir, 'vae.pt'))

        # save latent representation
        full_posterior = utrainer.create_posterior(
            utrainer.model, singlet_scvi_data,
            indices=np.arange(len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential().get_latent()
        np.save(os.path.join(out_dir, 'latent.npy'), latent.astype('float32'))

    ##################################################
    # simulate doublets
    non_zero_indexes = np.where(singlet_scvi_data.X > 0)
    cells = non_zero_indexes[0]
    genes = non_zero_indexes[1]
    cells_ids = defaultdict(list)
    for cell_id, gene in zip(cells, genes):
        cells_ids[cell_id].append(gene)

    # choose doublets function type
    if doublet_type == 'average':
        doublet_function = create_average_doublet
    elif doublet_type == 'sum':
        doublet_function = create_summed_doublet
    else:
        doublet_function = create_multinomial_doublet

    cell_depths = singlet_scvi_data.X.sum(axis=1)
    num_doublets = int(doublet_ratio * singlet_num_cells)
    if known_doublet_data is not None:
        num_doublets -= known_doublet_data.X.shape[0]
        # make sure we are making a non negative amount of doublets
        assert num_doublets >= 0

    in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32')
    # for desired # doublets
    for di in range(num_doublets):
        # sample two cells
        i, j = np.random.choice(singlet_num_cells, size=2)
        # generate doublets
        in_silico_doublets[di, :] = doublet_function(
            singlet_scvi_data.X, i, j,
            doublet_depth=doublet_depth,
            cell_depths=cell_depths,
            cells_ids=cells_ids)

    # merge datasets
    # we can maybe up sample the known doublets
    # concatenate
    classifier_data = GeneExpressionDataset()
    classifier_data.populate_from_data(
        X=np.vstack([scvi_data.X, in_silico_doublets]),
        labels=np.hstack([np.ravel(scvi_data.labels),
                          np.ones(in_silico_doublets.shape[0])]),
        remap_attributes=False)

    assert (len(np.unique(classifier_data.labels.flatten())) == 2)

    ##################################################
    # classifier

    # model
    classifier = Classifier(n_input=(vae.n_latent + 1),
                            n_hidden=params['cl_hidden'],
                            n_layers=params['cl_layers'],
                            n_labels=2,
                            dropout_rate=params['dropout_rate'])

    # trainer
    stopping_params['early_stopping_metric'] = 'accuracy'
    stopping_params['save_best_state_metric'] = 'accuracy'
    strainer = ClassifierTrainer(classifier, classifier_data,
                                 train_size=(1. - valid_pct),
                                 frequency=2,
                                 metrics_to_monitor=['accuracy'],
                                 use_cuda=gpu,
                                 sampling_model=vae, sampling_zl=True,
                                 early_stopping_kwargs=stopping_params)

    # initial
    strainer.train(n_epochs=1000, lr=learning_rate)
    # drop learning rate and continue
    strainer.early_stopping.wait = 0
    strainer.train(n_epochs=300, lr=0.1 * learning_rate)
    torch.save(classifier.state_dict(), os.path.join(out_dir, 'classifier.pt'))

    ##################################################
    # post-processing
    # use logits for predictions for better results
    logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                                   n_hidden=params['cl_hidden'],
                                   n_layers=params['cl_layers'],
                                   n_labels=2,
                                   dropout_rate=params['dropout_rate'],
                                   logits=True)
    logits_classifier.load_state_dict(classifier.state_dict())

    # using logits leads to better performance for ranking
    logits_strainer = ClassifierTrainer(logits_classifier, classifier_data,
                                        train_size=(1. - valid_pct),
                                        frequency=2,
                                        metrics_to_monitor=['accuracy'],
                                        use_cuda=gpu,
                                        sampling_model=vae, sampling_zl=True,
                                        early_stopping_kwargs=stopping_params)

    # models evaluation mode
    vae.eval()
    classifier.eval()
    logits_classifier.eval()
    print('Train accuracy: %.4f' % strainer.train_set.accuracy())
    print('Test accuracy: %.4f' % strainer.test_set.accuracy())

    # compute predictions manually
    # output logits
    train_y, train_score = strainer.train_set.compute_predictions(soft=True)
    test_y, test_score = strainer.test_set.compute_predictions(soft=True)
    # train_y == true label
    # train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
    train_score = train_score[:, 1]
    train_y = train_y.astype('bool')
    test_score = test_score[:, 1]
    test_y = test_y.astype('bool')

    train_auroc = roc_auc_score(train_y, train_score)
    test_auroc = roc_auc_score(test_y, test_score)

    print('Train AUROC: %.4f' % train_auroc)
    print('Test AUROC: %.4f' % test_auroc)

    train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
    test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
    train_t = np.minimum(train_t, 1 + 1e-9)
    test_t = np.minimum(test_t, 1 + 1e-9)

    train_acc = np.zeros(len(train_t))
    for i in range(len(train_t)):
        train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
    test_acc = np.zeros(len(test_t))
    for i in range(len(test_t)):
        test_acc[i] = np.mean(test_y == (test_score > test_t[i]))

    # write predictions
    # softmax predictions
    order_y, order_score = strainer.compute_predictions(soft=True)
    _, order_pred = strainer.compute_predictions()
    doublet_score = order_score[:, 1]
    np.save(os.path.join(out_dir, 'softmax_scores.npy'),
            doublet_score[:num_cells])
    np.save(os.path.join(out_dir, 'softmax_scores_sim.npy'),
            doublet_score[num_cells:])

    # logit predictions
    logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
    logit_doublet_score = logit_score[:, 1]
    np.save(os.path.join(out_dir, 'logit_scores.npy'),
            logit_doublet_score[:num_cells])
    np.save(os.path.join(out_dir, 'logit_scores_sim.npy'),
            logit_doublet_score[num_cells:])

    if expected_number_of_doublets is not None:
        solo_scores = doublet_score[:num_cells]
        k = len(solo_scores) - expected_number_of_doublets
        if expected_number_of_doublets / len(solo_scores) > .5:
            print('Make sure you actually expect more than half your cells '
                  'to be doublets. If not change your -e parameter value')
        assert k > 0
        idx = np.argpartition(solo_scores, k)
        threshold = np.max(solo_scores[idx[:k]])
        is_solo_doublet = doublet_score > threshold
    else:
        is_solo_doublet = order_pred[:num_cells]

    is_doublet = known_doublets
    new_doublets_idx = np.where(~(is_doublet) & is_solo_doublet[:num_cells])[0]
    is_doublet[new_doublets_idx] = True

    np.save(os.path.join(out_dir, 'is_doublet.npy'), is_doublet[:num_cells])
    np.save(os.path.join(out_dir, 'is_doublet_sim.npy'), is_doublet[num_cells:])

    np.save(os.path.join(out_dir, 'preds.npy'), order_pred[:num_cells])
    np.save(os.path.join(out_dir, 'preds_sim.npy'), order_pred[num_cells:])

    if plot:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns

        # plot ROC
        plt.figure()
        plt.plot(train_fpr, train_tpr, label='Train')
        plt.plot(test_fpr, test_tpr, label='Test')
        plt.gca().set_xlabel('False positive rate')
        plt.gca().set_ylabel('True positive rate')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'roc.pdf'))
        plt.close()

        # plot accuracy
        plt.figure()
        plt.plot(train_t, train_acc, label='Train')
        plt.plot(test_t, test_acc, label='Test')
        plt.axvline(0.5, color='black', linestyle='--')
        plt.gca().set_xlabel('Threshold')
        plt.gca().set_ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'accuracy.pdf'))
        plt.close()

        # plot distributions
        plt.figure()
        sns.distplot(test_score[test_y], label='Simulated')
        sns.distplot(test_score[~test_y], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'train_v_test_dist.pdf'))
        plt.close()

        plt.figure()
        sns.distplot(doublet_score[:num_cells], label='Simulated')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'real_cells_dist.pdf'))
        plt.close()
hemat_batch_2)  # closing argument of an elided hemat_data construction above

hemat_vae = VAE(hemat_data.nb_genes,
                n_batch=hemat_data.n_batches,
                n_labels=hemat_data.n_labels,
                n_hidden=128, n_latent=30, n_layers=2,
                dispersion='gene')
hemat_trainer = UnsupervisedTrainer(hemat_vae, hemat_data, train_size=0.9)
hemat_trainer.train(n_epochs=100)
hemat_full = hemat_trainer.create_posterior(
    hemat_trainer.model, hemat_data,
    indices=np.arange(len(hemat_data)))
hemat_latent, hemat_batch_indices, hemat_labels = \
    hemat_full.sequential().get_latent()
hemat_batch_indices = hemat_batch_indices.ravel()
np.savetxt("scVI_hemat_v1_latent_0716.txt", hemat_latent,
           fmt="%10.9f", delimiter="\t")
hemat_adata_latent = sc.AnnData(hemat_latent)
sc.pp.neighbors(hemat_adata_latent, use_rep='X', n_neighbors=30,
                metric='minkowski')
def scvi(
    adata: AnnData,
    n_hidden: int = 128,
    n_latent: int = 10,
    n_layers: int = 1,
    dispersion: str = "gene",
    n_epochs: int = 400,
    lr: float = 1e-3,
    train_size: float = 1.0,
    batch_key: Optional[str] = None,
    use_highly_variable_genes: bool = True,
    subset_genes: Optional[Sequence[Union[int, str]]] = None,
    linear_decoder: bool = False,
    copy: bool = False,
    use_cuda: bool = True,
    return_posterior: bool = True,
    trainer_kwargs: dict = {},
    model_kwargs: dict = {},
) -> Optional[AnnData]:
    """\
    SCVI [Lopez18]_.

    Fits scVI model onto raw count data given an anndata object

    scVI uses stochastic optimization and deep neural networks to aggregate
    information across similar cells and genes and to approximate the
    distributions that underlie observed expression values, while accounting
    for batch effects and limited sensitivity.

    To use a linear-decoded Variational AutoEncoder model (implementation of
    [Svensson20]_.), set linear_decoder = True. Compared to the standard VAE,
    this model is less powerful, but can be used to inspect which genes
    contribute to variation in the dataset. It may also be used for all scVI
    tasks, like differential expression, batch correction, imputation, etc.
    However, batch correction may be less powerful as it assumes a linear
    model.

    .. note::
        More information and bug reports `here <https://github.com/YosefLab/scVI>`__.

    Parameters
    ----------
    adata
        An anndata file with `X` attribute of unnormalized count data
    n_hidden
        Number of nodes per hidden layer
    n_latent
        Dimensionality of the latent space
    n_layers
        Number of hidden layers used for encoder and decoder NNs
    dispersion
        One of the following

        * `'gene'` - dispersion parameter of NB is constant per gene across cells
        * `'gene-batch'` - dispersion can differ between different batches
        * `'gene-label'` - dispersion can differ between different labels
        * `'gene-cell'` - dispersion can differ for every gene in every cell
    n_epochs
        Number of epochs to train
    lr
        Learning rate
    train_size
        The train size, either a float between 0 and 1 or an integer for the
        number of training samples to use
    batch_key
        Column name in anndata.obs for batches.
        If None, no batch correction is performed.
        If not None, batch correction is performed per batch category.
    use_highly_variable_genes
        If true, uses only the genes in anndata.var["highly_variable"]
    subset_genes
        Optional list of indices or gene names to subset anndata.
        If not None, use_highly_variable_genes is ignored.
    linear_decoder
        If true, uses LDVAE model, which is an implementation of [Svensson20]_.
    copy
        If true, a copy of anndata is returned
    return_posterior
        If true, posterior object is returned
    use_cuda
        If true, uses cuda
    trainer_kwargs
        Extra arguments for UnsupervisedTrainer
    model_kwargs
        Extra arguments for VAE or LDVAE model

    Returns
    -------
    If `copy` is true, anndata is returned.
    If `return_posterior` is true, the posterior object is returned.
    If both `copy` and `return_posterior` are true, a tuple of anndata and
    the posterior are returned in that order.

    `adata.obsm['X_scvi']` stores the latent representations
    `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
    `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial

    If linear_decoder is true:
    `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear
    decoder as a genes by n_latent matrix.
    """
    warnings.warn(
        "scvi via scanpy external API is no longer supported. "
        + "Please use the new scvi-tools package from `scvi-tools.org`",
        FutureWarning,
    )
    try:
        from scvi.models import VAE, LDVAE
        from scvi.inference import UnsupervisedTrainer
        from scvi.dataset import AnnDatasetFromAnnData
    except ImportError:
        raise ImportError(
            "Please install scvi package from https://github.com/YosefLab/scVI"
        )

    # check if observations are unnormalized using the first 10 cells
    # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69
    if len(adata) > 10:
        X_subset = adata.X[:10]
    else:
        X_subset = adata.X
    norm_error = (
        'Make sure that the dataset (adata.X) contains unnormalized count data.'
    )
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if subset_genes is not None:
        adata_subset = adata[:, subset_genes]
    elif use_highly_variable_genes and "highly_variable" in adata.var:
        adata_subset = adata[:, adata.var["highly_variable"]]
    else:
        adata_subset = adata

    if batch_key is not None:
        codes, uniques = pd.factorize(adata_subset.obs[batch_key])
        adata_subset.obs['_tmp_scvi_batch'] = codes
        n_batches = len(uniques)
    else:
        n_batches = 0

    dataset = AnnDatasetFromAnnData(adata_subset.copy(),
                                    batch_label='_tmp_scvi_batch')

    if linear_decoder:
        vae = LDVAE(
            n_input=dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )
    else:
        vae = VAE(
            dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    trainer = UnsupervisedTrainer(
        model=vae,
        gene_dataset=dataset,
        use_cuda=use_cuda,
        train_size=train_size,
        **trainer_kwargs,
    )
    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model, dataset,
                                    indices=np.arange(len(dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    if copy:
        adata = adata.copy()
    adata.obsm['X_scvi'] = latent
    adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale()
    adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation()

    if linear_decoder:
        loadings = vae.get_loadings()
        df = pd.DataFrame(loadings, index=adata_subset.var_names)
        adata.uns['ldvae_loadings'] = df

    if copy and return_posterior:
        return adata, full
    elif copy:
        return adata
    elif return_posterior:
        return full
simulation_vae = VAE(simulation_data.nb_genes,
                     n_batch=simulation_data.n_batches,
                     n_labels=simulation_data.n_labels,
                     n_hidden=128, n_latent=30, n_layers=2,
                     dispersion='gene')
simulation_trainer = UnsupervisedTrainer(simulation_vae, simulation_data,
                                         train_size=0.9)
simulation_trainer.train(n_epochs=100)
simulation_full = simulation_trainer.create_posterior(
    simulation_trainer.model, simulation_data,
    indices=np.arange(len(simulation_data)))
simulation_latent, simulation_batch_indices, simulation_labels = \
    simulation_full.sequential().get_latent()
simulation_batch_indices = simulation_batch_indices.ravel()
np.savetxt("scVI_simulation_v1_latent.txt", simulation_latent,
           fmt="%10.9f", delimiter="\t")
simulation_adata_latent = sc.AnnData(simulation_latent)
sc.pp.neighbors(simulation_adata_latent, use_rep='X', n_neighbors=30,
                metric='minkowski')
# LOAD
full_file_save_path = os.path.join(save_path, vae_file_name)
trainer.model.load_state_dict(torch.load(full_file_save_path))
trainer.model.eval()
print(' ### ### ### loaded vae')
print(datetime.datetime.now())

# n_epochs = 5
# lr = 0.001
# full_file_save_path = os.path.join(save_path, vae_file_name)
# trainer.train(n_epochs=n_epochs, lr=lr)
# torch.save(trainer.model.state_dict(), full_file_save_path)
# train_test_results = pd.DataFrame(trainer.history).rename(
#     columns={'elbo_train_set': 'Train', 'elbo_test_set': 'Test'})
# print(train_test_results)

full = trainer.create_posterior(trainer.model, gene_dataset,
                                indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()
print(' ### ### ### computed full posterior')
print(' ### ### ### url = ', url)

# read submission csv and fetch selected cells
submission = pd.read_csv(
    io.StringIO(requests.get(url).content.decode('utf-8')), index_col=0)
selected_cells_csv_string = submission.to_csv(index=False).replace('\n', '<br>')

# reconstruct user email from submission url
email = url.split('https://aavcells-de.s3.us-west-2.amazonaws.com/submissions/')[1]
email = email.split('%25')[0]
email = email.replace('%40', '@')
def main(): usage = 'solo' parser = ArgumentParser(usage, formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument(dest='model_json_file', help='json file to pass VAE parameters') parser.add_argument( dest='data_path', help= 'path to h5ad, loom or 10x directory containing cell by genes counts') parser.add_argument('-d', dest='doublet_depth', default=2., type=float, help='Depth multiplier for a doublet relative to the \ average of its constituents') parser.add_argument('-g', dest='gpu', default=True, action='store_true', help='Run on GPU') parser.add_argument('-a', dest='anndata_output', default=False, action='store_true', help='output modified anndata object with solo scores \ Only works for anndata') parser.add_argument('-o', dest='out_dir', default='solo_out') parser.add_argument('-r', dest='doublet_ratio', default=2., type=float, help='Ratio of doublets to true \ cells') parser.add_argument('-s', dest='seed', default=None, help='Path to previous solo output \ directory. Seed VAE models with previously \ trained solo model. Directory structure is assumed to \ be the same as solo output directory structure. \ should at least have a vae.pt a pickled object of \ vae weights and a latent.npy an np.ndarray of the \ latents of your cells.') parser.add_argument('-k', dest='known_doublets', help='Experimentally defined doublets tsv file. \ Should be a single column of True/False. True \ indicates the cell is a doublet. No header.', type=str) parser.add_argument('-t', dest='doublet_type', help='Please enter \ multinomial, average, or sum', default='multinomial', choices=['multinomial', 'average', 'sum']) parser.add_argument('-e', dest='expected_number_of_doublets', help='Experimentally expected number of doublets', type=int, default=None) parser.add_argument('-p', dest='plot', default=False, action='store_true', help='Plot outputs for solo') parser.add_argument('-l', dest='normal_logging', default=False, action='store_true', help='Logging level set to normal (aka not debug)') parser.add_argument('--random_size', dest='randomize_doublet_size', default=False, action='store_true', help='Sample depth multipliers from Unif(1, \ DoubletDepth) \ to provide a diversity of possible doublet depths.') args = parser.parse_args() if not args.normal_logging: scvi._settings.set_verbosity(10) model_json_file = args.model_json_file data_path = args.data_path if args.gpu and not torch.cuda.is_available(): args.gpu = torch.cuda.is_available() print('Cuda is not available, switching to cpu running!') if not os.path.isdir(args.out_dir): os.mkdir(args.out_dir) ################################################## # data # read loom/anndata data_ext = os.path.splitext(data_path)[-1] if data_ext == '.loom': scvi_data = LoomDataset(data_path) elif data_ext == '.h5ad': adata = anndata.read(data_path) if issparse(adata.X): adata.X = adata.X.todense() scvi_data = AnnDatasetFromAnnData(adata) elif os.path.isdir(data_path): scvi_data = Dataset10X(save_path=data_path, measurement_names_column=1, dense=True) cell_umi_depth = scvi_data.X.sum(axis=1) fifth, ninetyfifth = np.percentile(cell_umi_depth, [5, 95]) min_cell_umi_depth = np.min(cell_umi_depth) max_cell_umi_depth = np.max(cell_umi_depth) if fifth * 10 < ninetyfifth: print("""WARNING YOUR DATA HAS A WIDE RANGE OF CELL DEPTHS. 
PLEASE MANUALLY REVIEW YOUR DATA""") print( f"Min cell depth: {min_cell_umi_depth}, Max cell depth: {max_cell_umi_depth}" ) else: msg = f'{data_path} is not a recognized format.\n' msg += 'must be one of {h5ad, loom, 10x directory}' raise TypeError(msg) num_cells, num_genes = scvi_data.X.shape if args.known_doublets is not None: print('Removing known doublets for in silico doublet generation') print('Make sure known doublets are in the same order as your data') known_doublets = np.loadtxt(args.known_doublets, dtype=str) == 'True' assert len(known_doublets) == scvi_data.X.shape[0] known_doublet_data = make_gene_expression_dataset( scvi_data.X[known_doublets], scvi_data.gene_names) known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0]) singlet_scvi_data = make_gene_expression_dataset( scvi_data.X[~known_doublets], scvi_data.gene_names) singlet_num_cells, _ = singlet_scvi_data.X.shape else: known_doublet_data = None singlet_num_cells = num_cells known_doublets = np.zeros(num_cells, dtype=bool) singlet_scvi_data = scvi_data singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0]) scvi_data.labels = known_doublets.astype(int) ################################################## # parameters # check for parameters if not os.path.exists(model_json_file): raise FileNotFoundError(f'{model_json_file} does not exist.') # read parameters with open(model_json_file, 'r') as model_json_open: params = json.load(model_json_open) # set VAE params vae_params = {} for par in [ 'n_hidden', 'n_latent', 'n_layers', 'dropout_rate', 'ignore_batch' ]: if par in params: vae_params[par] = params[par] vae_params['n_batch'] = 0 if params.get('ignore_batch', False) else scvi_data.n_batches # training parameters batch_size = params.get('batch_size', 128) valid_pct = params.get('valid_pct', 0.1) learning_rate = params.get('learning_rate', 1e-3) stopping_params = {'patience': params.get('patience', 10), 'threshold': 0} # protect against single example batch while num_cells % batch_size == 1: batch_size = int(np.round(1.25 * batch_size)) print('Increasing batch_size to %d to avoid single example batch.' % batch_size) ################################################## # VAE vae = VAE(n_input=singlet_scvi_data.nb_genes, n_labels=2, reconstruction_loss='nb', log_variational=True, **vae_params) if args.seed: if args.gpu: device = torch.device('cuda') vae.load_state_dict(torch.load(os.path.join(args.seed, 'vae.pt'))) vae.to(device) else: map_loc = 'cpu' vae.load_state_dict( torch.load(os.path.join(args.seed, 'vae.pt'), map_location=map_loc)) # save latent representation utrainer = \ UnsupervisedTrainer(vae, singlet_scvi_data, train_size=(1. - valid_pct), frequency=2, metrics_to_monitor=['reconstruction_error'], use_cuda=args.gpu, early_stopping_kwargs=stopping_params, batch_size=batch_size) full_posterior = utrainer.create_posterior(utrainer.model, singlet_scvi_data, indices=np.arange( len(singlet_scvi_data))) latent, _, _ = full_posterior.sequential(batch_size).get_latent() np.save(os.path.join(args.out_dir, 'latent.npy'), latent.astype('float32')) else: stopping_params['early_stopping_metric'] = 'reconstruction_error' stopping_params['save_best_state_metric'] = 'reconstruction_error' # initialize unsupervised trainer utrainer = \ UnsupervisedTrainer(vae, singlet_scvi_data, train_size=(1. 
- valid_pct), frequency=2, metrics_to_monitor=['reconstruction_error'], use_cuda=args.gpu, early_stopping_kwargs=stopping_params, batch_size=batch_size) utrainer.history['reconstruction_error_test_set'].append(0) # initial epoch utrainer.train(n_epochs=2000, lr=learning_rate) # drop learning rate and continue utrainer.early_stopping.wait = 0 utrainer.train(n_epochs=500, lr=0.5 * learning_rate) # save VAE torch.save(vae.state_dict(), os.path.join(args.out_dir, 'vae.pt')) # save latent representation full_posterior = utrainer.create_posterior(utrainer.model, singlet_scvi_data, indices=np.arange( len(singlet_scvi_data))) latent, _, _ = full_posterior.sequential(batch_size).get_latent() np.save(os.path.join(args.out_dir, 'latent.npy'), latent.astype('float32')) ################################################## # simulate doublets non_zero_indexes = np.where(singlet_scvi_data.X > 0) cells = non_zero_indexes[0] genes = non_zero_indexes[1] cells_ids = defaultdict(list) for cell_id, gene in zip(cells, genes): cells_ids[cell_id].append(gene) # choose doublets function type if args.doublet_type == 'average': doublet_function = create_average_doublet elif args.doublet_type == 'sum': doublet_function = create_summed_doublet else: doublet_function = create_multinomial_doublet cell_depths = singlet_scvi_data.X.sum(axis=1) num_doublets = int(args.doublet_ratio * singlet_num_cells) if known_doublet_data is not None: num_doublets -= known_doublet_data.X.shape[0] # make sure we are making a non negative amount of doublets assert num_doublets >= 0 in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32') # for desired # doublets for di in range(num_doublets): # sample two cells i, j = np.random.choice(singlet_num_cells, size=2) # generate doublets in_silico_doublets[di, :] = \ doublet_function(singlet_scvi_data.X, i, j, doublet_depth=args.doublet_depth, cell_depths=cell_depths, cells_ids=cells_ids, randomize_doublet_size=args.randomize_doublet_size) # merge datasets # we can maybe up sample the known doublets # concatentate classifier_data = GeneExpressionDataset() classifier_data.populate_from_data( X=np.vstack([scvi_data.X, in_silico_doublets]), labels=np.hstack( [np.ravel(scvi_data.labels), np.ones(in_silico_doublets.shape[0])]), remap_attributes=False) assert (len(np.unique(classifier_data.labels.flatten())) == 2) ################################################## # classifier # model classifier = Classifier(n_input=(vae.n_latent + 1), n_hidden=params['cl_hidden'], n_layers=params['cl_layers'], n_labels=2, dropout_rate=params['dropout_rate']) # trainer stopping_params['early_stopping_metric'] = 'accuracy' stopping_params['save_best_state_metric'] = 'accuracy' strainer = ClassifierTrainer(classifier, classifier_data, train_size=(1. 
                             frequency=2, metrics_to_monitor=['accuracy'],
                             use_cuda=args.gpu,
                             sampling_model=vae, sampling_zl=True,
                             early_stopping_kwargs=stopping_params,
                             batch_size=batch_size)

# initial training
strainer.train(n_epochs=1000, lr=learning_rate)

# drop learning rate and continue
strainer.early_stopping.wait = 0
strainer.train(n_epochs=300, lr=0.1 * learning_rate)
torch.save(classifier.state_dict(),
           os.path.join(args.out_dir, 'classifier.pt'))

##################################################
# post-processing
# use logits for predictions for better results

logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                               n_hidden=params['cl_hidden'],
                               n_layers=params['cl_layers'], n_labels=2,
                               dropout_rate=params['dropout_rate'],
                               logits=True)
logits_classifier.load_state_dict(classifier.state_dict())

# using logits leads to better performance for ranking
logits_strainer = ClassifierTrainer(logits_classifier, classifier_data,
                                    train_size=(1. - valid_pct),
                                    frequency=2,
                                    metrics_to_monitor=['accuracy'],
                                    use_cuda=args.gpu,
                                    sampling_model=vae, sampling_zl=True,
                                    early_stopping_kwargs=stopping_params,
                                    batch_size=batch_size)

# put models in evaluation mode
vae.eval()
classifier.eval()
logits_classifier.eval()

print('Train accuracy: %.4f' % strainer.train_set.accuracy())
print('Test accuracy: %.4f' % strainer.test_set.accuracy())

# compute predictions manually
# output logits
train_y, train_score = strainer.train_set.compute_predictions(soft=True)
test_y, test_score = strainer.test_set.compute_predictions(soft=True)
# train_y == true label
# train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
train_score = train_score[:, 1]
train_y = train_y.astype('bool')
test_score = test_score[:, 1]
test_y = test_y.astype('bool')

train_auroc = roc_auc_score(train_y, train_score)
test_auroc = roc_auc_score(test_y, test_score)

print('Train AUROC: %.4f' % train_auroc)
print('Test AUROC: %.4f' % test_auroc)

train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
train_t = np.minimum(train_t, 1 + 1e-9)
test_t = np.minimum(test_t, 1 + 1e-9)

train_acc = np.zeros(len(train_t))
for i in range(len(train_t)):
    train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
test_acc = np.zeros(len(test_t))
for i in range(len(test_t)):
    test_acc[i] = np.mean(test_y == (test_score > test_t[i]))

# write predictions
# softmax predictions
order_y, order_score = strainer.compute_predictions(soft=True)
_, order_pred = strainer.compute_predictions()
doublet_score = order_score[:, 1]
np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores.npy'),
        doublet_score[:num_cells])
np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores_sim.npy'),
        doublet_score[num_cells:])

# logit predictions
logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
logit_doublet_score = logit_score[:, 1]
np.save(os.path.join(args.out_dir, 'logit_scores.npy'),
        logit_doublet_score[:num_cells])
np.save(os.path.join(args.out_dir, 'logit_scores_sim.npy'),
        logit_doublet_score[num_cells:])

# update threshold as a function of Solo's estimate of the number of
# doublets; essentially a log odds update
# TODO put in a function
diff = np.inf
counter_update = 0
solo_scores = doublet_score[:num_cells]
logit_scores = logit_doublet_score[:num_cells]
d_s = (args.doublet_ratio / (args.doublet_ratio + 1))
# run at least 5 updates and keep going until the change is below .01
while (diff > .01) | (counter_update < 5):
    # calculate log odds calibration for logits
    d_o = np.mean(solo_scores)
    c = np.log(d_o / (1 - d_o)) - np.log(d_s / (1 - d_s))

    # update solo scores
    solo_scores = 1 / (1 + np.exp(-(logit_scores + c)))

    # update while conditions
    diff = np.abs(d_o - np.mean(solo_scores))
    counter_update += 1

np.save(os.path.join(args.out_dir, 'softmax_scores.npy'), solo_scores)

if args.expected_number_of_doublets is not None:
    k = len(solo_scores) - args.expected_number_of_doublets
    if args.expected_number_of_doublets / len(solo_scores) > .5:
        print('''Make sure you actually expect more than half your cells
              to be doublets. If not, change your -e parameter value''')
    assert k > 0
    idx = np.argpartition(solo_scores, k)
    threshold = np.max(solo_scores[idx[:k]])
    is_solo_doublet = solo_scores > threshold
else:
    is_solo_doublet = solo_scores > .5

is_doublet = known_doublets
new_doublets_idx = np.where(~is_doublet & is_solo_doublet[:num_cells])[0]
is_doublet[new_doublets_idx] = True

np.save(os.path.join(args.out_dir, 'is_doublet.npy'),
        is_doublet[:num_cells])
np.save(os.path.join(args.out_dir, 'is_doublet_sim.npy'),
        is_doublet[num_cells:])

np.save(os.path.join(args.out_dir, 'preds.npy'), order_pred[:num_cells])
np.save(os.path.join(args.out_dir, 'preds_sim.npy'),
        order_pred[num_cells:])

smoothed_preds = knn_smooth_pred_class(X=latent,
                                       pred_class=is_doublet[:num_cells])
np.save(os.path.join(args.out_dir, 'smoothed_preds.npy'), smoothed_preds)

if args.anndata_output and data_ext == '.h5ad':
    adata.obs['is_doublet'] = is_doublet[:num_cells]
    adata.obs['logit_scores'] = logit_doublet_score[:num_cells]
    adata.obs['softmax_scores'] = doublet_score[:num_cells]
    adata.write(os.path.join(args.out_dir, "soloed.h5ad"))

if args.plot:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns

    # plot ROC
    plt.figure()
    plt.plot(train_fpr, train_tpr, label='Train')
    plt.plot(test_fpr, test_tpr, label='Test')
    plt.gca().set_xlabel('False positive rate')
    plt.gca().set_ylabel('True positive rate')
    plt.legend()
    plt.savefig(os.path.join(args.out_dir, 'roc.pdf'))
    plt.close()

    # plot accuracy
    plt.figure()
    plt.plot(train_t, train_acc, label='Train')
    plt.plot(test_t, test_acc, label='Test')
    plt.axvline(0.5, color='black', linestyle='--')
    plt.gca().set_xlabel('Threshold')
    plt.gca().set_ylabel('Accuracy')
    plt.legend()
    plt.savefig(os.path.join(args.out_dir, 'accuracy.pdf'))
    plt.close()

    # plot score distributions
    plt.figure()
    sns.distplot(test_score[test_y], label='Simulated')
    sns.distplot(test_score[~test_y], label='Observed')
    plt.legend()
    plt.savefig(os.path.join(args.out_dir, 'train_v_test_dist.pdf'))
    plt.close()

    plt.figure()
    sns.distplot(doublet_score[:num_cells], label='Observed')
    plt.legend()
    plt.savefig(os.path.join(args.out_dir, 'real_cells_dist.pdf'))
    plt.close()

    scvi_umap = umap.UMAP(n_neighbors=16).fit_transform(latent)
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.scatter(scvi_umap[:, 0], scvi_umap[:, 1],
               c=doublet_score[:num_cells], s=8, cmap="GnBu")
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    ax.set_xticks([])
    ax.set_yticks([])
    fig.savefig(os.path.join(args.out_dir, 'umap_solo_scores.pdf'))
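# ----------------------------------------------------------------------
# Hedged sketch of the "log odds update" loop above, pulled out into a
# standalone function as the TODO suggests. Everything here is
# illustrative: the function name and inputs are hypothetical, and the
# loop is seeded from the raw logits rather than from the pipeline's
# softmax scores. The idea: the classifier was trained under a doublet
# prior d_s = ratio / (ratio + 1); shifting every logit by the constant
# c = logit(d_o) - logit(d_s) rescales scores to the observed doublet
# rate d_o, which is Bayes' rule under a change of class prior.
import numpy as np

def recalibrate_doublet_scores(logits, doublet_ratio, tol=.01, min_iters=5):
    d_s = doublet_ratio / (doublet_ratio + 1)  # training-time doublet prior
    scores = 1 / (1 + np.exp(-logits))         # initial sigmoid scores
    diff, n_iters = np.inf, 0
    while (diff > tol) | (n_iters < min_iters):
        d_o = np.mean(scores)                  # currently observed rate
        c = np.log(d_o / (1 - d_o)) - np.log(d_s / (1 - d_s))
        scores = 1 / (1 + np.exp(-(logits + c)))
        diff = np.abs(d_o - np.mean(scores))
        n_iters += 1
    return scores

# e.g.: recalibrate_doublet_scores(np.random.randn(1000), doublet_ratio=2.)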
def runScvi(adata, batch, hvg=None):
    # Use non-normalized (count) data for scVI!
    # Expects data only on HVGs
    checkSanity(adata, batch, hvg)

    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError('adata does not contain a "counts" layer '
                        'in adata.layers["counts"]')

    from scvi.models import VAE
    from scvi.inference import UnsupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData

    # Defaults from the scVI GitHub tutorials scanpy_pbmc3k and harmonization
    n_epochs = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_latent = 30
    n_hidden = 128
    n_layers = 2

    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(
        net_adata.obs[batch].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_layers=n_layers,
        n_latent=n_latent,
        n_hidden=n_hidden,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )

    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model, net_adata,
                                    indices=np.arange(len(net_adata)))
    latent, _, _ = full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata
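# ----------------------------------------------------------------------
# Hedged usage sketch for runScvi on a toy AnnData. The simulated counts,
# object names, and batch labels below are made up for illustration, and
# the checkSanity helper is assumed to accept this input. The integrated
# embedding lands in adata.obsm['X_emb'] and can feed a standard scanpy
# workflow from there.
import numpy as np
import anndata
import scanpy as sc

toy_counts = np.random.negative_binomial(
    2, 0.3, size=(200, 50)).astype('float32')
toy = anndata.AnnData(toy_counts)
toy.layers['counts'] = toy.X.copy()
toy.obs['sample'] = ['batch1'] * 100 + ['batch2'] * 100

toy = runScvi(toy, batch='sample')
sc.pp.neighbors(toy, use_rep='X_emb')
sc.tl.leiden(toy)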
# (fragment: the VAE constructor opens earlier in the document; the head
# below is a reconstruction assuming the usual pattern, with all_dataset
# as the concatenated GeneExpressionDataset)
vae = VAE(
    all_dataset.nb_genes,
    n_batch=all_dataset.n_batches,
    n_latent=30,
    n_layers=2,
    dispersion='gene',
)

print('Prepare the trainer')
trainer = UnsupervisedTrainer(vae, all_dataset, train_size=1.0)

print('Train neural network')
n_epochs = 100
trainer.train(n_epochs=n_epochs)

print('Get posteriors (latent space)')
full = trainer.create_posterior(
    trainer.model,
    all_dataset,
    indices=np.arange(len(all_dataset)),
)
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()

print('Use scanpy and Leiden to cluster in latent space')
adata_latent = sc.AnnData(latent)
sc.pp.neighbors(adata_latent, use_rep='X', n_neighbors=30,
                metric='minkowski')
sc.tl.leiden(adata_latent, resolution=0.8)
# .leiden is a categorical Series; cast to str directly rather than via
# Categorical.to_dense(), which newer pandas versions have removed
clusters = adata_latent.obs['leiden'].astype(str).values
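# ----------------------------------------------------------------------
# Optional follow-up sketch (assumes adata_latent, batch_indices, and the
# neighbors graph computed above): run UMAP on the scVI latent space and
# color by Leiden cluster and batch to eyeball mixing. The save suffix is
# illustrative.
adata_latent.obs['batch'] = batch_indices.astype(str)
sc.tl.umap(adata_latent)
sc.pl.umap(adata_latent, color=['leiden', 'batch'], save='_scvi_latent.pdf')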
def benchmark_scvi(dataset, dataset_name, cfg, **kwargs):
    log_name = dataset_name
    n_genes = min(dataset.X.shape[1], cfg.n_genes)

    vae = VAE(
        dataset.nb_genes,
        n_batch=dataset.n_batches,
        n_labels=dataset.n_labels,
        n_hidden=128,
        n_latent=30,
        n_layers=2,
        dispersion="gene",
    )
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.75)
    n_epochs = kwargs.get("epochs", cfg.epochs)
    trainer.train(n_epochs=n_epochs)

    full = trainer.create_posterior(trainer.model, dataset,
                                    indices=np.arange(len(dataset)))
    latents, batch_indices, labels = full.sequential().get_latent()

    res = {}
    res["knn purity"] = [get_knn_purity(latents, labels.reshape((-1, 1)))]
    ebm = entropy_batch_mixing(latents, batch_indices)
    res["entropy batch mixing"] = [ebm[1] if isinstance(ebm, tuple) else ebm]

    cfg.input_dim = latents.shape[1]
    cfg.count_classes = np.unique(dataset.batch_indices).shape[0]
    cfg.count_labels = np.unique(dataset.labels).shape[0]

    (latents_train, latents_test,
     batches_train, batches_test,
     labels_train, labels_test) = train_test_split(
        latents,
        batch_indices,
        labels,
        test_size=0.25,
        stratify=batch_indices.reshape(-1),
    )
    latents_train = torch.Tensor(latents_train).cuda()
    latents_test = torch.Tensor(latents_test).cuda()

    # one-hot encode batches and labels for the classifier heads
    batches_train_tensor = torch.zeros(latents_train.shape[0],
                                       cfg.count_classes)
    batches_train_tensor = batches_train_tensor.scatter(
        1, torch.LongTensor(batches_train.astype("int16")).view(-1, 1), 1)
    batches_train_tensor = batches_train_tensor.cuda()

    labels_train_tensor = torch.zeros(latents_train.shape[0],
                                      cfg.count_labels)
    labels_train_tensor = labels_train_tensor.scatter(
        1, torch.LongTensor(labels_train.astype("int16")).view(-1, 1), 1)
    labels_train_tensor = labels_train_tensor.cuda()

    train_dataset = torch.utils.data.TensorDataset(latents_train,
                                                   batches_train_tensor,
                                                   labels_train_tensor)
    dataloader = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=cfg.batch_size)

    cfg.classifier_input_dim = cfg.bottleneck
    ohe_classifier, form_classifier = train_classifiers(
        cfg, dataloader, cfg.count_labels, cfg.count_classes)

    preds_batches = ohe_classifier(latents_test)
    preds_labels = form_classifier(latents_test)
    res["batch classifying accuracy"] = (
        preds_batches.argmax(1).cpu().detach().numpy()
        == batches_test).mean()
    res["labels classifying accuracy"] = (
        preds_labels.argmax(1).cpu().detach().numpy()
        == labels_test).mean()

    (Path(cfg.metrics_dir) / 'scVI').mkdir(parents=True, exist_ok=True)
    with open(os.path.join(Path(cfg.metrics_dir) / "scVI",
                           log_name + ".json"), "w") as file:
        # json requires string keys; coerce anything else before dumping
        for key in list(res.keys()):
            if not isinstance(key, str):
                res[str(key)] = res.pop(key)
        json.dump(res, file)

    del vae, trainer
    del latents, batch_indices, labels, full
    del preds_batches, preds_labels, train_dataset, dataloader
    torch.cuda.empty_cache()
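# ----------------------------------------------------------------------
# get_knn_purity is called above but defined elsewhere; this is a hedged
# stand-in for the usual definition (function name and defaults are
# illustrative, not the project's API): for each cell, the fraction of its
# k nearest latent-space neighbors sharing its label, averaged within each
# label and then across labels so rare cell types are not swamped.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_purity_sketch(latents, labels, n_neighbors=30):
    labels = np.ravel(labels)
    nn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(latents)
    # drop the first neighbor of each cell, which is the cell itself
    indices = nn.kneighbors(latents, return_distance=False)[:, 1:]
    scores = (labels[indices] == labels[:, None]).mean(axis=1)
    return np.mean([scores[labels == lab].mean()
                    for lab in np.unique(labels)])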