def unsupervised_training_one_epoch(dataset: GeneExpressionDataset):
    vae = VAE(dataset.nb_genes, dataset.n_batches, dataset.n_labels)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=1)

def trainVAE(gene_dataset, filename, rep, nlayers=2, n_hidden=128,
             reconstruction_loss: str = 'zinb'):
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=n_hidden, n_latent=10, n_layers=nlayers,
              dispersion='gene', reconstruction_loss=reconstruction_loss)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    filename = '../' + filename + '/vae.' + reconstruction_loss + '.rep' + str(rep) + '.pkl'
    if os.path.isfile(filename):
        # Reuse cached weights instead of retraining
        trainer.model.load_state_dict(torch.load(filename))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=250)
        torch.save(trainer.model.state_dict(), filename)
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    return full

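# Illustrative usage of trainVAE above -- a sketch, not from the original source.
# CortexDataset and the output directory name are assumptions; any
# GeneExpressionDataset would work the same way.
from scvi.dataset import CortexDataset

dataset = CortexDataset(save_path="data/")
full = trainVAE(dataset, "cortex_out", rep=0)  # trains, or reloads cached weights
latent, batch_indices, labels = full.sequential().get_latent()
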
def correct_scvi(Xs, genes):
    import torch
    use_cuda = True  # set but unused in this snippet
    torch.cuda.set_device(1)
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import SCANVI, VAE
    from scvi.dataset.anndata import AnnDataset

    all_ann = [AnnDataset(AnnData(X, var=genes)) for X in Xs]
    all_dataset = GeneExpressionDataset.concat_datasets(*all_ann)
    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128, n_latent=30, n_layers=2, dispersion='gene')
    trainer = UnsupervisedTrainer(vae, all_dataset, train_size=0.99999)
    n_epochs = 100
    # Training is commented out; pre-trained weights are loaded from disk instead.
    # trainer.train(n_epochs=n_epochs)
    # torch.save(trainer.model.state_dict(), 'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()
    full = trainer.create_posterior(trainer.model, all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    return latent

def scVI_latent(csv_file, csv_path, vae_model=VAE, train_size=1.0, n_labels=0,
                seed=1234, n_cores=1, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae, dat,
        train_size=train_size,  # defaults to 0.8; the documentation recommends 1.0
        use_cuda=use_cuda)
    # limit CPU usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low-memory configurations
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reordering for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads

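# Illustrative call of scVI_latent -- a sketch, not from the original source.
# The CSV file name and path are hypothetical placeholders, and set_seed is
# assumed to be defined alongside the function.
latents = scVI_latent("counts.csv", "data/", seed=1234, n_cores=4)
print(latents.head())  # one column per latent dimension, ordered by variance
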
def benchmark(dataset, n_epochs=250, use_cuda=True):
    vae = VAE(dataset.nb_genes, n_batch=dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)
    trainer.test_set.reconstruction_error()
    trainer.test_set.marginal_ll()
    return trainer

def train(self, adata, condition_key, cell_type_key, n_epochs=300, patience=30,
          lr_reducer=20):
    le = LabelEncoder()
    adata.obs['labels'] = le.fit_transform(adata.obs[cell_type_key].values)
    adata.obs['batch_indices'] = le.fit_transform(adata.obs[condition_key].values)
    net_adata = AnnDatasetFromAnnData(adata)
    early_stopping_kwargs = {
        "early_stopping_metric": "elbo",
        "save_best_state_metric": "elbo",
        "patience": patience,
        "threshold": 0,
        "reduce_lr_on_plateau": True,
        "lr_patience": lr_reducer,
        "lr_factor": 0.1,
    }
    self.trainer = UnsupervisedTrainer(
        self.model,
        net_adata,
        train_size=0.8,
        use_cuda=True,
        frequency=1,
        early_stopping_kwargs=early_stopping_kwargs,
    )
    self.trainer.train(n_epochs=n_epochs, lr=0.001)

def test_gamma_de():
    cortex_dataset = CortexDataset()
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(cortex_vae, cortex_dataset,
                                             train_size=0.5, use_cuda=use_cuda)
    trainer_cortex_vae.train(n_epochs=2)
    full = trainer_cortex_vae.create_posterior(
        trainer_cortex_vae.model, cortex_dataset,
        indices=np.arange(len(cortex_dataset)))
    n_samples = 10
    M_permutation = 100
    cell_idx1 = cortex_dataset.labels.ravel() == 0
    cell_idx2 = cortex_dataset.labels.ravel() == 1
    full.differential_expression_score(cell_idx1, cell_idx2, n_samples=n_samples,
                                       M_permutation=M_permutation)
    full.differential_expression_gamma(cell_idx1, cell_idx2, n_samples=n_samples,
                                       M_permutation=M_permutation)

def scVI_ld(csv_file, csv_path, ndims, vae_model=VAE, n_labels=0, n_cores=1,
            seed=1234, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in linear_decoder.ipynb
    n_epochs = 250
    # trainer and model
    ldvae = LDVAE(
        dat.nb_genes,
        n_batch=dat.n_batches,
        n_latent=ndims,
        n_labels=n_labels,
    )
    trainerLD = UnsupervisedTrainer(ldvae, dat, use_cuda=use_cuda)
    # limit CPU usage
    torch.set_num_threads(n_cores)
    trainerLD.train(n_epochs=n_epochs, lr=lr)
    # extract the mean value of the latent dimensions
    full = trainerLD.create_posterior(trainerLD.model, dat, indices=np.arange(len(dat)))
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reordering for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads

def train_model(
    mdl_class,
    dataset,
    mdl_params: dict,
    train_params: dict,
    train_fn_params: dict,
    filename: str = None,
):
    """
    :param mdl_class: class of the model to instantiate (e.g. VAE)
    :param dataset: gene expression dataset
    :param mdl_params: keyword arguments forwarded to the model constructor
    :param train_params: keyword arguments forwarded to UnsupervisedTrainer
    :param train_fn_params: keyword arguments forwarded to trainer.train
    :param filename: optional cache path (caching currently disabled)
    :return: the trained model and its trainer
    """
    # if os.path.exists(filename):
    #     res = load_pickle(filename)
    #     return res["vae"], res["trainer"]
    if "test_indices" not in train_params:
        warnings.warn("No `test_indices` attribute found.")
    my_vae = mdl_class(n_input=dataset.nb_genes, n_batch=dataset.n_batches, **mdl_params)
    my_trainer = UnsupervisedTrainer(my_vae, dataset, **train_params)
    my_trainer.train(**train_fn_params)
    print(my_trainer.train_losses)
    return my_vae, my_trainer

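# Illustrative call of train_model -- a sketch, not from the original source.
# The dataset and all parameter values shown here are assumptions.
from scvi.dataset import CortexDataset
from scvi.models import VAE

dataset = CortexDataset(save_path="data/")
vae, trainer = train_model(
    mdl_class=VAE,
    dataset=dataset,
    mdl_params={"n_latent": 10},
    train_params={"train_size": 0.8, "use_cuda": False},
    train_fn_params={"n_epochs": 2, "lr": 1e-3},
)
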
def benchmark(dataset, n_epochs=250, use_cuda=True):
    vae = VAE(dataset.nb_genes, n_batch=dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)
    trainer.test_set.ll(verbose=True)
    trainer.test_set.marginal_ll(verbose=True)
    return trainer

def scVI_norm(csv_file, csv_path, vae_model=VAE, train_size=1.0, n_labels=0,
              seed=1234, n_cores=1, lr=1e-3, use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    dat.subsample_genes(1000, mode="variance")
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae, dat,
        train_size=train_size,  # defaults to 0.8; the documentation recommends 1.0
        use_cuda=use_cuda)
    # limit CPU usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low-memory configurations
    normalized_values = full.sequential().get_sample_scale()
    return [normalized_values, dat.gene_names]

def run(self):
    n_epochs = 100
    n_latent = 10
    n_hidden = 128
    n_layers = 2
    net_data = self.data.copy()
    net_data.X = self.data.layers['counts']
    del net_data.layers['counts']
    net_data.raw = None  # ensure that the raw counts are not accidentally used
    # Define batch indices
    le = LabelEncoder()
    net_data.obs['batch_indices'] = le.fit_transform(net_data.obs[self.batch].values)
    net_data = AnnDatasetFromAnnData(net_data)
    vae = VAE(net_data.nb_genes,
              reconstruction_loss='nb',
              n_batch=net_data.n_batches,
              n_layers=n_layers,
              n_latent=n_latent,
              n_hidden=n_hidden)
    trainer = UnsupervisedTrainer(vae, net_data, train_size=1, use_cuda=False)
    trainer.train(n_epochs=n_epochs, lr=1e-3)
    full = trainer.create_posterior(trainer.model, net_data,
                                    indices=np.arange(len(net_data)))
    latent, _, _ = full.sequential().get_latent()
    self.data.obsm['X_emb'] = latent
    self.dump_to_h5ad("scvi")

def test_special_dataset_size(self):
    gene_dataset = GeneExpressionDataset()
    x = np.random.randint(1, 100, (17 * 2, 10))
    y = np.random.randint(1, 100, (17 * 2, 10))
    gene_dataset.populate_from_data(x)
    protein_data = CellMeasurement(
        name="protein_expression",
        data=y,
        columns_attr_name="protein_names",
        columns=np.arange(10),
    )
    gene_dataset.initialize_cell_measurement(protein_data)
    # Test UnsupervisedTrainer
    vae = VAE(
        gene_dataset.nb_genes,
        n_batch=gene_dataset.n_batches,
        n_labels=gene_dataset.n_labels,
    )
    trainer = UnsupervisedTrainer(
        vae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
    )
    trainer.train(n_epochs=1)
    # Test JVAETrainer
    jvae = JVAE(
        [gene_dataset.nb_genes, gene_dataset.nb_genes],
        gene_dataset.nb_genes,
        [slice(None)] * 2,
        ["zinb", "zinb"],
        [True, True],
        n_batch=1,
    )
    cls = Classifier(gene_dataset.nb_genes, n_labels=2, logits=True)
    trainer = JVAETrainer(
        jvae,
        cls,
        [gene_dataset, gene_dataset],
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
    )
    trainer.train(n_epochs=1)
    # Test TotalTrainer
    totalvae = TOTALVI(gene_dataset.nb_genes, len(gene_dataset.protein_names))
    trainer = TotalTrainer(
        totalvae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
        early_stopping_kwargs=None,
    )
    trainer.train(n_epochs=1)

def base_benchmark(gene_dataset):
    vae = VAE(gene_dataset.nb_genes, gene_dataset.n_batches, gene_dataset.n_labels)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=1)
    return trainer

def ldvae_benchmark(dataset, n_epochs, use_cuda=True):
    ldvae = LDVAE(dataset.nb_genes, n_batch=dataset.n_batches)
    trainer = UnsupervisedTrainer(ldvae, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)
    trainer.test_set.reconstruction_error()
    trainer.test_set.marginal_ll()
    ldvae.get_loadings()
    return trainer

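# Illustrative call of ldvae_benchmark -- a sketch, not from the original source;
# CortexDataset is an assumption.
from scvi.dataset import CortexDataset

dataset = CortexDataset(save_path="data/")
trainer = ldvae_benchmark(dataset, n_epochs=2, use_cuda=False)
loadings = trainer.model.get_loadings()  # linear-decoder weights (genes x latent dims)
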
def train_seq(self, n_epochs=20, reconstruction_seq='nb'):
    dataset = self.data.data_seq
    vae = VAE(
        dataset.nb_genes,
        dispersion="gene",
        n_latent=self.n_latent,
        reconstruction_loss=reconstruction_seq,
    )
    self.trainer_seq = UnsupervisedTrainer(vae, dataset, train_size=0.95,
                                           use_cuda=self.USE_CUDA)
    self.trainer_seq.train(n_epochs=n_epochs, lr=0.001)

def test_iaf2(save_path):
    dataset = CortexDataset(save_path=save_path)
    vae = IALogNormalPoissonVAE(n_input=dataset.nb_genes,
                                n_batch=dataset.n_batches,
                                do_h=True).cuda()
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, ratio_loss=True)
    trainer.train(n_epochs=1000)
    print(trainer.train_losses)
    z, l = trainer.test_set.get_latents(n_samples=5, device='cpu')
    return

def test_encoder_only():
    # torch.autograd.set_detect_anomaly(mode=True)
    # NOTE: the first two assignments are immediately overwritten; only the
    # last dataset is actually used.
    dataset = LatentLogPoissonDataset(n_genes=5, n_latent=2, n_cells=300, n_comps=1)
    dataset = LatentLogPoissonDataset(n_genes=3, n_latent=2, n_cells=15, n_comps=2)
    dataset = LatentLogPoissonDataset(n_genes=5, n_latent=2, n_cells=150, n_comps=1,
                                      learn_prior_scale=True)
    # _, _, marginals = dataset.compute_posteriors(
    #     x_obs=torch.randint(0, 150, size=(1, 5), dtype=torch.float),
    #     mcmc_kwargs={"num_samples": 20, "warmup_steps": 20, "num_chains": 1},
    # )
    # stats = marginals.diagnostics()
    # print(stats)
    dataset.cuda()
    vae_mdl = LogNormalPoissonVAE(
        dataset.nb_genes,
        dataset.n_batches,
        autoregressive=False,
        full_cov=True,
        n_latent=2,
        gt_decoder=dataset.nn_model,
    )
    params = vae_mdl.encoder_params
    trainer = UnsupervisedTrainer(
        model=vae_mdl,
        gene_dataset=dataset,
        use_cuda=True,
        train_size=0.7,
        n_epochs_kl_warmup=1,
        ratio_loss=True,
    )
    trainer.train(
        n_epochs=2,
        lr=1e-3,
        params=params,
    )
    full = trainer.create_posterior(trainer.model, dataset,
                                    indices=np.arange(len(dataset)))
    lkl_estimate = vae_mdl.marginal_ll(full, n_samples_mc=50)

def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)
    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"
    # Differential expression with different modes
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
    print(de_dataframe.keys())
    assert (
        de_dataframe["confidence_interval_0.5_min"]
        <= de_dataframe["confidence_interval_0.5_max"]
    ).all()
    assert (
        de_dataframe["confidence_interval_0.95_min"]
        <= de_dataframe["confidence_interval_0.95_max"]
    ).all()
    # DE probability estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

def trainVAE(gene_dataset, rmCellTypes, rep):
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    filename = '../NoOverlap/vae.%s%s.pkl' % (rmCellTypes, rep)
    if os.path.isfile(filename):
        trainer.model.load_state_dict(torch.load(filename))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=150)
        torch.save(trainer.model.state_dict(), filename)
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    return latent, batch_indices, labels, trainer

def full_init(self):
    self.model = self.model_type(
        n_input=self.dataset.nb_genes,
        n_batch=self.dataset.n_batches,
        reconstruction_loss=self.reconstruction_loss,
        n_latent=self.n_latent,
        full_cov=self.full_cov,
    )
    self.trainer = UnsupervisedTrainer(
        model=self.model,
        gene_dataset=self.dataset,
        use_cuda=True,
        train_size=0.7,
        kl=1,
        frequency=1,
    )
    self.is_fully_init = True

def test_iwae(save_path):
    import time

    dataset = CortexDataset(save_path=save_path)
    torch.manual_seed(42)
    vae = VAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
    start = time.time()
    trainer = UnsupervisedTrainer(vae, gene_dataset=dataset, ratio_loss=True,
                                  k_importance_weighted=5, single_backward=True)
    trainer.train(n_epochs=10)
    stop1 = time.time() - start
    vae = VAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
    start = time.time()
    trainer = UnsupervisedTrainer(vae, gene_dataset=dataset, ratio_loss=True,
                                  k_importance_weighted=5, single_backward=False)
    trainer.train(n_epochs=10)
    stop2 = time.time() - start
    print('Time single backward: ', stop1)
    print('Time all elements: ', stop2)

def train_fish(self, n_epochs=20):
    dataset = self.data.data_fish
    vae = VAE(
        dataset.nb_genes,
        n_batch=dataset.n_batches,
        dispersion="gene-batch",
        n_latent=self.n_latent,
        reconstruction_loss="nb",
    )
    self.trainer_fish = UnsupervisedTrainer(vae, dataset, train_size=0.95,
                                            use_cuda=self.USE_CUDA)
    self.trainer_fish.train(n_epochs=n_epochs, lr=0.001)

def test_multibatches_features():
    data = [
        np.random.randint(1, 5, size=(20, 10)),
        np.random.randint(1, 10, size=(20, 10)),
        np.random.randint(1, 10, size=(20, 10)),
        np.random.randint(1, 10, size=(30, 10)),
    ]
    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(data)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    trainer.test_set.imputation(n_samples=2, transform_batch=0)
    trainer.train_set.imputation(n_samples=2, transform_batch=[0, 1, 2])

def test_sampling_zl(save_path):
    cortex_dataset = CortexDataset(save_path=save_path)
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(
        cortex_vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer_cortex_vae.train(n_epochs=2)
    cortex_cls = Classifier(cortex_vae.n_latent + 1, n_labels=cortex_dataset.n_labels)
    trainer_cortex_cls = ClassifierTrainer(
        cortex_cls, cortex_dataset, sampling_model=cortex_vae, sampling_zl=True
    )
    trainer_cortex_cls.train(n_epochs=2)
    trainer_cortex_cls.test_set.accuracy()

def get_trainer(self, vae, train_size):
    batch_size = 128
    # adjust batch size such that no batch has only one cell
    while self.gene_dataset.nb_cells % batch_size == 1:
        batch_size += 1
    trainer = UnsupervisedTrainer(
        vae,
        self.gene_dataset,
        train_size=train_size,
        use_cuda=self.use_cuda,
        frequency=1,
        data_loader_kwargs={'batch_size': batch_size},
    )
    if self.train_size == 1.0:
        trainer._posteriors['test_set'].to_monitor = []
        trainer.metrics_to_monitor = {}
    return trainer

def training_score_scvi(train, **kwargs):
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    data = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    vae = VAE(n_input=train.shape[1])
    m = UnsupervisedTrainer(vae, data, verbose=False)
    m.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda).
    lam = np.vstack([
        m.train_set.sequential().imputation(),
        m.test_set.sequential().imputation(),
    ])
    # `st` is scipy.stats, assumed imported at module level
    return st.poisson(mu=lam).logpmf(train).sum()

def train_both(self, n_epochs=20):
    vae_both = VAE(
        self.full_dataset.nb_genes,
        n_latent=self.n_latent,
        n_batch=self.full_dataset.n_batches,
        dispersion="gene-batch",
        reconstruction_loss=self.reconstruction_seq,
    )
    self.trainer_both = UnsupervisedTrainer(
        vae_both,
        self.full_dataset,
        train_size=0.95,
        use_cuda=self.USE_CUDA,
        frequency=1,
    )
    self.trainer_both.train(n_epochs=n_epochs, lr=0.001)

def generalization_score_scvi(train, test, **kwargs):
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    data = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    vae = VAE(n_input=train.shape[1])
    m = UnsupervisedTrainer(vae, data, verbose=False)
    m.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda).
    with torch.autograd.set_grad_enabled(False):
        lam = np.vstack([
            m.train_set.sequential().imputation(),
            m.test_set.sequential().imputation(),
        ])
    # `pois_llik` is a scoring helper assumed to be defined in the enclosing module
    return pois_llik(lam, train, test)

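# Illustrative call of the two scoring functions above -- a sketch, not from the
# original source. The random Poisson matrices stand in for a real train/test
# split of counts; running this requires the module-level `st` (scipy.stats)
# and `pois_llik` helpers the functions rely on.
import numpy as np

rng = np.random.RandomState(0)
train = rng.poisson(2.0, size=(200, 50))
test = rng.poisson(2.0, size=(200, 50))
print(training_score_scvi(train))              # Poisson log-likelihood on train
print(generalization_score_scvi(train, test))  # held-out score via pois_llik
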
def test_autozi(save_path):
    data = SyntheticDataset(n_batches=1)
    for disp_zi in ["gene", "gene-label"]:
        autozivae = AutoZIVAE(
            n_input=data.nb_genes,
            dispersion=disp_zi,
            zero_inflation=disp_zi,
            n_labels=data.n_labels,
        )
        trainer_autozivae = UnsupervisedTrainer(
            model=autozivae, gene_dataset=data, train_size=0.5
        )
        trainer_autozivae.train(n_epochs=2, lr=1e-2)
        trainer_autozivae.test_set.elbo()
        trainer_autozivae.test_set.reconstruction_error()
        trainer_autozivae.test_set.marginal_ll()