def totalvi_benchmark(dataset, n_epochs, use_cuda=True):
    """Train a TOTALVI model on ``dataset`` and exercise its test-set posterior.

    A ``TotalTrainer`` is built with ``train_size=0.5`` and
    ``early_stopping_kwargs=None``, trained for ``n_epochs`` epochs, and then a
    fixed sequence of methods is called on ``trainer.test_set``. Every return
    value is discarded, so only an exception raised by one of the calls is
    observable to the caller.

    Args:
        dataset: Object exposing ``nb_genes``, ``protein_names`` and
            ``n_batches``; it is also handed to the trainer unchanged.
        n_epochs: Number of epochs forwarded to ``trainer.train``.
        use_cuda: Forwarded to ``TotalTrainer``.

    Returns:
        The trained ``TotalTrainer`` instance.
    """
    n_genes = dataset.nb_genes
    n_proteins = len(dataset.protein_names)
    model = TOTALVI(n_genes, n_proteins, n_batch=dataset.n_batches)

    trainer = TotalTrainer(
        model,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
    )
    trainer.train(n_epochs=n_epochs)

    # (method name, keyword arguments) pairs, invoked strictly in this order.
    posterior_calls = (
        ("reconstruction_error", {}),
        ("marginal_ll", {}),
        ("get_protein_background_mean", {}),
        ("get_latent", {}),
        ("generate", {}),
        ("get_sample_dropout", {}),
        # Listed twice: the call is issued two times back to back with the
        # same arguments.
        ("get_normalized_denoised_expression", {"transform_batch": 0}),
        ("get_normalized_denoised_expression", {"transform_batch": 0}),
        ("imputation", {}),
        ("get_protein_mean", {}),
        ("one_vs_all_degenes", {"n_samples": 2, "M_permutation": 10}),
        ("generate_feature_correlation_matrix", {"n_samples": 2}),
        (
            "generate_feature_correlation_matrix",
            {"n_samples": 2, "transform_batch": 0},
        ),
    )
    for method_name, call_kwargs in posterior_calls:
        # ``trainer.test_set`` is looked up afresh for every call.
        getattr(trainer.test_set, method_name)(**call_kwargs)

    return trainer
def test_special_dataset_size(self):
    """Run one training epoch of three trainers on a 34-cell dataset.

    The dataset holds ``17 * 2`` cells with 10 gene columns and 10 protein
    columns of random integer counts. Every trainer uses ``train_size=0.5``
    and a data-loader batch size of 8.

    NOTE(review): 17 = 2 * 8 + 1, so presumably each split ends with a
    mini-batch containing a single cell -- confirm against the trainers'
    split logic.
    """
    n_cells = 17 * 2
    n_features = 10
    batch_size = 8
    count_shape = (n_cells, n_features)

    # Build the dataset: gene counts first, then the protein measurement.
    gene_dataset = GeneExpressionDataset()
    gene_counts = np.random.randint(1, 100, count_shape)
    protein_counts = np.random.randint(1, 100, count_shape)
    gene_dataset.populate_from_data(gene_counts)
    gene_dataset.initialize_cell_measurement(
        CellMeasurement(
            name="protein_expression",
            data=protein_counts,
            columns_attr_name="protein_names",
            columns=np.arange(n_features),
        )
    )

    # --- UnsupervisedTrainer with a plain VAE ---
    vae = VAE(
        gene_dataset.nb_genes,
        n_batch=gene_dataset.n_batches,
        n_labels=gene_dataset.n_labels,
    )
    unsupervised_trainer = UnsupervisedTrainer(
        vae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": batch_size},
    )
    unsupervised_trainer.train(n_epochs=1)

    # --- JVAETrainer, with the same dataset supplied twice ---
    jvae = JVAE(
        [gene_dataset.nb_genes, gene_dataset.nb_genes],
        gene_dataset.nb_genes,
        [slice(None)] * 2,
        ["zinb", "zinb"],
        [True, True],
        n_batch=1,
    )
    discriminator = Classifier(gene_dataset.nb_genes, n_labels=2, logits=True)
    joint_trainer = JVAETrainer(
        jvae,
        discriminator,
        [gene_dataset, gene_dataset],
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": batch_size},
    )
    joint_trainer.train(n_epochs=1)

    # --- TotalTrainer with TOTALVI ---
    totalvae = TOTALVI(gene_dataset.nb_genes, len(gene_dataset.protein_names))
    total_trainer = TotalTrainer(
        totalvae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": batch_size},
        early_stopping_kwargs=None,
    )
    total_trainer.train(n_epochs=1)
def totalvi_benchmark(dataset, n_epochs, use_cuda=True):
    """Train a TOTALVI model on ``dataset`` and exercise its test-set posterior.

    A ``TotalTrainer`` is built with ``train_size=0.5``, trained for
    ``n_epochs`` epochs, and then a fixed sequence of argument-free methods is
    called on ``trainer.test_set``. Every return value is discarded, so only
    an exception raised by one of the calls is observable to the caller.

    Args:
        dataset: Object exposing ``nb_genes``, ``protein_names`` and
            ``n_batches``; it is also handed to the trainer unchanged.
        n_epochs: Number of epochs forwarded to ``trainer.train``.
        use_cuda: Forwarded to ``TotalTrainer``.

    Returns:
        The trained ``TotalTrainer`` instance.
    """
    n_genes = dataset.nb_genes
    n_proteins = len(dataset.protein_names)
    model = TOTALVI(n_genes, n_proteins, n_batch=dataset.n_batches)

    trainer = TotalTrainer(model, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)

    # Posterior methods, all called without arguments, strictly in this order.
    posterior_methods = (
        "reconstruction_error",
        "marginal_ll",
        "get_protein_background_mean",
        "get_latent",
        "generate",
        "get_sample_dropout",
        "get_normalized_denoised_expression",
        "imputation",
    )
    for method_name in posterior_methods:
        # ``trainer.test_set`` is looked up afresh for every call.
        getattr(trainer.test_set, method_name)()

    return trainer
def test_totalvi(save_path):
    """Smoke-test totalVI training, adversarial training and posterior reload.

    Steps, in order:
      1. Run ``totalvi_benchmark`` on synthetic data with one batch, then
         with two batches.
      2. Train a fresh TOTALVI model on the two-batch data with
         ``use_adversarial_loss=True``.
      3. Save a ``TotalPosterior`` over all cells to a temporary directory,
         load it back with a new TOTALVI model, and check the posterior type
         and the protein expression matrix.

    Args:
        save_path: Not read inside this function.
    """
    # Steps 1: benchmark on one batch, then on two batches. After the loop
    # ``dataset`` refers to the two-batch synthetic dataset.
    for n_batches in (1, 2):
        dataset = SyntheticDataset(n_batches=n_batches)
        totalvi_benchmark(dataset, n_epochs=1, use_cuda=use_cuda)

    # Step 2: adversarial training on the two-batch dataset.
    totalvae = TOTALVI(
        dataset.nb_genes,
        len(dataset.protein_names),
        n_batch=dataset.n_batches,
    )
    trainer = TotalTrainer(
        totalvae,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
        use_adversarial_loss=True,
    )
    trainer.train(n_epochs=1)

    # Step 3: save / load round trip. The checks stay inside the context so
    # the temporary directory still exists while the reloaded posterior is
    # inspected.
    with tempfile.TemporaryDirectory() as scratch_dir:
        posterior_dir = os.path.join(scratch_dir, "posterior_data")

        saved_posterior = trainer.create_posterior(
            totalvae,
            dataset,
            indices=np.arange(len(dataset)),
            type_class=TotalPosterior,
        )
        saved_posterior.save_posterior(posterior_dir)

        fresh_model = TOTALVI(
            dataset.nb_genes,
            len(dataset.protein_names),
            n_batch=dataset.n_batches,
        )
        reloaded_posterior = load_posterior(
            posterior_dir, model=fresh_model, use_cuda=False
        )

        assert reloaded_posterior.posterior_type == "TotalPosterior"
        assert np.array_equal(
            reloaded_posterior.gene_dataset.protein_expression,
            dataset.protein_expression,
        )
def test_totalvi(save_path):
    """Smoke-test totalVI training with and without the adversarial loss.

    ``totalvi_benchmark`` is run on synthetic data with one batch and then
    with two batches; afterwards a fresh TOTALVI model is trained for one
    epoch on the two-batch data with ``use_adversarial_loss=True``.

    Args:
        save_path: Not read inside this function.
    """
    # Benchmark on one batch, then on two batches. After the loop ``dataset``
    # refers to the two-batch synthetic dataset.
    for n_batches in (1, 2):
        dataset = SyntheticDataset(n_batches=n_batches)
        totalvi_benchmark(dataset, n_epochs=1, use_cuda=use_cuda)

    # Adversarial training on the two-batch dataset.
    totalvae = TOTALVI(
        dataset.nb_genes,
        len(dataset.protein_names),
        n_batch=dataset.n_batches,
    )
    adversarial_trainer = TotalTrainer(
        totalvae,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
        use_adversarial_loss=True,
    )
    adversarial_trainer.train(n_epochs=1)
# Early-stopping configuration handed to the trainer below.
early_stopping_kwargs = {
    "early_stopping_metric": "elbo",
    "save_best_state_metric": "elbo",
    "patience": 45,
    "threshold": 0,
    "reduce_lr_on_plateau": True,
    "lr_patience": 30,
    "lr_factor": 0.6,
    "posterior_class": TotalPosterior,
}

# Train with a 90% / 10% train / test split for at most 500 epochs.
loader_kwargs = {"batch_size": 256, "pin_memory": False}
trainer = TotalTrainer(
    model,
    dataset,
    train_size=0.9,
    test_size=0.1,
    use_cuda=use_cuda,
    frequency=1,
    data_loader_kwargs=loader_kwargs,
    early_stopping_kwargs=early_stopping_kwargs,
)
trainer.train(lr=lr, n_epochs=500)

# Create a posterior over the full data (every index of ``dataset``).
full_data_indices = np.arange(len(dataset))
full_posterior = trainer.create_posterior(
    model,
    dataset,
    indices=full_data_indices,
    type_class=TotalPosterior,
)

# Persist the trained weights; ``n`` supplies the file stem.
model_state = trainer.model.state_dict()
checkpoint_path = "differential_expression/saved_models/" + n + ".pt"
torch.save(model_state, checkpoint_path)
def test_differential_expression(save_path):
    """Exercise posterior save/load, scale sampling and DE scoring.

    Part one trains a ``VAE`` on ``CortexDataset`` for two epochs, round-trips
    its posterior through a temporary directory, samples scales, and runs
    ``differential_expression_score`` in ``"vanilla"`` and ``"change"`` mode,
    checking the interval columns and the ``proba_de`` range.

    Part two trains ``TOTALVI`` on the ``pbmc_10k_protein_v3`` 10X dataset for
    two epochs and runs the same two DE modes on a ``TotalPosterior``.

    Args:
        save_path: Directory passed to ``CortexDataset``; the 10X dataset
            uses its ``"10X"`` sub-directory.
    """
    # Keyword arguments shared by every differential-expression call below.
    permutation_kwargs = dict(n_samples=10, use_permutation=True, M_permutation=100)

    # ------------------------------------------------------------------
    # Part one: VAE on CortexDataset
    # ------------------------------------------------------------------
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)

    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)

    # Save / load round trip.
    # NOTE(review): the comparisons are kept inside the context so the
    # temporary directory still exists while the reloaded posterior is read.
    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        new_vae = VAE(dataset.nb_genes, dataset.n_batches)
        new_post = load_posterior(posterior_save_path, model=new_vae, use_cuda=False)
        assert np.array_equal(new_post.indices, post.indices)
        assert np.array_equal(new_post.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    sampled = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )
    px_scales = sampled["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression, different modes
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1, idx2=idx_2, mode="vanilla", **permutation_kwargs
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        mode="change",
        cred_interval_lvls=[0.5, 0.95],
        **permutation_kwargs,
    )
    print(de_dataframe.keys())

    # Each interval's lower bound must not exceed its upper bound.
    interval_columns = (
        ("confidence_interval_0.5_min", "confidence_interval_0.5_max"),
        ("confidence_interval_0.95_min", "confidence_interval_0.95_max"),
    )
    for lower_column, upper_column in interval_columns:
        assert (de_dataframe[lower_column] <= de_dataframe[upper_column]).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

    # ------------------------------------------------------------------
    # Part two: totalVI DE on the 10X protein dataset
    # ------------------------------------------------------------------
    sp = os.path.join(save_path, "10X")
    dataset = Dataset10X(dataset_name="pbmc_10k_protein_v3", save_path=sp)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)

    vae = TOTALVI(
        dataset.nb_genes,
        len(dataset.protein_names),
        n_batch=dataset.n_batches,
    )
    trainer = TotalTrainer(
        vae,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae,
        dataset,
        shuffle=False,
        indices=all_indices,
        type_class=TotalPosterior,
    )

    # Differential expression, different modes; ``de_dataframe`` ends up
    # holding the "change" result.
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    for mode in ("vanilla", "change"):
        de_dataframe = post.differential_expression_score(
            idx1=idx_1, idx2=idx_2, mode=mode, **permutation_kwargs
        )