def test_extra_covariates_transfer():
    """Transfer extra covariate setup to a second AnnData via the manager API,
    then extend the categorical mappings with a previously unseen category."""
    adata = synthetic_iid()
    adata.obs["cont1"] = np.random.normal(size=(adata.shape[0], ))
    adata.obs["cont2"] = np.random.normal(size=(adata.shape[0], ))
    adata.obs["cat1"] = np.random.randint(0, 5, size=(adata.shape[0], ))
    adata.obs["cat2"] = np.random.randint(0, 5, size=(adata.shape[0], ))
    adata_manager = generic_setup_adata_manager(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )

    # target AnnData only uses categories already known to the source setup
    bdata = synthetic_iid()
    bdata.obs["cont1"] = np.random.normal(size=(bdata.shape[0], ))
    bdata.obs["cont2"] = np.random.normal(size=(bdata.shape[0], ))
    bdata.obs["cat1"] = 0
    bdata.obs["cat2"] = 1
    adata_manager.transfer_setup(bdata)

    # give it a new category
    bdata.obs["cat1"] = 6
    bdata_manager = adata_manager.transfer_setup(bdata, extend_categories=True)
    # the unseen category is appended at the end of the extended mapping
    assert (bdata_manager.get_state_registry(
        REGISTRY_KEYS.CAT_COVS_KEY).mappings["cat1"][-1] == 6)
def test_gimvi():
    """GIMVI with explicit setup_anndata: library-size params only for the
    first dataset, training/inference smoke test, and ValueError on
    mismatched var_names."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()
    GIMVI.setup_anndata(
        adata_seq,
        batch_key="batch",
        labels_key="labels",
    )
    GIMVI.setup_anndata(
        adata_spatial,
        batch_key="batch",
        labels_key="labels",
    )
    model = GIMVI(adata_seq, adata_spatial, n_latent=10)
    # default: library size modeled for dataset 0 only
    assert hasattr(model.module, "library_log_means_0") and not hasattr(
        model.module, "library_log_means_1")
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()

    # disjoint gene names between the two datasets must be rejected
    adata_spatial.var_names += "asdf"
    GIMVI.setup_anndata(
        adata_spatial,
        batch_key="batch",
        labels_key="labels",
    )
    with pytest.raises(ValueError):
        model = GIMVI(adata_seq, adata_spatial)
def test_totalvi_online_update(save_path):
    """TOTALVI architecture surgery: load_query_data on a query AnnData with
    new batches, from disk and from an in-memory model, including a query
    batch whose protein expression is all zero."""
    # basic case
    n_latent = 5
    adata1 = synthetic_iid()
    model = TOTALVI(adata1, n_latent=n_latent, use_batch_norm="decoder")
    model.train(1, check_val_every_n_epoch=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])

    model2 = TOTALVI.load_query_data(adata2, dir_path)
    # query-side background parameters must stay trainable
    assert model2.module.background_pro_alpha.requires_grad is True
    model2.train(max_epochs=1)
    model2.get_latent_representation()

    # batch 3 has no proteins
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])
    adata2.obsm["protein_expression"][adata2.obs.batch == "batch_3"] = 0

    # load from model in memory
    model3 = TOTALVI.load_query_data(adata2, model)
    # per-batch protein masks must exist for both query batches (KeyError otherwise)
    model3.module.protein_batch_mask[2]
    model3.module.protein_batch_mask[3]
    model3.train(max_epochs=1)
    model3.get_latent_representation()
def test_gimvi():
    """Legacy-trainer GIMVI: history bookkeeping, loss-magnitude report,
    discriminator confusion, and ValueError on mismatched var_names."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()
    model = GIMVI(adata_seq, adata_spatial, n_latent=10)
    model.get_latent_representation()
    model.get_imputed_values()
    model.train(1, frequency=1, early_stopping_kwargs=None, train_size=0.5)

    # with frequency=1 and one epoch, two entries are expected per metric
    assert len(model.history["elbo_train_0"]) == 2
    assert len(model.history["elbo_train_1"]) == 2
    assert len(model.history["elbo_test_0"]) == 2
    assert len(model.history["elbo_test_1"]) == 2

    trainer = model.trainer
    results = pd.DataFrame(
        trainer.get_loss_magnitude(),
        index=["reconstruction", "kl_divergence", "discriminator"],
        columns=["Sequencing", "Spatial"],
    )
    results.columns.name = "Dataset"
    results.index.name = "Loss"
    trainer.get_discriminator_confusion()

    # disjoint gene names between the two datasets must be rejected
    adata_spatial.var_names += "asdf"
    with pytest.raises(ValueError):
        model = GIMVI(adata_seq, adata_spatial)
def test_extra_covariates_transfer():
    """Legacy setup API: transfer extra covariate setup between AnnData
    objects and extend categorical mappings with an unseen category."""
    adata = synthetic_iid()
    adata.obs["cont1"] = np.random.normal(size=(adata.shape[0],))
    adata.obs["cont2"] = np.random.normal(size=(adata.shape[0],))
    adata.obs["cat1"] = np.random.randint(0, 5, size=(adata.shape[0],))
    adata.obs["cat2"] = np.random.randint(0, 5, size=(adata.shape[0],))
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    bdata = synthetic_iid()
    bdata.obs["cont1"] = np.random.normal(size=(bdata.shape[0],))
    bdata.obs["cont2"] = np.random.normal(size=(bdata.shape[0],))
    bdata.obs["cat1"] = 0
    bdata.obs["cat2"] = 1

    transfer_anndata_setup(adata_source=adata, adata_target=bdata)

    # give it a new category
    # drop the stale setup dict so the transfer below starts fresh
    del bdata.uns["_scvi"]
    bdata.obs["cat1"] = 6
    transfer_anndata_setup(
        adata_source=adata, adata_target=bdata, extend_categories=True
    )
    # the unseen category is appended at the end of the extended mapping
    assert bdata.uns["_scvi"]["extra_categoricals"]["mappings"]["cat1"][-1] == 6
def test_scvi_library_size_update(save_path):
    """SCVI with use_observed_lib_size=False: per-batch library log-mean/var
    buffers are created, and load_query_data extends them (reference stats
    preserved in the first entries), with in-place query var subsetting."""
    n_latent = 5
    adata1 = synthetic_iid()
    model = SCVI(adata1, n_latent=n_latent, use_observed_lib_size=False)
    # two reference batches -> shape (1, 2), all entries populated
    assert (getattr(model.module, "library_log_means", None) is not None
            and model.module.library_log_means.shape == (1, 2)
            and model.module.library_log_means.count_nonzero().item() == 2)
    assert getattr(
        model.module, "library_log_vars",
        None) is not None and model.module.library_log_vars.shape == (1, 2)

    model.train(1, check_val_every_n_epoch=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # also test subset var option
    adata2 = synthetic_iid(run_setup_anndata=False, n_genes=110)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(
        ["batch_2", "batch_3"])
    model2 = SCVI.load_query_data(adata2, dir_path, inplace_subset_query_vars=True)
    # two new query batches appended; reference stats occupy the first two slots
    assert (getattr(model2.module, "library_log_means", None) is not None
            and model2.module.library_log_means.shape == (1, 4)
            and model2.module.library_log_means[:, :2].equal(
                model.module.library_log_means)
            and model2.module.library_log_means.count_nonzero().item() == 4)
    assert (getattr(model2.module, "library_log_vars", None) is not None
            and model2.module.library_log_vars.shape == (1, 4)
            and model2.module.library_log_vars[:, :2].equal(
                model.module.library_log_vars))
def test_lda_model():
    """Amortized LDA: scalar and sequence priors both construct; topic/feature
    and cell/topic outputs are valid probability distributions; ELBO and
    perplexity run on training and held-out AnnData."""
    use_gpu = torch.cuda.is_available()
    n_topics = 5
    adata = synthetic_iid(run_setup_anndata=False)

    # Test with float and Sequence priors.
    AmortizedLDA.setup_anndata(adata)
    mod1 = AmortizedLDA(
        adata, n_topics=n_topics, cell_topic_prior=1.5, topic_feature_prior=1.5
    )
    mod1.train(
        max_epochs=1,
        batch_size=256,
        lr=0.01,
        use_gpu=use_gpu,
    )
    mod2 = AmortizedLDA(
        adata,
        n_topics=n_topics,
        cell_topic_prior=[1.5 for _ in range(n_topics)],
        topic_feature_prior=[1.5 for _ in range(adata.n_vars)],
    )
    mod2.train(
        max_epochs=1,
        batch_size=256,
        lr=0.01,
        use_gpu=use_gpu,
    )

    mod = AmortizedLDA(adata, n_topics=n_topics)
    mod.train(
        max_epochs=5,
        batch_size=256,
        lr=0.01,
        use_gpu=use_gpu,
    )
    adata_gbt = mod.get_feature_by_topic().to_numpy()
    # each topic's feature distribution sums to one
    assert np.allclose(adata_gbt.sum(axis=0), 1)
    adata_lda = mod.get_latent_representation(adata).to_numpy()
    # per-cell topic proportions: right shape, in [0, 1], rows sum to one
    assert (
        adata_lda.shape == (adata.n_obs, n_topics)
        and np.all((adata_lda <= 1) & (adata_lda >= 0))
        and np.allclose(adata_lda.sum(axis=1), 1)
    )
    mod.get_elbo()
    mod.get_perplexity()

    # same checks on a second, freshly set-up AnnData
    adata2 = synthetic_iid(run_setup_anndata=False)
    AmortizedLDA.setup_anndata(adata2)
    adata2_lda = mod.get_latent_representation(adata2).to_numpy()
    assert (
        adata2_lda.shape == (adata2.n_obs, n_topics)
        and np.all((adata2_lda <= 1) & (adata2_lda >= 0))
        and np.allclose(adata2_lda.sum(axis=1), 1)
    )
    mod.get_elbo(adata2)
    mod.get_perplexity(adata2)
def test_save_and_load(save_path, legacy=False):
    """GIMVI save/load round trip (optionally via the legacy on-disk format):
    latent representations must be reproduced after reload, and loading with
    mismatched AnnData must raise."""
    prefix = "GIMVI_"
    adata = synthetic_iid()
    GIMVI.setup_anndata(
        adata,
        batch_key="batch",
    )
    adata2 = synthetic_iid()
    GIMVI.setup_anndata(
        adata2,
        batch_key="batch",
    )

    # GIMVI
    model = GIMVI(adata, adata2)
    model.train(3, train_size=0.5)
    # latent representation is deterministic for a trained model
    z1 = model.get_latent_representation([adata])
    z2 = model.get_latent_representation([adata])
    np.testing.assert_array_equal(z1, z2)
    if legacy:
        legacy_save(model, save_path, overwrite=True, save_anndata=True, prefix=prefix)
    else:
        model.save(save_path, overwrite=True, save_anndata=True, prefix=prefix)
    model = GIMVI.load(save_path, prefix=prefix)
    model.get_latent_representation()

    # loading with AnnDatas of the wrong gene count must fail
    tmp_adata = scvi.data.synthetic_iid(n_genes=200)
    tmp_adata2 = scvi.data.synthetic_iid(n_genes=200)
    with pytest.raises(ValueError):
        GIMVI.load(save_path, adata_seq=tmp_adata, adata_spatial=tmp_adata2, prefix=prefix)

    # reload with the original AnnDatas reproduces the latents exactly
    model = GIMVI.load(save_path, adata_seq=adata, adata_spatial=adata2, prefix=prefix)
    z2 = model.get_latent_representation([adata])
    np.testing.assert_array_equal(z1, z2)

    # CPU reload: allow tiny numeric differences
    model = GIMVI.load(
        save_path,
        adata_seq=adata,
        adata_spatial=adata2,
        use_gpu=False,
        prefix=prefix,
    )
    z2 = model.get_latent_representation([adata])
    np.testing.assert_almost_equal(z1, z2, decimal=3)
    assert model.is_trained is True
def test_gimvi():
    """Smoke test: GIMVI trains and produces latents and imputed values;
    mismatched var_names between the two datasets must raise ValueError."""
    seq_data = synthetic_iid()
    spatial_data = synthetic_iid()

    model = GIMVI(seq_data, spatial_data, n_latent=10)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()

    # Disjoint gene names across the two modalities is invalid input.
    spatial_data.var_names += "asdf"
    with pytest.raises(ValueError):
        model = GIMVI(seq_data, spatial_data)
def test_scanvi(save_path):
    """SCANVI end to end: logged training keys, prediction variants, DE,
    fully- and partially-labeled data, and construction from a pre-trained
    SCVI model via from_scvi_model."""
    adata = synthetic_iid(run_setup_anndata=False)
    SCANVI.setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
    )
    model = SCANVI(adata, "label_0", n_latent=10)
    model.train(1, train_size=0.5, check_val_every_n_epoch=1)
    logged_keys = model.history.keys()
    assert "elbo_validation" in logged_keys
    assert "reconstruction_loss_validation" in logged_keys
    assert "kl_local_validation" in logged_keys
    assert "elbo_train" in logged_keys
    assert "reconstruction_loss_train" in logged_keys
    assert "kl_local_train" in logged_keys
    assert "classification_loss_validation" in logged_keys

    adata2 = synthetic_iid()
    predictions = model.predict(adata2, indices=[1, 2, 3])
    assert len(predictions) == 3
    model.predict()
    # soft=True returns per-class probabilities as a DataFrame
    df = model.predict(adata2, soft=True)
    assert isinstance(df, pd.DataFrame)
    model.predict(adata2, soft=True, indices=[1, 2, 3])
    model.get_normalized_expression(adata2)
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1", group2="label_2")

    # test that all data labeled runs
    unknown_label = "asdf"
    a = scvi.data.synthetic_iid()
    scvi.model.SCANVI.setup_anndata(a, batch_key="batch", labels_key="labels")
    m = scvi.model.SCANVI(a, unknown_label)
    m.train(1)

    # test mix of labeled and unlabeled data
    unknown_label = "label_0"
    a = scvi.data.synthetic_iid()
    scvi.model.SCANVI.setup_anndata(a, batch_key="batch", labels_key="labels")
    m = scvi.model.SCANVI(a, unknown_label)
    m.train(1, train_size=0.9)

    # test from_scvi_model
    a = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(a, use_observed_lib_size=False)
    a2 = scvi.data.synthetic_iid()
    scanvi_model = scvi.model.SCANVI.from_scvi_model(m, "label_0", adata=a2)
    scanvi_model = scvi.model.SCANVI.from_scvi_model(m, "label_0", use_labels_groups=False)
    scanvi_model.train(1)
def test_gimvi_model_library_size():
    """GIMVI with library-size modeling enabled for both datasets: the module
    must expose library parameters for each, and training/inference run."""
    rna = synthetic_iid()
    spatial = synthetic_iid()

    model = GIMVI(
        rna, spatial, model_library_size=[True, True], n_latent=10
    )
    # Library size is modeled for both datasets, so both parameter sets exist.
    assert hasattr(model.module, "library_log_means_0")
    assert hasattr(model.module, "library_log_means_1")

    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()
def test_save_load_model(cls, adata, save_path, prefix=None, legacy=False):
    """Save/load round trip for one model class: loads with mismatched genes
    or batches must fail; latents and validation indices survive reload."""
    # TOTALVI additionally needs the protein fields registered
    if cls is TOTALVI:
        cls.setup_anndata(
            adata,
            batch_key="batch",
            labels_key="labels",
            protein_expression_obsm_key="protein_expression",
            protein_names_uns_key="protein_names",
        )
    else:
        cls.setup_anndata(adata, batch_key="batch", labels_key="labels")
    model = cls(adata, latent_distribution="normal")
    model.train(1, train_size=0.2)
    z1 = model.get_latent_representation(adata)
    test_idx1 = model.validation_indices
    if legacy:
        legacy_save(model, save_path, overwrite=True, save_anndata=True, prefix=prefix)
    else:
        model.save(save_path, overwrite=True, save_anndata=True, prefix=prefix)
    model = cls.load(save_path, prefix=prefix)
    model.get_latent_representation()

    # Load with mismatched genes.
    tmp_adata = synthetic_iid(n_genes=200, )
    with pytest.raises(ValueError):
        cls.load(save_path, adata=tmp_adata, prefix=prefix)

    # Load with different batches.
    tmp_adata = synthetic_iid()
    tmp_adata.obs["batch"] = tmp_adata.obs["batch"].cat.rename_categories(
        ["batch_2", "batch_3"])
    with pytest.raises(ValueError):
        cls.load(save_path, adata=tmp_adata, prefix=prefix)

    model = cls.load(save_path, adata=adata, prefix=prefix)
    # batch registration must survive the round trip
    assert "batch" in model.adata_manager.data_registry
    assert model.adata_manager.data_registry["batch"] == dict(
        attr_name="obs", attr_key="_scvi_batch")

    z2 = model.get_latent_representation()
    test_idx2 = model.validation_indices
    np.testing.assert_array_equal(z1, z2)
    np.testing.assert_array_equal(test_idx1, test_idx2)
    assert model.is_trained is True
def test_view_anndata_setup(save_path):
    """view_anndata_setup accepts an AnnData, a setup dict, a saved-model
    folder (with or without a saved AnnData), or an .h5ad path, and raises
    ValueError for un-setup AnnData or unsupported argument types."""
    adata = synthetic_iid(run_setup_anndata=False)
    # NOTE(review): np.random.uniform(5, adata.n_obs) has no size argument and
    # returns a single scalar (low=5, high=n_obs), so these columns start out
    # constant — confirm whether size=adata.n_obs was intended.
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    # Use .loc instead of chained indexing (adata.obs["cont1"][0] = ...):
    # chained assignment is deprecated and silently ineffective under pandas
    # copy-on-write; .loc on a positional index label is the supported form.
    adata.obs.loc[adata.obs.index[0], "cont1"] = 939543895847598301.423432423523512351234123421341234
    adata.obs.loc[adata.obs.index[1], "cont2"] = 0.12938471298374691827634
    adata.obs["cat1"] = np.random.randint(0, 5, adata.n_obs).astype(str)
    adata.obs.loc[adata.obs.index[8], "cat1"] = "asdf"
    adata.obs.loc[adata.obs.index[9], "cat1"] = "f34"
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)

    setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )

    # test it works with adata
    view_anndata_setup(adata)

    # test it works with scvi setup dict
    view_anndata_setup(adata.uns["_scvi"])

    adata = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(adata)
    folder_path = os.path.join(save_path, "tmp")
    m.save(folder_path, save_anndata=True)

    # test it works with a saved model folder
    view_anndata_setup(folder_path)
    adata_path = os.path.join(folder_path, "adata.h5ad")
    # test it works with the path to an anndata
    view_anndata_setup(adata_path)

    m = scvi.model.SCVI(adata)
    m.save(folder_path, overwrite=True)
    # test it works without saving the anndata
    view_anndata_setup(folder_path)

    # test it throws error if adata was not setup
    with pytest.raises(ValueError):
        adata = synthetic_iid(run_setup_anndata=False)
        view_anndata_setup(adata)

    # test it throws error if we dont pass dict, anndata or str in
    with pytest.raises(ValueError):
        view_anndata_setup(0)
def test_scanvi_online_update(save_path):
    """Legacy SCANVI online update via load_query_data: first a reference with
    semi-observed labels, then a fully-labeled reference with a query that
    introduces a new label."""
    # ref has semi-observed labels
    n_latent = 5
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata1.obs["labels"] = pd.Categorical(new_labels)
    setup_anndata(adata1, batch_key="batch", labels_key="labels")
    model = SCANVI(adata1, "Unknown", n_latent=n_latent, encode_covariates=True)
    model.train(n_epochs_unsupervised=1, n_epochs_semisupervised=1, frequency=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # query with entirely new batches and all labels unknown
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])
    adata2.obs["labels"] = "Unknown"
    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    model.train(
        n_epochs_unsupervised=1, n_epochs_semisupervised=1, train_base_model=False
    )
    model.get_latent_representation()
    model.predict()

    # ref has fully-observed labels
    n_latent = 5
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    adata1.obs["labels"] = pd.Categorical(new_labels)
    setup_anndata(adata1, batch_key="batch", labels_key="labels")
    model = SCANVI(adata1, "Unknown", n_latent=n_latent, encode_covariates=True)
    model.train(n_epochs_unsupervised=1, n_epochs_semisupervised=1, frequency=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # query has one new label
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])
    new_labels = adata2.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata2.obs["labels"] = pd.Categorical(new_labels)
    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    # force the whole query to be treated as unlabeled for training
    model._unlabeled_indices = np.arange(adata2.n_obs)
    model._labeled_indices = []
    model.train(
        n_epochs_unsupervised=1, n_epochs_semisupervised=1, train_base_model=False
    )
    model.get_latent_representation()
    model.predict()
def test_scanvi():
    """SCANVI smoke test with 'undefined_*' labels: prediction variants,
    normalized expression, and differential expression all run."""
    train_data = synthetic_iid()
    model = SCANVI(train_data, "undefined_0", n_latent=10)
    model.train(1)

    query_data = synthetic_iid()
    preds = model.predict(query_data, indices=[1, 2, 3])
    assert len(preds) == 3

    # every prediction entry point
    model.predict()
    model.predict(query_data, soft=True)
    model.predict(query_data, soft=True, indices=[1, 2, 3])

    model.get_normalized_expression(query_data)
    model.differential_expression(groupby="labels", group1="undefined_1")
    model.differential_expression(
        groupby="labels", group1="undefined_1", group2="undefined_2"
    )
def test_scanvi():
    """Legacy SCANVI: trainer history lengths plus prediction/DE entry points."""
    adata = synthetic_iid()
    model = SCANVI(adata, "label_0", n_latent=10)
    model.train(1, train_size=0.5, frequency=1)
    # NOTE(review): the exact lengths below depend on the legacy trainer's
    # recording schedule — confirm against the scvi version under test.
    assert len(model.history["unsupervised_trainer_history"]) == 2
    assert len(model.history["semisupervised_trainer_history"]) == 7

    adata2 = synthetic_iid()
    predictions = model.predict(adata2, indices=[1, 2, 3])
    assert len(predictions) == 3
    model.predict()
    model.predict(adata2, soft=True)
    model.predict(adata2, soft=True, indices=[1, 2, 3])
    model.get_normalized_expression(adata2)
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1", group2="label_2")
def test_pyro_bayesian_regression_jit():
    """Pyro Bayesian regression with a JIT-compiled ELBO: train through the
    scvi PyroTrainingPlan (with guide warmup), then draw Predictive samples."""
    use_gpu = int(torch.cuda.is_available())
    adata = synthetic_iid()
    train_dl = AnnDataLoader(adata, shuffle=True, batch_size=128)
    pyro.clear_param_store()
    model = BayesianRegressionModule(adata.shape[1], 1)
    # NOTE(review): train_dl is constructed twice; the first assignment above
    # appears redundant — confirm before removing.
    train_dl = AnnDataLoader(adata, shuffle=True, batch_size=128)
    plan = PyroTrainingPlan(model, loss_fn=pyro.infer.JitTrace_ELBO())
    trainer = Trainer(gpus=use_gpu, max_epochs=2, callbacks=[PyroJitGuideWarmup(train_dl)])
    trainer.fit(plan, train_dl)

    # 100 features, 1 for sigma, 1 for bias
    assert list(model.guide.parameters())[0].shape[0] == 102
    if use_gpu == 1:
        model.cuda()

    # test Predictive
    num_samples = 5
    predictive = model.create_predictive(num_samples=num_samples)
    for tensor_dict in train_dl:
        args, kwargs = model._get_fn_args_from_batch(tensor_dict)
        # drop the observed site; keep sampled latents as numpy arrays
        _ = {
            k: v.detach().cpu().numpy()
            for k, v in predictive(*args, **kwargs).items()
            if k != "obs"
        }
def test_save_best_state_callback(save_path):
    """The SaveBestState callback runs cleanly through a short SCVI fit."""
    latent_dim = 5
    data = synthetic_iid()
    SCVI.setup_anndata(data, batch_key="batch", labels_key="labels")

    model = SCVI(data, n_latent=latent_dim)
    model.train(
        3,
        check_val_every_n_epoch=1,
        train_size=0.5,
        callbacks=[SaveBestState(verbose=True)],
    )
def test_multiple_covariates_scvi(save_path):
    """SCVI, SCANVI and TOTALVI all train with both continuous and categorical
    extra covariates registered."""
    adata = synthetic_iid()
    adata.obs["cont1"] = np.random.normal(size=(adata.shape[0], ))
    adata.obs["cont2"] = np.random.normal(size=(adata.shape[0], ))
    adata.obs["cat1"] = np.random.randint(0, 5, size=(adata.shape[0], ))
    adata.obs["cat2"] = np.random.randint(0, 5, size=(adata.shape[0], ))
    SCVI.setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    m = SCVI(adata)
    m.train(1)
    # SCANVI reuses the SCVI setup on the same AnnData
    m = SCANVI(adata, unlabeled_category="Unknown")
    m.train(1)

    TOTALVI.setup_anndata(
        adata,
        batch_key="batch",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    m = TOTALVI(adata)
    m.train(1)
def test_multivi():
    """MULTIVI smoke test: training option variants plus every major
    inference/evaluation entry point."""
    adata = synthetic_iid(run_setup_anndata=False)
    MULTIVI.setup_anndata(
        adata,
        batch_key="batch",
    )
    model = MULTIVI(
        adata,
        n_genes=50,
        n_regions=50,
    )

    # training option variants
    model.train(1, save_best=False)
    model.train(1, adversarial_mixing=False)
    model.train(3)

    # evaluation / inference entry points
    model.get_elbo(indices=model.validation_indices)
    model.get_accessibility_estimates()
    model.get_accessibility_estimates(normalize_cells=True)
    model.get_accessibility_estimates(normalize_regions=True)
    model.get_normalized_expression()
    model.get_library_size_factors()
    model.get_region_factors()
    model.get_reconstruction_error(indices=model.validation_indices)
    model.get_latent_representation()
    model.differential_accessibility(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1")
def test_pyro_bayesian_train_sample_mixin_with_local_full_data():
    """Pyro model trained full-batch (batch_size=None) with per-cell weights:
    check the guide weight shape and posterior-sample output shapes."""
    use_gpu = torch.cuda.is_available()
    adata = synthetic_iid()
    mod = BayesianRegressionModel(adata, per_cell_weight=True)
    mod.train(
        max_epochs=2,
        batch_size=None,
        lr=0.01,
        train_size=1,  # does not work when there is a validation set.
        use_gpu=use_gpu,
    )

    # guide weight matrix spans all 100 input features
    assert list(mod.module.guide.state_dict()
                ["locs.linear.weight_unconstrained"].shape) == [1, 100]

    # test posterior sampling
    samples = mod.sample_posterior(num_samples=10,
                                   use_gpu=use_gpu,
                                   batch_size=adata.n_obs,
                                   return_samples=True)

    # one sigma per sample; one weight per sample per cell
    assert len(samples["posterior_samples"]["sigma"]) == 10
    assert samples["posterior_samples"]["per_cell_weights"].shape == (
        10,
        adata.n_obs,
        1,
    )
def test_destvi(save_path):
    """DestVI pipeline: train CondSCVI on single-cell data, then DestVI under
    each amortization scheme, checking output shapes and finite ELBO."""
    # Step1 learn CondSCVI
    n_latent = 2
    n_labels = 5
    n_layers = 2
    dataset = synthetic_iid(n_labels=n_labels)
    sc_model = CondSCVI(dataset, n_latent=n_latent, n_layers=n_layers)
    sc_model.train(1, train_size=1)

    # step 2 learn destVI with multiple amortization scheme
    for amor_scheme in ["both", "none", "proportion", "latent"]:
        spatial_model = DestVI.from_rna_model(
            dataset,
            sc_model,
            amortization=amor_scheme,
        )
        spatial_model.train(max_epochs=1)
        assert not np.isnan(spatial_model.history["elbo_train"].values[0][0])

        # proportions: one row per spot, one column per cell type
        assert spatial_model.get_proportions().shape == (dataset.n_obs, n_labels)
        # gamma: latent values per spot per cell type
        assert spatial_model.get_gamma(return_numpy=True).shape == (
            dataset.n_obs,
            n_latent,
            n_labels,
        )
        assert spatial_model.get_scale_for_ct("label_0", np.arange(50)).shape == (
            50,
            dataset.n_vars,
        )
def test_pyro_bayesian_regression(save_path):
    """Pyro Bayesian regression: train with PyroTrainingPlan, draw Predictive
    samples, then round-trip the state dict through torch.save/torch.load and
    check guide quantiles are preserved exactly."""
    use_gpu = int(torch.cuda.is_available())
    adata = synthetic_iid()
    train_dl = AnnDataLoader(adata, shuffle=True, batch_size=128)
    pyro.clear_param_store()
    model = BayesianRegressionModule(adata.shape[1], 1)
    plan = PyroTrainingPlan(model)
    plan.n_obs_training = len(train_dl.indices)
    trainer = Trainer(
        gpus=use_gpu,
        max_epochs=2,
    )
    trainer.fit(plan, train_dl)
    if use_gpu == 1:
        model.cuda()

    # test Predictive
    num_samples = 5
    predictive = model.create_predictive(num_samples=num_samples)
    for tensor_dict in train_dl:
        args, kwargs = model._get_fn_args_from_batch(tensor_dict)
        # drop the observed site; keep sampled latents as numpy arrays
        _ = {
            k: v.detach().cpu().numpy()
            for k, v in predictive(*args, **kwargs).items()
            if k != "obs"
        }

    # test save and load
    # cpu/gpu has minor difference
    model.cpu()
    quants = model.guide.quantiles([0.5])
    sigma_median = quants["sigma"][0].detach().cpu().numpy()
    linear_median = quants["linear.weight"][0].detach().cpu().numpy()

    model_save_path = os.path.join(save_path, "model_params.pt")
    torch.save(model.state_dict(), model_save_path)

    pyro.clear_param_store()
    new_model = BayesianRegressionModule(adata.shape[1], 1)
    # Autoguide params are created lazily: run the model one step so the
    # state dict can load into a fresh module if the direct load fails.
    try:
        new_model.load_state_dict(torch.load(model_save_path))
    except RuntimeError as err:
        if isinstance(new_model, PyroBaseModuleClass):
            plan = PyroTrainingPlan(new_model)
            plan.n_obs_training = len(train_dl.indices)
            trainer = Trainer(
                gpus=use_gpu,
                max_steps=1,
            )
            trainer.fit(plan, train_dl)
            new_model.load_state_dict(torch.load(model_save_path))
        else:
            # bare raise re-raises with the original traceback intact
            # (unlike `raise err`, which restarts the traceback here)
            raise
    quants = new_model.guide.quantiles([0.5])
    sigma_median_new = quants["sigma"][0].detach().cpu().numpy()
    linear_median_new = quants["linear.weight"][0].detach().cpu().numpy()

    np.testing.assert_array_equal(sigma_median_new, sigma_median)
    np.testing.assert_array_equal(linear_median_new, linear_median)
def test_stereoscope(save_path):
    """Stereoscope: RNA model with and without cell-type weights, save/load
    round trips for both models, proportions, and per-cell-type imputation."""
    dataset = synthetic_iid(n_labels=5, run_setup_anndata=False)
    RNAStereoscope.setup_anndata(
        dataset,
        labels_key="labels",
    )

    # train with no proportions
    sc_model = RNAStereoscope(dataset)
    sc_model.train(max_epochs=1)

    # train again with proportions
    sc_model = RNAStereoscope(dataset, ct_weights=np.ones((5, )))
    sc_model.train(max_epochs=1)
    # test save/load
    sc_model.save(save_path, overwrite=True, save_anndata=True)
    sc_model = RNAStereoscope.load(save_path)

    # spatial model initialized from the trained RNA model
    st_model = SpatialStereoscope.from_rna_model(dataset, sc_model,
                                                 prior_weight="minibatch")
    st_model.train(max_epochs=1)
    st_model.get_proportions()
    # test save/load
    st_model.save(save_path, overwrite=True, save_anndata=True)
    st_model = SpatialStereoscope.load(save_path)
    st_model.get_proportions()

    # try imputation code
    y = np.array(50 * ["label_0"])
    st_model.get_scale_for_ct(y)
def test_peakvi():
    """PEAKVI smoke test: constructor variants, then every major output on a
    default-configured model."""
    adata = synthetic_iid()

    # without modeling sequencing depth
    vae = PEAKVI(
        adata,
        model_depth=False,
    )
    vae.train(1, save_best=False)

    # without region factors
    vae = PEAKVI(
        adata,
        region_factors=False,
    )
    vae.train(1, save_best=False)

    # default configuration
    vae = PEAKVI(
        adata,
    )
    vae.train(3)

    vae.get_elbo(indices=vae.validation_indices)
    vae.get_accessibility_estimates()
    vae.get_accessibility_estimates(normalize_cells=True)
    vae.get_accessibility_estimates(normalize_regions=True)
    vae.get_library_size_factors()
    vae.get_region_factors()
    vae.get_reconstruction_error(indices=vae.validation_indices)
    vae.get_latent_representation()
    vae.differential_accessibility(groupby="labels", group1="label_1")
def test_lda_model_save_load(save_path):
    """Amortized LDA save/load round trip: training history is identical and
    posterior summaries match up to Monte Carlo noise."""
    use_gpu = torch.cuda.is_available()
    n_topics = 5
    adata = synthetic_iid(run_setup_anndata=False)
    AmortizedLDA.setup_anndata(adata)
    mod = AmortizedLDA(adata, n_topics=n_topics)
    mod.train(
        max_epochs=5,
        batch_size=256,
        lr=0.01,
        use_gpu=use_gpu,
    )
    hist_elbo = mod.history_["elbo_train"]
    feature_by_topic_1 = mod.get_feature_by_topic(n_samples=5000)
    latent_1 = mod.get_latent_representation(n_samples=5000)

    save_path = os.path.join(save_path, "tmp")
    mod.save(save_path, overwrite=True, save_anndata=True)
    mod = AmortizedLDA.load(save_path)

    np.testing.assert_array_equal(mod.history_["elbo_train"], hist_elbo)
    feature_by_topic_2 = mod.get_feature_by_topic(n_samples=5000)
    latent_2 = mod.get_latent_representation(n_samples=5000)
    # posterior summaries are Monte Carlo estimates, so compare loosely
    np.testing.assert_almost_equal(
        feature_by_topic_1.to_numpy(), feature_by_topic_2.to_numpy(), decimal=2
    )
    np.testing.assert_almost_equal(latent_1.to_numpy(), latent_2.to_numpy(), decimal=2)
def test_data_splitter():
    """DataSplitter: index partition sizes with and without an explicit
    validation_size, and ValueError for out-of-range size arguments."""
    a = synthetic_iid()
    adata_manager = generic_setup_adata_manager(a, batch_key="batch", labels_key="labels")
    # test leaving validation_size empty works
    ds = DataSplitter(adata_manager, train_size=0.4)
    ds.setup()
    # check the number of indices
    _, _, _ = ds.train_dataloader(), ds.val_dataloader(), ds.test_dataloader()
    n_train_idx = len(ds.train_idx)
    n_validation_idx = len(ds.val_idx) if ds.val_idx is not None else 0
    n_test_idx = len(ds.test_idx) if ds.test_idx is not None else 0
    assert n_train_idx + n_validation_idx + n_test_idx == a.n_obs
    assert np.isclose(n_train_idx / a.n_obs, 0.4)
    # with validation_size omitted, the remainder goes to validation
    assert np.isclose(n_validation_idx / a.n_obs, 0.6)
    assert np.isclose(n_test_idx / a.n_obs, 0)

    # test test size
    ds = DataSplitter(adata_manager, train_size=0.4, validation_size=0.3)
    ds.setup()
    # check the number of indices
    _, _, _ = ds.train_dataloader(), ds.val_dataloader(), ds.test_dataloader()
    n_train_idx = len(ds.train_idx)
    n_validation_idx = len(ds.val_idx) if ds.val_idx is not None else 0
    n_test_idx = len(ds.test_idx) if ds.test_idx is not None else 0
    assert n_train_idx + n_validation_idx + n_test_idx == a.n_obs
    assert np.isclose(n_train_idx / a.n_obs, 0.4)
    assert np.isclose(n_validation_idx / a.n_obs, 0.3)
    # leftover fraction becomes the test split
    assert np.isclose(n_test_idx / a.n_obs, 0.3)

    # test that 0 < train_size <= 1
    with pytest.raises(ValueError):
        ds = DataSplitter(adata_manager, train_size=2)
        ds.setup()
        ds.train_dataloader()
    with pytest.raises(ValueError):
        ds = DataSplitter(adata_manager, train_size=-2)
        ds.setup()
        ds.train_dataloader()

    # test that 0 <= validation_size < 1
    with pytest.raises(ValueError):
        ds = DataSplitter(adata_manager, train_size=0.1, validation_size=1)
        ds.setup()
        ds.val_dataloader()
    with pytest.raises(ValueError):
        ds = DataSplitter(adata_manager, train_size=0.1, validation_size=-1)
        ds.setup()
        ds.val_dataloader()

    # test that train_size + validation_size <= 1
    with pytest.raises(ValueError):
        ds = DataSplitter(adata_manager, train_size=1, validation_size=0.1)
        ds.setup()
        ds.train_dataloader()
        ds.val_dataloader()
def test_solo(save_path):
    """SOLO doublet classifier built from a trained SCVI model, both on the
    model's own AnnData and on a second AnnData supplied explicitly."""
    latent_dim = 5
    train_adata = synthetic_iid(run_setup_anndata=False)
    setup_anndata(train_adata)
    scvi_model = SCVI(train_adata, n_latent=latent_dim)
    scvi_model.train(1, check_val_every_n_epoch=1, train_size=0.5)

    # SOLO trained on the SCVI model's own AnnData
    solo = SOLO.from_scvi_model(scvi_model)
    solo.train(1, check_val_every_n_epoch=1, train_size=0.9)
    assert "validation_loss" in solo.history.keys()
    solo.predict()

    # SOLO on a second AnnData
    other_adata = synthetic_iid(run_setup_anndata=False)
    solo = SOLO.from_scvi_model(scvi_model, other_adata)
    solo.train(1, check_val_every_n_epoch=1, train_size=0.9)
    assert "validation_loss" in solo.history.keys()
    solo.predict()
def test_data_format():
    """Setup coerces dense arrays to C-contiguous layout without altering any
    values, for both ndarray and DataFrame obsm protein expression."""
    # if data was dense np array, check after setup_anndata, data is C_CONTIGUOUS
    adata = synthetic_iid()
    old_x = adata.X
    old_pro = adata.obsm["protein_expression"]
    old_obs = adata.obs
    # deliberately force Fortran (column-major) layout
    adata.X = np.asfortranarray(old_x)
    adata.obsm["protein_expression"] = np.asfortranarray(old_pro)
    assert adata.X.flags["C_CONTIGUOUS"] is False
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is False

    adata_manager = generic_setup_adata_manager(
        adata, protein_expression_obsm_key="protein_expression")
    assert adata.X.flags["C_CONTIGUOUS"] is True
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is True

    # values and registry views are unchanged by the layout conversion
    assert np.array_equal(old_x, adata.X)
    assert np.array_equal(old_pro, adata.obsm["protein_expression"])
    assert np.array_equal(old_obs, adata.obs)
    assert np.array_equal(adata.X, adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY))
    assert np.array_equal(
        adata.obsm["protein_expression"],
        adata_manager.get_from_registry(REGISTRY_KEYS.PROTEIN_EXP_KEY),
    )

    # if obsm is dataframe, make it C_CONTIGUOUS if it isnt
    adata = synthetic_iid()
    pe = np.asfortranarray(adata.obsm["protein_expression"])
    adata.obsm["protein_expression"] = pd.DataFrame(pe, index=adata.obs_names)
    assert adata.obsm["protein_expression"].to_numpy(
    ).flags["C_CONTIGUOUS"] is False
    adata_manager = generic_setup_adata_manager(
        adata, protein_expression_obsm_key="protein_expression")
    new_pe = adata_manager.get_from_registry(REGISTRY_KEYS.PROTEIN_EXP_KEY)
    assert new_pe.to_numpy().flags["C_CONTIGUOUS"] is True
    assert np.array_equal(pe, new_pe)
    assert np.array_equal(adata.X, adata_manager.get_from_registry(REGISTRY_KEYS.X_KEY))
    assert np.array_equal(
        adata.obsm["protein_expression"],
        adata_manager.get_from_registry(REGISTRY_KEYS.PROTEIN_EXP_KEY),
    )
def test_saving_and_loading(save_path):
    """Legacy save/load round trips for SCVI, LinearSCVI, TOTALVI, AUTOZI and
    SCANVI, checking key outputs are reproduced after reload."""

    def test_save_load_model(cls, adata, save_path):
        # one round trip for a latent-variable model class: latents and
        # validation indices must be identical after reload
        model = cls(adata, latent_distribution="normal")
        model.train(1, train_size=0.2)
        z1 = model.get_latent_representation(adata)
        test_idx1 = model.validation_indices
        model.save(save_path, overwrite=True, save_anndata=True)
        model = cls.load(save_path)
        model.get_latent_representation()
        # loading with a gene-count mismatch must fail
        tmp_adata = scvi.data.synthetic_iid(n_genes=200)
        with pytest.raises(ValueError):
            cls.load(save_path, tmp_adata)
        model = cls.load(save_path, adata)
        z2 = model.get_latent_representation()
        test_idx2 = model.validation_indices
        np.testing.assert_array_equal(z1, z2)
        np.testing.assert_array_equal(test_idx1, test_idx2)
        assert model.is_trained is True

    save_path = os.path.join(save_path, "tmp")
    adata = synthetic_iid()

    for cls in [SCVI, LinearSCVI, TOTALVI]:
        print(cls)
        test_save_load_model(cls, adata, save_path)

    # AUTOZI
    model = AUTOZI(adata, latent_distribution="normal")
    model.train(1, train_size=0.5)
    ab1 = model.get_alphas_betas()
    model.save(save_path, overwrite=True, save_anndata=True)
    model = AUTOZI.load(save_path)
    model.get_latent_representation()
    tmp_adata = scvi.data.synthetic_iid(n_genes=200)
    with pytest.raises(ValueError):
        AUTOZI.load(save_path, tmp_adata)
    model = AUTOZI.load(save_path, adata)
    # AUTOZI-specific parameters must survive the round trip
    ab2 = model.get_alphas_betas()
    np.testing.assert_array_equal(ab1["alpha_posterior"], ab2["alpha_posterior"])
    np.testing.assert_array_equal(ab1["beta_posterior"], ab2["beta_posterior"])
    assert model.is_trained is True

    # SCANVI
    model = SCANVI(adata, "label_0")
    model.train(max_epochs=1, train_size=0.5)
    p1 = model.predict()
    model.save(save_path, overwrite=True, save_anndata=True)
    model = SCANVI.load(save_path)
    model.get_latent_representation()
    tmp_adata = scvi.data.synthetic_iid(n_genes=200)
    with pytest.raises(ValueError):
        SCANVI.load(save_path, tmp_adata)
    model = SCANVI.load(save_path, adata)
    # predictions must be identical after reload
    p2 = model.predict()
    np.testing.assert_array_equal(p1, p2)
    assert model.is_trained is True