def _load_seqfishplus(
    save_path: str = "data/",
    tissue_region: str = "subventricular cortex",
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Download and load the seqFISH+ dataset for a given tissue region.

    Parameters
    ----------
    save_path
        Directory used to download/cache the data.
    tissue_region
        One of ``"subventricular cortex"`` or ``"olfactory bulb"``.
    run_setup_anndata
        If True, runs ``setup_anndata`` on the dataset before returning.

    Returns
    -------
    AnnData with placeholder ``batch`` and ``labels`` obs columns.

    Raises
    ------
    ValueError
        If ``tissue_region`` is not one of the two supported regions.
    """
    if tissue_region == "subventricular cortex":
        file_prefix = "cortex_svz"
    elif tissue_region == "olfactory bulb":
        file_prefix = "ob"
    else:
        # Fixed: message previously said `tissue_type`, which is not the
        # name of the parameter being validated.
        raise ValueError(
            '`tissue_region` must be "subventricular cortex" or '
            '"olfactory bulb", but got {}'.format(tissue_region)
        )
    save_path = os.path.abspath(save_path)
    url = "https://github.com/CaiGroup/seqFISH-PLUS/raw/master/sourcedata.zip"
    save_fn = "seqfishplus.zip"
    _download(url, save_path, save_fn)
    adata = _load_seqfishplus_data(
        os.path.join(save_path, save_fn), file_prefix, save_path, gene_by_cell=False
    )
    # Single-batch, unlabeled dataset: fill in placeholder annotations.
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_annotation_simulation(
    name: str, save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Simulated datasets for scANVI tutorials

    name
        One of "1", "2", or "3"
    """
    save_path = os.path.abspath(save_path)
    filename = "simulation_{}.loom".format(name)
    url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/" + filename
    _download(url, save_path, filename)
    adata = _load_loom(os.path.join(save_path, filename))
    # Rename the simulation's annotations to the standard column names.
    adata.obs["labels"] = adata.obs.ClusterID.values
    del adata.obs["ClusterID"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def test_extra_covariates_transfer():
    """Covariate setup transfers to a target AnnData, including unseen categories."""
    adata = synthetic_iid()
    n_obs = adata.shape[0]
    for key in ("cont1", "cont2"):
        adata.obs[key] = np.random.normal(size=(n_obs,))
    for key in ("cat1", "cat2"):
        adata.obs[key] = np.random.randint(0, 5, size=(n_obs,))
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    bdata = synthetic_iid()
    bdata.obs["cont1"] = np.random.normal(size=(bdata.shape[0],))
    bdata.obs["cont2"] = np.random.normal(size=(bdata.shape[0],))
    bdata.obs["cat1"] = 0
    bdata.obs["cat2"] = 1
    transfer_anndata_setup(adata_source=adata, adata_target=bdata)

    # give it a new category
    del bdata.uns["_scvi"]
    bdata.obs["cat1"] = 6
    transfer_anndata_setup(
        adata_source=adata, adata_target=bdata, extend_categories=True
    )
    # the unseen category (6) is appended to the end of the mapping
    assert bdata.uns["_scvi"]["extra_categoricals"]["mappings"]["cat1"][-1] == 6
def test_solo(save_path):
    """Smoke-test SOLO doublet detection built from a trained SCVI model."""
    adata = synthetic_iid(run_setup_anndata=False)
    setup_anndata(adata)
    vae = SCVI(adata, n_latent=5)
    vae.train(1, check_val_every_n_epoch=1, train_size=0.5)
    solo = SOLO.from_scvi_model(vae)
    solo.train(1, check_val_every_n_epoch=1, train_size=0.9)
    # validation metrics must have been recorded during training
    assert "validation_loss" in solo.history.keys()
    solo.predict()
def test_linear_scvi(save_path):
    """Train LinearSCVI briefly; exercise loadings and differential expression."""
    adata = synthetic_iid()[:, :10].copy()
    setup_anndata(adata)
    model = LinearSCVI(adata, n_latent=10)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    # one epoch -> exactly one train and one validation ELBO entry
    assert len(model.history["elbo_train"]) == 1
    assert len(model.history["elbo_validation"]) == 1
    model.get_loadings()
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1", group2="label_2")
def test_linear_scvi():
    """LinearSCVI legacy-API smoke test on a 10-gene subset."""
    adata = synthetic_iid()[:, :10].copy()
    setup_anndata(adata)
    model = LinearSCVI(adata, n_latent=10)
    model.train(1, frequency=1, train_size=0.5)
    # metrics run once per epoch plus once at the very end of training
    assert len(model.history["elbo_train_set"]) == 2
    assert len(model.history["elbo_test_set"]) == 2
    model.get_loadings()
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1", group2="label_2")
def test_solo_multiple_batch(save_path):
    """SOLO can be restricted to a single batch of a multi-batch SCVI model."""
    adata = synthetic_iid()
    setup_anndata(adata, batch_key="batch")
    vae = SCVI(adata, n_latent=5)
    vae.train(1, check_val_every_n_epoch=1, train_size=0.5)
    solo = SOLO.from_scvi_model(vae, restrict_to_batch="batch_0")
    solo.train(1, check_val_every_n_epoch=1, train_size=0.9)
    assert "validation_loss" in solo.history.keys()
    solo.predict()
def _load_seqfish(save_path: str = "data/", run_setup_anndata: bool = True) -> anndata.AnnData:
    """Download and load the SeqFISH dataset as an AnnData object."""
    save_path = os.path.abspath(save_path)
    save_fn = "SeqFISH.xlsx"
    _download(
        "https://www.cell.com/cms/attachment/2080562255/2072099886/mmc6.xlsx",
        save_path,
        save_fn,
    )
    adata = _load_seqfish_data(os.path.join(save_path, save_fn))
    n_cells = adata.shape[0]
    # single batch / no labels: fill in placeholder annotations
    adata.obs["batch"] = np.zeros(n_cells, dtype=np.int64)
    adata.obs["labels"] = np.zeros(n_cells, dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def test_backed_anndata_scvi(save_path):
    """SCVI trains and runs inference on a disk-backed AnnData."""
    adata = scvi.data.synthetic_iid()
    backed_path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(backed_path)
    adata = anndata.read_h5ad(backed_path, backed="r+")
    setup_anndata(adata, batch_key="batch")
    model = SCVI(adata, n_latent=5)
    model.train(1, train_size=0.5)
    assert model.is_trained is True
    latent = model.get_latent_representation()
    assert latent.shape == (adata.shape[0], 5)
    model.get_elbo()
def test_view_anndata_setup(save_path):
    """view_anndata_setup accepts an AnnData, a setup dict, a model folder, or an h5ad path."""
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    # extreme values exercise the summary's number formatting
    adata.obs["cont1"][0] = 939543895847598301.423432423523512351234123421341234
    adata.obs["cont2"][1] = 0.12938471298374691827634
    adata.obs["cat1"] = np.random.randint(0, 5, adata.n_obs).astype(str)
    adata.obs["cat1"][8] = "asdf"
    adata.obs["cat1"][9] = "f34"
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)
    setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )
    # test it works with adata
    view_anndata_setup(adata)

    # test it works with scvi setup dict
    view_anndata_setup(adata.uns["_scvi"])

    adata = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(adata)
    folder_path = os.path.join(save_path, "tmp")
    m.save(folder_path, save_anndata=True)

    # test it works with a saved model folder
    view_anndata_setup(folder_path)
    adata_path = os.path.join(folder_path, "adata.h5ad")
    # test it works with the path to an anndata
    view_anndata_setup(adata_path)

    m = scvi.model.SCVI(adata)
    m.save(folder_path, overwrite=True)
    # test it works without saving the anndata
    view_anndata_setup(folder_path)

    # test it throws error if adata was not setup
    with pytest.raises(ValueError):
        adata = synthetic_iid(run_setup_anndata=False)
        view_anndata_setup(adata)

    # test it throws error if we dont pass dict, anndata or str in
    with pytest.raises(ValueError):
        view_anndata_setup(0)
def test_linear_scvi():
    """LinearSCVI works with adata.raw; loadings are indexed by raw var names."""
    # test using raw
    adata = synthetic_iid()
    adata.raw = adata
    adata = adata[:, :10].copy()
    setup_anndata(adata, use_raw=True)
    model = LinearSCVI(adata, n_latent=10)
    model.train(1)
    loadings = model.get_loadings()
    # loadings are reported for the raw (full) gene set, not the subset
    pd.testing.assert_index_equal(loadings.index, adata.raw.var_names)
    model.differential_expression(groupby="labels", group1="undefined_1")
    model.differential_expression(
        groupby="labels", group1="undefined_1", group2="undefined_2"
    )
def test_scanvi_online_update(save_path):
    """SCANVI query-data surgery with semi- and fully-observed reference labels."""
    # ref has semi-observed labels
    n_latent = 5
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata1.obs["labels"] = pd.Categorical(new_labels)
    setup_anndata(adata1, batch_key="batch", labels_key="labels")
    model = SCANVI(adata1, "Unknown", n_latent=n_latent, encode_covariates=True)
    model.train(n_epochs_unsupervised=1, n_epochs_semisupervised=1, frequency=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # query data has disjoint batch categories and only unknown labels
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])
    adata2.obs["labels"] = "Unknown"
    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    model.train(
        n_epochs_unsupervised=1, n_epochs_semisupervised=1, train_base_model=False
    )
    model.get_latent_representation()
    model.predict()

    # ref has fully-observed labels
    n_latent = 5
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    adata1.obs["labels"] = pd.Categorical(new_labels)
    setup_anndata(adata1, batch_key="batch", labels_key="labels")
    model = SCANVI(adata1, "Unknown", n_latent=n_latent, encode_covariates=True)
    model.train(n_epochs_unsupervised=1, n_epochs_semisupervised=1, frequency=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # query has one new label
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(["batch_2", "batch_3"])
    new_labels = adata2.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata2.obs["labels"] = pd.Categorical(new_labels)
    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    # treat every query cell as unlabeled for the fine-tuning phase
    model._unlabeled_indices = np.arange(adata2.n_obs)
    model._labeled_indices = []
    model.train(
        n_epochs_unsupervised=1, n_epochs_semisupervised=1, train_base_model=False
    )
    model.get_latent_representation()
    model.predict()
def from_scvi_model(cls, scvi_model: SCVI, adata: Optional[AnnData] = None):
    """
    Instantiate a SOLO model from an scvi model.

    Parameters
    ----------
    scvi_model
        Pre-trained model of :class:`~scvi.model.SCVI`. This model
        should have been trained on data comprising one lane. The
        adata object used to initialize this model should have only
        been setup with count data, i.e., no `batch_key`,
        `labels_key`, etc.
    adata
        Optional anndata to use that is compatible with scvi_model.

    Returns
    -------
    SOLO model
    """
    _validate_scvi_model(scvi_model)
    doublet_adata = cls.create_doublets(scvi_model.adata)

    # if model is using observed lib size, needs to get lib sample
    # which is just observed lib size on log scale
    give_mean_lib = not scvi_model.module.use_observed_lib_size

    # get latent representations and make input anndata
    latent_rep = scvi_model.get_latent_representation()
    lib_size = scvi_model.get_latent_library_size(give_mean=give_mean_lib)
    latent_adata = AnnData(np.concatenate([latent_rep, lib_size], axis=1))
    latent_adata.obs[LABELS_KEY] = "singlet"

    logger.info("Creating doublets, preparing SOLO model.")
    f = io.StringIO()
    # setup_anndata prints a summary; capture stdout to keep logs quiet
    with redirect_stdout(f):
        setup_anndata(doublet_adata)
        doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata)
        doublet_lib_size = scvi_model.get_latent_library_size(
            doublet_adata, give_mean=give_mean_lib
        )
        doublet_adata = AnnData(
            np.concatenate([doublet_latent_rep, doublet_lib_size], axis=1)
        )
        doublet_adata.obs[LABELS_KEY] = "doublet"

        # combined singlet/doublet latent dataset is the SOLO classifier input
        # NOTE(review): source indentation is ambiguous here — concatenation
        # is assumed to be inside the redirect_stdout block; confirm upstream.
        full_adata = latent_adata.concatenate(doublet_adata)
        setup_anndata(full_adata, labels_key=LABELS_KEY)
    return cls(full_adata)
def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Download and load the osmFISH mouse somatosensory cortex dataset."""
    save_path = os.path.abspath(save_path)
    save_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    _download(
        "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom",
        save_path,
        save_fn,
    )
    adata = _load_smfish_data(
        os.path.join(save_path, save_fn),
        use_high_level_cluster=use_high_level_cluster,
    )
    # all cells come from a single batch
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata
def test_scvi_sparse(save_path):
    """SCVI accepts sparse (CSR) count matrices end to end."""
    n_latent = 5
    adata = synthetic_iid(run_setup_anndata=False)
    adata.X = csr_matrix(adata.X)
    setup_anndata(adata)
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, train_size=0.5)
    assert model.is_trained is True
    embedding = model.get_latent_representation()
    assert embedding.shape == (adata.shape[0], n_latent)
    # exercise the main inference entry points on sparse input
    model.get_elbo()
    model.get_marginal_ll(n_mc_samples=3)
    model.get_reconstruction_error()
    model.get_normalized_expression()
    model.differential_expression(groupby="labels", group1="label_1")
def _load_heart_cell_atlas_subsampled(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_nuisance_clusters: bool = True,
):
    """
    Combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.

    Dataset was filtered down randomly to 20k cells using :func:`~scanpy.pp.subsample`.
    The original data can be sourced from https://www.heartcellatlas.org/#DataSources.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning
    remove_nuisance_clusters
        Remove doublets and unassigned cells

    Returns
    -------
    AnnData

    Notes
    -----
    The data were filtered using the following sequence::

        >>> adata = anndata.read_h5ad(path_to_anndata)
        >>> bdata = sc.pp.subsample(adata, n_obs=20000, copy=True)
        >>> sc.pp.filter_genes(bdata, min_counts=3)
        >>> bdata.write_h5ad(path, compression="gzip")
    """
    # Resolve to an absolute path, consistent with the other loaders.
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/blob/master/hca_subsampled_20k.h5ad?raw=true"
    save_fn = "hca_subsampled_20k.h5ad"
    _download(url, save_path, save_fn)
    dataset = anndata.read_h5ad(os.path.join(save_path, save_fn))
    if remove_nuisance_clusters:
        # drop doublets and cells without a cell-type assignment
        remove = ["doublets", "NotAssigned"]
        keep = [c not in remove for c in dataset.obs.cell_type.values]
        dataset = dataset[keep, :].copy()
    if run_setup_anndata:
        setup_anndata(dataset)
    return dataset
def test_extra_covariates():
    """Continuous covariates are copied verbatim into obsm during setup."""
    adata = synthetic_iid()
    n_obs = adata.shape[0]
    for key in ("cont1", "cont2"):
        adata.obs[key] = np.random.normal(size=(n_obs,))
    for key in ("cat1", "cat2"):
        adata.obs[key] = np.random.randint(0, 5, size=(n_obs,))
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    registered = adata.obsm["_scvi_extra_continuous"]
    expected = adata.obs[["cont1", "cont2"]]
    pd.testing.assert_frame_equal(registered, expected)
def _load_retina(save_path: str = "data/", run_setup_anndata: bool = True) -> AnnData:
    """\
    Loads retina dataset

    The dataset of bipolar cells contains after their original pipeline for filtering 27,499 cells
    and 13,166 genes coming from two batches. We use the cluster annotation from 15 cell-types from
    the author. We also extract their normalized data with Combat and use it for benchmarking.
    """
    save_path = os.path.abspath(save_path)
    save_fn = "retina.loom"
    _download(
        "https://github.com/YosefLab/scVI-data/raw/master/retina.loom",
        save_path,
        save_fn,
    )
    adata = _load_loom(os.path.join(save_path, save_fn))
    # author-provided cell-type annotation, indexed by ClusterID
    cell_types = [
        "RBC", "MG", "BC5A", "BC7", "BC6", "BC5C", "BC1A", "BC3B",
        "BC1B", "BC2", "BC5D", "BC3A", "BC5B", "BC4", "BC8_9",
    ]
    cluster_ids = adata.obs["ClusterID"].values.astype(int).ravel()
    adata.obs["labels"] = [cell_types[i] for i in cluster_ids]
    del adata.obs["ClusterID"]
    adata.obs["batch"] = pd.Categorical(adata.obs["BatchID"].values.copy())
    del adata.obs["BatchID"]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_frontalcortex_dropseq(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """Download and load the frontal-cortex Drop-seq dataset."""
    save_path = os.path.abspath(save_path)
    save_fn = "fc-dropseq.loom"
    _download(
        "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom",
        save_path,
        save_fn,
    )
    adata = _load_loom(os.path.join(save_path, save_fn))
    # for this dataset the cluster assignment is treated as the batch
    adata.obs["batch"] = adata.obs["Clusters"]
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    # reorder labels such that layers of the cortex are in order
    # order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
    # self.reorder_cell_types(self.cell_types[order_labels])

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def test_saving(save_path):
    """An AnnData processed by setup_anndata round-trips through h5ad I/O."""
    save_path = os.path.join(save_path, "tmp_adata.h5ad")
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cat1"] = np.random.randint(0, 3, adata.n_obs).astype(str)
    # use .loc instead of chained indexing (adata.obs["cat1"][1] = ...),
    # which is unreliable under pandas copy-on-write
    adata.obs.loc[adata.obs.index[1], "cat1"] = "asdf"
    adata.obs.loc[adata.obs.index[2], "cat1"] = "f34"
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)
    setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )
    adata.write(save_path)
    # `anndata.read` is a deprecated alias of `read_h5ad`
    anndata.read_h5ad(save_path)
def test_data_format():
    """setup_anndata must coerce arrays to C-contiguous layout without altering values."""
    # if data was dense np array, check after setup_anndata, data is C_CONTIGUOUS
    adata = synthetic_iid(run_setup_anndata=False)
    old_x = adata.X
    old_pro = adata.obsm["protein_expression"]
    old_obs = adata.obs
    # force Fortran (column-major) layout to trigger the conversion path
    adata.X = np.asfortranarray(old_x)
    adata.obsm["protein_expression"] = np.asfortranarray(old_pro)
    assert adata.X.flags["C_CONTIGUOUS"] is False
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is False

    setup_anndata(adata, protein_expression_obsm_key="protein_expression")
    assert adata.X.flags["C_CONTIGUOUS"] is True
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is True

    # values and metadata must be untouched by the layout conversion
    assert np.array_equal(old_x, adata.X)
    assert np.array_equal(old_pro, adata.obsm["protein_expression"])
    assert np.array_equal(old_obs, adata.obs)

    assert np.array_equal(adata.X, get_from_registry(adata, _CONSTANTS.X_KEY))
    assert np.array_equal(
        adata.obsm["protein_expression"],
        get_from_registry(adata, _CONSTANTS.PROTEIN_EXP_KEY),
    )

    # if obsm is dataframe, make it C_CONTIGUOUS if it isnt
    adata = synthetic_iid()
    pe = np.asfortranarray(adata.obsm["protein_expression"])
    adata.obsm["protein_expression"] = pd.DataFrame(pe, index=adata.obs_names)
    assert adata.obsm["protein_expression"].to_numpy().flags["C_CONTIGUOUS"] is False
    setup_anndata(adata, protein_expression_obsm_key="protein_expression")
    new_pe = get_from_registry(adata, "protein_expression")
    assert new_pe.to_numpy().flags["C_CONTIGUOUS"] is True
    assert np.array_equal(pe, new_pe)
    assert np.array_equal(adata.X, get_from_registry(adata, _CONSTANTS.X_KEY))
    assert np.array_equal(
        adata.obsm["protein_expression"],
        get_from_registry(adata, _CONSTANTS.PROTEIN_EXP_KEY),
    )
def _load_prefrontalcortex_starmap(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Loads a starMAP dataset of 3,704 cells and 166 genes from the mouse pre-frontal cortex
    (Wang et al., 2018)
    """
    save_path = os.path.abspath(save_path)
    save_fn = "mpfc-starmap.loom"
    _download(
        "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom",
        save_path,
        save_fn,
    )
    adata = _load_loom(os.path.join(save_path, save_fn))
    # standardize annotation column names
    adata.obs["labels"] = adata.obs.Clusters.values
    del adata.obs["Clusters"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    # expose the spatial coordinates as plain obs columns
    coords = adata.obsm["Spatial_coordinates"]
    adata.obs["x_coord"] = coords[:, 0]
    adata.obs["y_coord"] = coords[:, 1]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def test_backed_anndata(save_path):
    """AnnTorchDataset supports disk-backed AnnData, dense and sparse."""
    # dense
    adata = scvi.data.synthetic_iid()
    path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    setup_anndata(adata, batch_key="batch")
    dataset = AnnTorchDataset(adata)
    dataset[np.arange(adata.n_obs)]  # fetch every cell through __getitem__

    # sparse
    adata = scvi.data.synthetic_iid()
    adata.X = csr_matrix(adata.X)
    path = os.path.join(save_path, "test_data2.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    setup_anndata(adata, batch_key="batch")
    dataset = AnnTorchDataset(adata)
    dataset[np.arange(adata.n_obs)]
def _generate_synthetic(
    batch_size: int = 128,
    n_genes: int = 100,
    n_proteins: int = 100,
    n_batches: int = 2,
    n_labels: int = 3,
    run_setup_anndata: bool = True,
) -> AnnData:
    """Build a random negative-binomial AnnData with batches, labels and protein counts."""
    n_obs = batch_size * n_batches
    counts = np.random.negative_binomial(5, 0.3, size=(n_obs, n_genes))
    dropout_mask = np.random.binomial(n=1, p=0.7, size=(n_obs, n_genes))
    counts = counts * dropout_mask

    # We put the batch index first
    label_ids = np.random.randint(0, n_labels, size=(n_obs,))
    label_names = np.array(["label_%d" % i for i in label_ids])
    batch_names = [
        "batch_{}".format(b) for b in range(n_batches) for _ in range(batch_size)
    ]

    adata = AnnData(counts)
    adata.obs["batch"] = pd.Categorical(batch_names)
    adata.obs["labels"] = pd.Categorical(label_names)

    # Protein measurements
    protein_counts = np.random.negative_binomial(
        5, 0.3, size=(adata.shape[0], n_proteins)
    )
    adata.obsm["protein_expression"] = protein_counts
    adata.uns["protein_names"] = np.arange(n_proteins).astype(str)

    if run_setup_anndata:
        setup_anndata(
            adata,
            batch_key="batch",
            labels_key="labels",
            protein_expression_obsm_key="protein_expression",
            protein_names_uns_key="protein_names",
        )
    return adata
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: List[str] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """
    Load the purified PBMC dataset, optionally restricted to a subset of cell types.

    Parameters
    ----------
    save_path
        Directory used to download/cache the data.
    subset_datasets
        Optional list of purified-population names to keep; each must be
        one of the dataset's known cell types.
    run_setup_anndata
        If True, runs ``setup_anndata`` before returning.

    Returns
    -------
    AnnData

    Raises
    ------
    ValueError
        If a requested subset name is not a known cell type.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read(path_to_file)
    # fixed: "cd4_t_helper" was listed twice
    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            # raise instead of `assert`: asserts are stripped under `python -O`
            if dataset not in dataset_names:
                raise ValueError(
                    "{} is not a valid dataset name; must be one of {}".format(
                        dataset, dataset_names
                    )
                )
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def test_multiple_covariates(save_path):
    """SCVI, SCANVI and TOTALVI all train with extra cat/cont covariates."""
    adata = synthetic_iid()
    n_obs = adata.shape[0]
    for key in ("cont1", "cont2"):
        adata.obs[key] = np.random.normal(size=(n_obs,))
    for key in ("cat1", "cat2"):
        adata.obs[key] = np.random.randint(0, 5, size=(n_obs,))
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=["cont1", "cont2"],
        categorical_covariate_keys=["cat1", "cat2"],
    )
    SCVI(adata).train(1)
    SCANVI(adata, unlabeled_category="Unknown").train(1)
    TOTALVI(adata).train(1)
def test_scvi(save_path):
    """End-to-end smoke test of SCVI: training, inference, setup transfer, DE, callbacks."""
    n_latent = 5
    adata = synthetic_iid()
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model = SCVI(adata, n_latent=n_latent, var_activation=Softplus())
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    # tests __repr__
    print(model)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    assert len(model.history["elbo_train"]) == 1
    model.get_elbo()
    model.get_marginal_ll(n_mc_samples=3)
    model.get_reconstruction_error()
    model.get_normalized_expression(transform_batch="batch_1")

    # inference on a second, independently-created dataset
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2, n_mc_samples=3)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch="batch_1"
    )
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=["batch_0", "batch_1"]
    )
    assert denoised.shape == (3, adata2.n_vars)
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)
    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
        transform_batch=["batch_0", "batch_1"],
    )
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape == params["dispersions"].shape == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"]["mapping"] = np.array(
        ["label_4", "label_0", "label_2"]
    )
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test that same mapping different order doesn't raise error
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"]["mapping"] = np.array(
        ["label_1", "label_0", "label_2"]
    )
    model.get_elbo(adata2)  # should automatically transfer setup

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs.labels.cat.rename_categories(["a", "b", "c"], inplace=True)
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels", group1="label_1", group2="label_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])

    # transform batch works with all different types
    a = synthetic_iid(run_setup_anndata=False)
    batch = np.zeros(a.n_obs)
    batch[:64] += 1
    a.obs["batch"] = batch
    setup_anndata(a, batch_key="batch")
    m = SCVI(a)
    m.train(1, train_size=0.5)
    m.get_normalized_expression(transform_batch=1)
    m.get_normalized_expression(transform_batch=[0, 1])

    # test get_likelihood_parameters() when dispersion=='gene-cell'
    model = SCVI(adata, dispersion="gene-cell")
    model.get_likelihood_parameters()

    # test train callbacks work
    a = synthetic_iid()
    m = scvi.model.SCVI(a)
    lr_monitor = LearningRateMonitor()
    m.train(
        callbacks=[lr_monitor],
        max_epochs=10,
        log_every_n_steps=1,
        plan_kwargs={"reduce_lr_on_plateau": True},
    )
    assert "lr-Adam" in m.history.keys()
def test_scvi():
    """End-to-end smoke test of SCVI (legacy training API)."""
    n_latent = 5
    adata = synthetic_iid()
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, frequency=1, train_size=0.5)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    # len of history should be 2 since metrics is also run once at the very end after training
    assert len(model.history["elbo_train_set"]) == 2
    model.get_elbo()
    model.get_marginal_ll()
    model.get_reconstruction_error()
    model.get_normalized_expression(transform_batch="batch_1")

    # inference on a second, independently-created dataset
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch="batch_1"
    )
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=["batch_0", "batch_1"]
    )
    assert denoised.shape == (3, adata2.n_vars)
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)
    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
        transform_batch=["batch_0", "batch_1"],
    )
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape == params["dispersions"].shape == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"]["mapping"] = np.array(
        ["label_1", "label_0", "label_2"]
    )
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs.labels.cat.rename_categories(["a", "b", "c"], inplace=True)
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels", group1="label_1", group2="label_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])

    # transform batch works with all different types
    a = synthetic_iid(run_setup_anndata=False)
    batch = np.zeros(a.n_obs)
    batch[:64] += 1
    a.obs["batch"] = batch
    setup_anndata(a, batch_key="batch")
    m = SCVI(a)
    m.train(1, train_size=0.5)
    m.get_normalized_expression(transform_batch=1)
    m.get_normalized_expression(transform_batch=[0, 1])
def test_transfer_anndata_setup():
    """transfer_anndata_setup copies registry info and category mappings to a target AnnData."""
    # test transfer_anndata function
    adata1 = synthetic_iid(run_setup_anndata=False)
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.X = adata1.X
    setup_anndata(adata1)
    transfer_anndata_setup(adata1, adata2)
    np.testing.assert_array_equal(
        adata1.obs["_scvi_local_l_mean"], adata2.obs["_scvi_local_l_mean"]
    )

    # test if layer was used initially, again used in transfer setup
    adata1 = synthetic_iid(run_setup_anndata=False)
    adata2 = synthetic_iid(run_setup_anndata=False)
    raw_counts = adata1.X.copy()
    adata1.layers["raw"] = raw_counts
    adata2.layers["raw"] = raw_counts
    zeros = np.zeros_like(adata1.X)
    ones = np.ones_like(adata1.X)
    adata1.X = zeros
    adata2.X = ones
    setup_anndata(adata1, layer="raw")
    transfer_anndata_setup(adata1, adata2)
    np.testing.assert_array_equal(
        adata1.obs["_scvi_local_l_mean"], adata2.obs["_scvi_local_l_mean"]
    )

    # test that an unknown batch throws an error
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = [2] * adata2.n_obs
    with pytest.raises(ValueError):
        transfer_anndata_setup(adata1, adata2)

    # TODO: test that a batch with wrong dtype throws an error
    # adata1 = synthetic_iid()
    # adata2 = synthetic_iid(run_setup_anndata=False)
    # adata2.obs["batch"] = ["0"] * adata2.n_obs
    # with pytest.raises(ValueError):
    #     transfer_anndata_setup(adata1, adata2)

    # test that an unknown label throws an error
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["labels"] = ["label_123"] * adata2.n_obs
    with pytest.raises(ValueError):
        transfer_anndata_setup(adata1, adata2)

    # test that correct mapping was applied
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["labels"] = ["label_1"] * adata2.n_obs
    transfer_anndata_setup(adata1, adata2)
    labels_mapping = adata1.uns["_scvi"]["categorical_mappings"]["_scvi_labels"][
        "mapping"
    ]
    correct_label = np.where(labels_mapping == "label_1")[0][0]
    # BUG FIX: this was a bare comparison (a no-op); it must be asserted.
    assert adata2.obs["_scvi_labels"][0] == correct_label

    # test that transfer_anndata_setup correctly looks for adata.obs['batch']
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    del adata2.obs["batch"]
    with pytest.raises(KeyError):
        transfer_anndata_setup(adata1, adata2)

    # test that transfer_anndata_setup assigns same batch and label to cells
    # if the original anndata was also same batch and label
    adata1 = synthetic_iid(run_setup_anndata=False)
    setup_anndata(adata1)
    adata2 = synthetic_iid(run_setup_anndata=False)
    del adata2.obs["batch"]
    transfer_anndata_setup(adata1, adata2)
    assert adata2.obs["_scvi_batch"][0] == 0
    assert adata2.obs["_scvi_labels"][0] == 0
def test_scvidataset_getitem():
    """ScviDataset.__getitem__ returns numpy arrays for all registered tensors."""
    adata = synthetic_iid()
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
    )
    # check that we can successfully pass in a list of tensors to get
    tensors_to_get = ["batch_indices", "local_l_var"]
    bd = ScviDataset(adata, getitem_tensors=tensors_to_get)
    np.testing.assert_array_equal(tensors_to_get, list(bd[1].keys()))

    # check that we can successfully pass in a dict of tensors and their associated types
    # BUG FIX: `np.int` was removed in NumPy 1.24; use `np.int64`, which is
    # also the dtype the assertion below expects.
    bd = ScviDataset(adata, getitem_tensors={"X": np.int64, "local_l_var": np.float64})
    assert bd[1]["X"].dtype == np.int64
    assert bd[1]["local_l_var"].dtype == np.float64

    # check that by default we get all the registered tensors
    bd = ScviDataset(adata)
    all_registered_tensors = list(adata.uns["_scvi"]["data_registry"].keys())
    np.testing.assert_array_equal(all_registered_tensors, list(bd[1].keys()))
    assert bd[1]["X"].shape[0] == bd.n_vars

    # check that ScviDataset returns numpy array
    adata1 = synthetic_iid()
    bd = ScviDataset(adata1)
    for value in bd[1].values():
        assert type(value) == np.ndarray

    # check ScviDataset returns numpy array counts were sparse
    adata = synthetic_iid(run_setup_anndata=False)
    adata.X = sparse.csr_matrix(adata.X)
    setup_anndata(adata)
    bd = ScviDataset(adata)
    for value in bd[1].values():
        assert type(value) == np.ndarray

    # check ScviDataset returns numpy array if pro exp was sparse
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obsm["protein_expression"] = sparse.csr_matrix(
        adata.obsm["protein_expression"]
    )
    setup_anndata(
        adata, batch_key="batch", protein_expression_obsm_key="protein_expression"
    )
    bd = ScviDataset(adata)
    for value in bd[1].values():
        assert type(value) == np.ndarray

    # check pro exp is being returned as numpy array even if its DF
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obsm["protein_expression"] = pd.DataFrame(
        adata.obsm["protein_expression"], index=adata.obs_names
    )
    setup_anndata(
        adata, batch_key="batch", protein_expression_obsm_key="protein_expression"
    )
    bd = ScviDataset(adata)
    for value in bd[1].values():
        assert type(value) == np.ndarray