def __init__(
    self,
    adata_seq: AnnData,
    adata_spatial: AnnData,
    generative_distributions: List = None,
    model_library_size: List = None,
    n_latent: int = 10,
    use_gpu: bool = True,
    **model_kwargs,
):
    """
    Joint model over a sequencing and a spatial dataset.

    Parameters
    ----------
    adata_seq
        AnnData (set up with scvi) holding the sequencing data; its genes
        must be a superset of the spatial genes.
    adata_spatial
        AnnData (set up with scvi) holding the spatial data.
    generative_distributions
        Per-dataset generative distribution; defaults to ``["zinb", "nb"]``.
    model_library_size
        Per-dataset flag for modeling library size; defaults to
        ``[True, False]``.
    n_latent
        Dimensionality of the latent space.
    use_gpu
        Use the GPU when CUDA is available.
    **model_kwargs
        Forwarded to :class:`JVAE`.
    """
    # Resolve defaults here instead of in the signature: a mutable default
    # (list) would be shared across every call of this constructor.
    if generative_distributions is None:
        generative_distributions = ["zinb", "nb"]
    if model_library_size is None:
        model_library_size = [True, False]
    super().__init__(use_gpu=use_gpu)
    self.use_gpu = use_gpu and torch.cuda.is_available()
    self.adatas = [adata_seq, adata_spatial]
    # Keep a handle on each dataset's scvi setup dict.
    self.scvi_setup_dicts_ = {
        "seq": adata_seq.uns["_scvi"],
        "spatial": adata_spatial.uns["_scvi"],
    }

    seq_var_names = _get_var_names_from_setup_anndata(adata_seq)
    spatial_var_names = _get_var_names_from_setup_anndata(adata_spatial)
    if not set(spatial_var_names) <= set(seq_var_names):
        raise ValueError("spatial genes needs to be subset of seq genes")

    # Position of each spatial gene inside the seq gene ordering.
    spatial_gene_loc = [
        np.argwhere(seq_var_names == g)[0] for g in spatial_var_names
    ]
    spatial_gene_loc = np.concatenate(spatial_gene_loc)
    gene_mappings = [slice(None), spatial_gene_loc]
    sum_stats = [d.uns["_scvi"]["summary_stats"] for d in self.adatas]
    n_inputs = [s["n_vars"] for s in sum_stats]
    total_genes = adata_seq.uns["_scvi"]["summary_stats"]["n_vars"]

    # since we are combining datasets, we need to increment the batch_idx
    # of one of the datasets so the two batch ranges do not collide
    adata_seq_n_batches = adata_seq.uns["_scvi"]["summary_stats"]["n_batch"]
    adata_spatial.obs["_scvi_batch"] += adata_seq_n_batches

    n_batches = sum([s["n_batch"] for s in sum_stats])

    self.model = JVAE(
        n_inputs,
        total_genes,
        gene_mappings,
        generative_distributions,
        model_library_size,
        n_batch=n_batches,
        n_latent=n_latent,
        **model_kwargs,
    )

    self._model_summary_string = (
        "GimVI Model with the following params: \nn_latent: {}, n_inputs: {}, n_genes: {}, "
        + "n_batch: {}, generative distributions: {}"
    ).format(n_latent, n_inputs, total_genes, n_batches, generative_distributions)
    self.init_params_ = self._get_init_params(locals())
def __init__(
    self,
    adata_seq: AnnData,
    adata_spatial: AnnData,
    generative_distributions: List = None,
    model_library_size: List = None,
    n_latent: int = 10,
    use_cuda: bool = True,
    **model_kwargs,
):
    """
    Joint model over a sequencing and a spatial dataset.

    Parameters
    ----------
    adata_seq
        AnnData (set up with scvi) holding the sequencing data; its genes
        must be a superset of the spatial genes.
    adata_spatial
        AnnData (set up with scvi) holding the spatial data.
    generative_distributions
        Per-dataset generative distribution; defaults to ``["zinb", "nb"]``.
    model_library_size
        Per-dataset flag for modeling library size; defaults to
        ``[True, False]``.
    n_latent
        Dimensionality of the latent space.
    use_cuda
        Use CUDA when available.
    **model_kwargs
        Forwarded to :class:`JVAE`.
    """
    # Resolve defaults here instead of in the signature: a mutable default
    # (list) would be shared across every call of this constructor.
    if generative_distributions is None:
        generative_distributions = ["zinb", "nb"]
    if model_library_size is None:
        model_library_size = [True, False]
    super().__init__(use_cuda=use_cuda)
    self.use_cuda = use_cuda and torch.cuda.is_available()
    self.adatas = [adata_seq, adata_spatial]

    seq_var_names = _get_var_names_from_setup_anndata(adata_seq)
    spatial_var_names = _get_var_names_from_setup_anndata(adata_spatial)
    # Fail early with a clear message; otherwise the argwhere lookup below
    # raises an opaque IndexError for a missing gene.
    if not set(spatial_var_names) <= set(seq_var_names):
        raise ValueError("spatial genes needs to be subset of seq genes")

    # Position of each spatial gene inside the seq gene ordering.
    spatial_gene_loc = [
        np.argwhere(seq_var_names == g)[0] for g in spatial_var_names
    ]
    spatial_gene_loc = np.concatenate(spatial_gene_loc)
    gene_mappings = [slice(None), spatial_gene_loc]
    sum_stats = [d.uns["_scvi"]["summary_stats"] for d in self.adatas]
    n_inputs = [s["n_genes"] for s in sum_stats]
    total_genes = adata_seq.uns["_scvi"]["summary_stats"]["n_genes"]
    n_batches = sum([s["n_batch"] for s in sum_stats])

    self.model = JVAE(
        n_inputs,
        total_genes,
        gene_mappings,
        generative_distributions,
        model_library_size,
        n_batch=n_batches,
        n_latent=n_latent,
        **model_kwargs,
    )

    self._model_summary_string = "gimVI model with params"
    self.init_params_ = self._get_init_params(locals())
def get_loadings(self) -> pd.DataFrame:
    """
    Extract per-gene weights in the linear decoder.

    Shape is genes by `n_latent`.
    """
    column_labels = [f"Z_{dim}" for dim in range(self.n_latent)]
    gene_index = _get_var_names_from_setup_anndata(self.adata)
    return pd.DataFrame(
        self.module.get_loadings(), index=gene_index, columns=column_labels
    )
def differential_accessibility(
    self,
    adata: Optional[AnnData] = None,
    groupby: Optional[str] = None,
    group1: Optional[Iterable[str]] = None,
    group2: Optional[str] = None,
    idx1: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
    idx2: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
    mode: Literal["vanilla", "change"] = "change",
    delta: float = 0.05,
    batch_size: Optional[int] = None,
    all_stats: bool = True,
    batch_correction: bool = False,
    batchid1: Optional[Iterable[str]] = None,
    batchid2: Optional[Iterable[str]] = None,
    fdr_target: float = 0.05,
    silent: bool = False,
    two_sided: bool = True,
    **kwargs,
) -> pd.DataFrame:
    r"""
    A unified method for differential accessibility analysis.

    Implements `"vanilla"` DE [Lopez18]_ and `"change"` mode DE [Boyeau19]_.

    Parameters
    ----------
    {doc_differential_expression}
    two_sided
        Whether to perform a two-sided test, or a one-sided test.
    **kwargs
        Keyword args for :func:`scvi.utils.DifferentialComputation.get_bayes_factors`

    Returns
    -------
    Differential accessibility DataFrame with the following columns:
    prob_da
        the probability of the region being differentially accessible
    is_da_fdr
        whether the region passes a multiple hypothesis correction procedure
        with the target_fdr threshold
    bayes_factor
        Bayes Factor indicating the level of significance of the analysis
    effect_size
        the effect size, computed as (accessibility in population 2) -
        (accessibility in population 1)
    emp_effect
        the empirical effect, based on observed detection rates instead of the
        estimated accessibility scores from the PeakVI model
    est_prob1
        the estimated probability of accessibility in population 1
    est_prob2
        the estimated probability of accessibility in population 2
    emp_prob1
        the empirical (observed) probability of accessibility in population 1
    emp_prob2
        the empirical (observed) probability of accessibility in population 2

    Raises
    ------
    ValueError
        If `change_fn` is passed via `**kwargs` — the change function is fixed
        here so the effect-size columns below stay well defined.
    """
    # The change function is fixed below; a caller-supplied one would
    # otherwise collide with it inside `_de_core` (TypeError) or silently
    # invalidate the effect-size columns. Reject it up front.
    if "change_fn" in kwargs:
        raise ValueError(
            "change_fn is not customizable for differential accessibility."
        )

    adata = self._validate_anndata(adata)
    col_names = _get_var_names_from_setup_anndata(adata)
    model_fn = partial(
        self.get_accessibility_estimates, use_z_mean=False, batch_size=batch_size
    )

    # Effect size is the raw difference of accessibility estimates.
    def change_fn(a, b):
        return a - b

    # One- vs two-sided region of practical significance.
    if two_sided:

        def m1_domain_fn(samples):
            return np.abs(samples) >= delta

    else:

        def m1_domain_fn(samples):
            return samples >= delta

    result = _de_core(
        adata=adata,
        model_fn=model_fn,
        groupby=groupby,
        group1=group1,
        group2=group2,
        idx1=idx1,
        idx2=idx2,
        all_stats=all_stats,
        all_stats_fn=scatac_raw_counts_properties,
        col_names=col_names,
        mode=mode,
        batchid1=batchid1,
        batchid2=batchid2,
        delta=delta,
        batch_correction=batch_correction,
        fdr=fdr_target,
        change_fn=change_fn,
        m1_domain_fn=m1_domain_fn,
        silent=silent,
        **kwargs,
    )

    # manually change the results DataFrame to fit a PeakVI differential
    # accessibility results
    result = pd.DataFrame(
        {
            "prob_da": result.proba_de,
            "is_da_fdr": result.loc[:, "is_de_fdr_{}".format(fdr_target)],
            "bayes_factor": result.bayes_factor,
            "effect_size": result.scale2 - result.scale1,
            "emp_effect": result.emp_mean2 - result.emp_mean1,
            "est_prob1": result.scale1,
            "est_prob2": result.scale2,
            "emp_prob1": result.emp_mean1,
            "emp_prob2": result.emp_mean2,
        },
    )
    return result.reindex(adata.var.index)
def differential_expression(
    self,
    adata: Optional[AnnData] = None,
    groupby: Optional[str] = None,
    group1: Optional[Iterable[str]] = None,
    group2: Optional[str] = None,
    idx1: Optional[Union[Sequence[int], Sequence[bool]]] = None,
    idx2: Optional[Union[Sequence[int], Sequence[bool]]] = None,
    mode: Literal["vanilla", "change"] = "change",
    delta: float = 0.25,
    batch_size: Optional[int] = None,
    all_stats: bool = True,
    batch_correction: bool = False,
    batchid1: Optional[Iterable[str]] = None,
    batchid2: Optional[Iterable[str]] = None,
    fdr_target: float = 0.05,
    silent: bool = False,
    **kwargs,
) -> pd.DataFrame:
    r"""
    A unified method for differential expression analysis.

    Implements `"vanilla"` DE [Lopez18]_ and `"change"` mode DE [Boyeau19]_.

    Parameters
    ----------
    {doc_differential_expression}
    **kwargs
        Keyword args for :func:`scvi.utils.DifferentialComputation.get_bayes_factors`

    Returns
    -------
    Differential expression DataFrame.
    """
    adata = self._validate_anndata(adata)
    # Only the first `self.n_genes` features are tested.
    gene_names = _get_var_names_from_setup_anndata(adata)[: self.n_genes]
    gene_positions = np.arange(adata.shape[1])[: self.n_genes]

    expression_fn = partial(
        self.get_normalized_expression,
        batch_size=batch_size,
    )
    stats_fn = partial(
        scrna_raw_counts_properties,
        var_idx=gene_positions,
    )

    return _de_core(
        adata,
        model_fn=expression_fn,
        groupby=groupby,
        group1=group1,
        group2=group2,
        idx1=idx1,
        idx2=idx2,
        all_stats=all_stats,
        all_stats_fn=stats_fn,
        col_names=gene_names,
        mode=mode,
        batchid1=batchid1,
        batchid2=batchid2,
        delta=delta,
        batch_correction=batch_correction,
        fdr=fdr_target,
        silent=silent,
        **kwargs,
    )
def get_normalized_expression(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples_overall: Optional[int] = None,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    use_z_mean: bool = True,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
) -> Union[np.ndarray, pd.DataFrame]:
    r"""
    Returns the normalized (decoded) gene expression.

    This is denoted as :math:`\rho_n` in the scVI paper.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples_overall
        If given, randomly subsample `indices` to this many cells
        (sampling is done with ``np.random.choice``, i.e. with replacement).
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used.
        - int, then batch transform_batch is used.

        When several batches are given, results are averaged over them.
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    use_z_mean
        If True, use the mean of the latent distribution, otherwise sample from it
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    return_mean
        Whether to return the mean of the samples (only relevant when
        `n_samples` > 1).

    Returns
    -------
    A :class:`~numpy.ndarray` of normalized expression. If `n_samples` > 1 and
    `return_mean` is False, the shape is `(samples, cells, genes)`; otherwise
    `(cells, genes)`.
    """
    adata = self._validate_anndata(adata)
    if indices is None:
        indices = np.arange(adata.n_obs)
    if n_samples_overall is not None:
        indices = np.random.choice(indices, n_samples_overall)
    scdl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)

    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    # Boolean mask for the requested gene subset; full slice keeps everything.
    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in gene_list for gene in all_genes]

    exprs = []
    for tensors in scdl:
        per_batch_exprs = []
        for batch in transform_batch:
            if batch is not None:
                # Overwrite the observed batch so the decoder conditions on
                # the requested one.
                batch_indices = tensors[_CONSTANTS.BATCH_KEY]
                tensors[_CONSTANTS.BATCH_KEY] = (
                    torch.ones_like(batch_indices) * batch
                )
            _, generative_outputs = self.module.forward(
                tensors=tensors,
                inference_kwargs=dict(n_samples=n_samples),
                generative_kwargs=dict(use_z_mean=use_z_mean),
                compute_loss=False,
            )
            output = generative_outputs["px_scale"]
            output = output[..., gene_mask]
            output = output.cpu().numpy()
            per_batch_exprs.append(output)
        per_batch_exprs = np.stack(
            per_batch_exprs
        )  # shape is (len(transform_batch) x batch_size x n_var)
        # Average over the conditioning batches.
        exprs += [per_batch_exprs.mean(0)]

    if n_samples > 1:
        # The -2 axis correspond to cells.
        exprs = np.concatenate(exprs, axis=-2)
    else:
        exprs = np.concatenate(exprs, axis=0)

    if n_samples > 1 and return_mean:
        exprs = exprs.mean(0)

    return exprs
def get_feature_correlation_matrix(
    self,
    adata=None,
    indices=None,
    n_samples: int = 10,
    batch_size: int = 64,
    rna_size_factor: int = 1000,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    correlation_type: Literal["spearman", "pearson"] = "spearman",
    log_transform: bool = False,
) -> pd.DataFrame:
    """
    Generate gene-gene correlation matrix using scvi uncertainty and expression.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
        NOTE(review): `indices` is accepted but never forwarded to
        `_get_denoised_samples` below — confirm whether it should be.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    rna_size_factor
        size factor for RNA prior to sampling gamma distribution
    transform_batch
        Batches to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
        - list of int, then values are averaged over provided batches.
    correlation_type
        One of "pearson", "spearman".
    log_transform
        Whether to log transform denoised values prior to correlation calculation.

    Returns
    -------
    Gene-protein-gene-protein correlation matrix
    """
    from scipy.stats import spearmanr

    adata = self._validate_anndata(adata)
    # Normalize a scalar/None transform_batch into a one-element list.
    if not isinstance(transform_batch, IterableClass):
        transform_batch = [transform_batch]
    transform_batch = _get_batch_code_from_category(adata, transform_batch)
    corr_mats = []
    for b in transform_batch:
        denoised_data = self._get_denoised_samples(
            n_samples=n_samples,
            batch_size=batch_size,
            rna_size_factor=rna_size_factor,
            transform_batch=b,
        )
        # Stack the sample axis onto the cell axis:
        # (cells, features, samples) -> (cells * samples, features).
        flattened = np.zeros(
            (denoised_data.shape[0] * n_samples, denoised_data.shape[1])
        )
        for i in range(n_samples):
            flattened[
                denoised_data.shape[0] * (i) : denoised_data.shape[0] * (i + 1)
            ] = denoised_data[:, :, i]
        if log_transform is True:
            # Genes get log with a small offset; proteins get log1p.
            flattened[:, : self.n_genes] = np.log(
                flattened[:, : self.n_genes] + 1e-8
            )
            flattened[:, self.n_genes :] = np.log1p(flattened[:, self.n_genes :])
        if correlation_type == "pearson":
            corr_matrix = np.corrcoef(flattened, rowvar=False)
        else:
            corr_matrix, _ = spearmanr(flattened, axis=0)
        corr_mats.append(corr_matrix)
    # Average the correlation matrices over the conditioning batches.
    corr_matrix = np.mean(np.stack(corr_mats), axis=0)
    var_names = _get_var_names_from_setup_anndata(adata)
    names = np.concatenate(
        [np.asarray(var_names), self.scvi_setup_dict_["protein_names"]]
    )
    return pd.DataFrame(corr_matrix, index=names, columns=names)
def posterior_predictive_sample(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    gene_list: Optional[Sequence[str]] = None,
    protein_list: Optional[Sequence[str]] = None,
) -> np.ndarray:
    r"""
    Generate observation samples from the posterior predictive distribution.

    The posterior predictive distribution is written as
    :math:`p(\hat{x}, \hat{y} \mid x, y)`.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of required samples for each cell
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    gene_list
        Names of genes of interest
    protein_list
        Names of proteins of interest

    Returns
    -------
    x_new : :class:`~numpy.ndarray`
        tensor with shape (n_cells, n_genes, n_samples)
    """
    if self.module.gene_likelihood not in ["nb"]:
        raise ValueError("Invalid gene_likelihood")

    adata = self._validate_anndata(adata)

    # Boolean masks for the requested feature subsets; a full slice keeps
    # everything.
    if gene_list is None:
        gene_mask = slice(None)
    else:
        gene_mask = [
            g in gene_list for g in _get_var_names_from_setup_anndata(adata)
        ]
    if protein_list is None:
        protein_mask = slice(None)
    else:
        protein_mask = [
            name in protein_list
            for name in self.scvi_setup_dict_["protein_names"]
        ]

    loader = self._make_data_loader(
        adata=adata, indices=indices, batch_size=batch_size
    )

    drawn = []
    for tensors in loader:
        rna_draw, protein_draw = self.module.sample(tensors, n_samples=n_samples)
        combined = torch.cat(
            [rna_draw[..., gene_mask], protein_draw[..., protein_mask]], dim=-1
        ).numpy()
        if n_samples > 1:
            # (samples, cells, features) -> (cells, features, samples)
            combined = np.transpose(combined, (1, 2, 0))
        drawn.append(combined)

    return np.concatenate(drawn, axis=0)
def differential_expression(
    self,
    adata: Optional[AnnData] = None,
    groupby: Optional[str] = None,
    group1: Optional[Iterable[str]] = None,
    group2: Optional[str] = None,
    idx1: Optional[Union[Sequence[int], Sequence[bool]]] = None,
    idx2: Optional[Union[Sequence[int], Sequence[bool]]] = None,
    mode: Literal["vanilla", "change"] = "change",
    delta: float = 0.25,
    batch_size: Optional[int] = None,
    all_stats: bool = True,
    batch_correction: bool = False,
    batchid1: Optional[Iterable[str]] = None,
    batchid2: Optional[Iterable[str]] = None,
    fdr_target: float = 0.05,
    silent: bool = False,
    protein_prior_count: float = 0.1,
    scale_protein: bool = False,
    sample_protein_mixing: bool = False,
    include_protein_background: bool = False,
    **kwargs,
) -> pd.DataFrame:
    r"""
    A unified method for differential expression analysis.

    Implements `"vanilla"` DE [Lopez18]_ and `"change"` mode DE [Boyeau19]_.

    Parameters
    ----------
    {doc_differential_expression}
    protein_prior_count
        Prior count added to protein expression before LFC computation
    scale_protein
        Force protein values to sum to one in every single cell
        (post-hoc normalization)
    sample_protein_mixing
        Sample the protein mixture component, i.e., use the parameter to sample a
        Bernoulli that determines if expression is from foreground/background.
    include_protein_background
        Include the protein background component as part of the protein expression
    **kwargs
        Keyword args for :func:`scvi.utils.DifferentialComputation.get_bayes_factors`

    Returns
    -------
    Differential expression DataFrame.
    """
    adata = self._validate_anndata(adata)

    expression_fn = partial(
        self._expression_for_de,
        scale_protein=scale_protein,
        sample_protein_mixing=sample_protein_mixing,
        include_protein_background=include_protein_background,
        protein_prior_count=protein_prior_count,
        batch_size=batch_size,
    )

    # Feature labels are genes followed by proteins, matching the model_fn
    # output ordering.
    gene_names = np.asarray(_get_var_names_from_setup_anndata(adata))
    feature_names = np.concatenate(
        [gene_names, self.scvi_setup_dict_["protein_names"]]
    )

    return _de_core(
        adata,
        model_fn=expression_fn,
        groupby=groupby,
        group1=group1,
        group2=group2,
        idx1=idx1,
        idx2=idx2,
        all_stats=all_stats,
        all_stats_fn=cite_seq_raw_counts_properties,
        col_names=feature_names,
        mode=mode,
        batchid1=batchid1,
        batchid2=batchid2,
        delta=delta,
        batch_correction=batch_correction,
        fdr=fdr_target,
        silent=silent,
        **kwargs,
    )
def get_normalized_expression(
    self,
    adata=None,
    indices=None,
    transform_batch: Optional[Sequence[Union[Number, str]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    protein_list: Optional[Sequence[str]] = None,
    library_size: Optional[Union[float, Literal["latent"]]] = 1,
    n_samples: int = 1,
    sample_protein_mixing: bool = False,
    scale_protein: bool = False,
    include_protein_background: bool = False,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
    return_numpy: Optional[bool] = None,
) -> Tuple[Union[np.ndarray, pd.DataFrame], Union[np.ndarray, pd.DataFrame]]:
    r"""
    Returns the normalized gene expression and protein expression.

    This is denoted as :math:`\rho_n` in the totalVI paper for genes, and TODO
    for proteins, :math:`(1-\pi_{nt})\alpha_{nt}\beta_{nt}`.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
        - List[int], then average over batches in list
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    protein_list
        Return protein expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    library_size
        Scale the expression frequencies to a common library size.
        This allows gene expression levels to be interpreted on a common scale
        of relevant magnitude.
    n_samples
        Get sample scale from multiple samples.
    sample_protein_mixing
        Sample mixing bernoulli, setting background to zero
    scale_protein
        Make protein expression sum to 1
    include_protein_background
        Include background component for protein expression
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    return_mean
        Whether to return the mean of the samples.
    return_numpy
        Return a `np.ndarray` instead of a `pd.DataFrame`. Includes gene
        names as columns. If either n_samples=1 or return_mean=True, defaults
        to False. Otherwise, it defaults to True.

    Returns
    -------
    - **gene_normalized_expression** - normalized expression for RNA
    - **protein_normalized_expression** - normalized expression for proteins

    If ``n_samples`` > 1 and ``return_mean`` is False, then the shape is
    ``(samples, cells, genes)``. Otherwise, shape is ``(cells, genes)``.
    Return type is ``pd.DataFrame`` unless ``return_numpy`` is True.
    """
    adata = self._validate_anndata(adata)
    post = self._make_data_loader(
        adata=adata, indices=indices, batch_size=batch_size
    )

    # Boolean masks for the requested feature subsets; a full slice keeps
    # everything.
    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [True if gene in gene_list else False for gene in all_genes]
    if protein_list is None:
        protein_mask = slice(None)
    else:
        all_proteins = self.scvi_setup_dict_["protein_names"]
        protein_mask = [True if p in protein_list else False for p in all_proteins]

    if indices is None:
        indices = np.arange(adata.n_obs)

    if n_samples > 1 and return_mean is False:
        if return_numpy is False:
            warnings.warn(
                "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
            )
        return_numpy = True

    if not isinstance(transform_batch, IterableClass):
        transform_batch = [transform_batch]
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    scale_list_gene = []
    scale_list_pro = []

    for tensors in post:
        x = tensors[_CONSTANTS.X_KEY]
        y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
        # BUGFIX: allocate the accumulators already masked. Previously the
        # full-width accumulator was masked inside the batch loop (genes) or
        # a masked contribution was added into a full-width accumulator
        # (proteins), which broke shapes whenever a gene_list/protein_list
        # was combined with multiple transform_batch values.
        px_scale = torch.zeros_like(x)[..., gene_mask]
        py_scale = torch.zeros_like(y)[..., protein_mask]
        if n_samples > 1:
            px_scale = torch.stack(n_samples * [px_scale])
            py_scale = torch.stack(n_samples * [py_scale])
        for b in transform_batch:
            if b is not None:
                # Condition the decoder on the requested batch.
                batch_indices = tensors[_CONSTANTS.BATCH_KEY]
                tensors[_CONSTANTS.BATCH_KEY] = torch.ones_like(batch_indices) * b
            inference_kwargs = dict(n_samples=n_samples)
            inference_outputs, generative_outputs = self.module.forward(
                tensors=tensors,
                inference_kwargs=inference_kwargs,
                compute_loss=False,
            )
            if library_size == "latent":
                px_scale += generative_outputs["px_"]["rate"].cpu()[..., gene_mask]
            else:
                px_scale += generative_outputs["px_"]["scale"].cpu()[..., gene_mask]

            py_ = generative_outputs["py_"]
            # probability of background
            protein_mixing = 1 / (1 + torch.exp(-py_["mixing"].cpu()))
            if sample_protein_mixing is True:
                protein_mixing = torch.distributions.Bernoulli(
                    protein_mixing
                ).sample()
            protein_val = py_["rate_fore"].cpu() * (1 - protein_mixing)
            if include_protein_background is True:
                protein_val += py_["rate_back"].cpu() * protein_mixing
            if scale_protein is True:
                # Normalize over the full protein set before subsetting.
                protein_val = torch.nn.functional.normalize(
                    protein_val, p=1, dim=-1
                )
            protein_val = protein_val[..., protein_mask]
            py_scale += protein_val
        # Average over the conditioning batches.
        px_scale /= len(transform_batch)
        py_scale /= len(transform_batch)
        scale_list_gene.append(px_scale)
        scale_list_pro.append(py_scale)

    if n_samples > 1:
        # concatenate along batch dimension -> result shape = (samples, cells, features)
        scale_list_gene = torch.cat(scale_list_gene, dim=1)
        scale_list_pro = torch.cat(scale_list_pro, dim=1)
        # (cells, features, samples)
        scale_list_gene = scale_list_gene.permute(1, 2, 0)
        scale_list_pro = scale_list_pro.permute(1, 2, 0)
    else:
        scale_list_gene = torch.cat(scale_list_gene, dim=0)
        scale_list_pro = torch.cat(scale_list_pro, dim=0)

    if return_mean is True and n_samples > 1:
        scale_list_gene = torch.mean(scale_list_gene, dim=-1)
        scale_list_pro = torch.mean(scale_list_pro, dim=-1)

    scale_list_gene = scale_list_gene.cpu().numpy()
    scale_list_pro = scale_list_pro.cpu().numpy()
    if return_numpy is None or return_numpy is False:
        gene_df = pd.DataFrame(
            scale_list_gene,
            columns=adata.var_names[gene_mask],
            index=adata.obs_names[indices],
        )
        pro_df = pd.DataFrame(
            scale_list_pro,
            columns=self.scvi_setup_dict_["protein_names"][protein_mask],
            index=adata.obs_names[indices],
        )
        return gene_df, pro_df
    else:
        return scale_list_gene, scale_list_pro
def get_accessibility_estimates(
    self,
    adata: Optional[AnnData] = None,
    indices: Sequence[int] = None,
    n_samples_overall: Optional[int] = None,
    region_list: Optional[Sequence[str]] = None,
    transform_batch: Optional[Union[str, int]] = None,
    use_z_mean: bool = True,
    threshold: Optional[float] = None,
    normalize_cells: bool = False,
    normalize_regions: bool = False,
    batch_size: int = 128,
    return_numpy: bool = False,
) -> Union[pd.DataFrame, np.ndarray, csr_matrix]:
    """
    Impute the full accessibility matrix.

    Returns a matrix of accessibility probabilities for each cell and genomic
    region in the input (for return matrix A, A[i,j] is the probability that
    region j is accessible in cell i).

    Parameters
    ----------
    adata
        AnnData object that has been registered with scvi. If `None`, defaults
        to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples_overall
        Number of samples to return in total (subsampled from `indices` with
        ``np.random.choice``, i.e. with replacement).
    region_list
        Return accessibility estimates for this subset of regions. if `None`,
        all regions are used. This can save memory when dealing with large
        datasets.
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used
        - int, then batch transform_batch is used
    use_z_mean
        If True (default), use the distribution mean. Otherwise, sample from
        the distribution.
    threshold
        If provided, values below the threshold are replaced with 0 and a
        sparse matrix is returned instead. This is recommended for very large
        matrices. Must be between 0 and 1.
    normalize_cells
        Whether to reintroduce library size factors to scale the normalized
        probabilities. This makes the estimates closer to the input, but
        removes the library size correction. False by default.
    normalize_regions
        Whether to reintroduce region factors to scale the normalized
        probabilities. This makes the estimates closer to the input, but
        removes the region-level bias correction. False by default.
    batch_size
        Minibatch size for data loading into model
    return_numpy
        If `True` and `threshold=None`, return :class:`~numpy.ndarray`.
        If `True` and `threshold` is given, return
        :class:`~scipy.sparse.csr_matrix`. If `False`, return
        :class:`~pandas.DataFrame`. DataFrame includes regions names as
        columns.
    """
    adata = self._validate_anndata(adata)
    if indices is None:
        indices = np.arange(adata.n_obs)
    if n_samples_overall is not None:
        indices = np.random.choice(indices, n_samples_overall)
    post = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
    transform_batch = _get_batch_code_from_category(adata, transform_batch)

    # Boolean mask for the requested region subset; full slice keeps all.
    if region_list is None:
        region_mask = slice(None)
    else:
        all_regions = _get_var_names_from_setup_anndata(adata)
        region_mask = [region in region_list for region in all_regions]
    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError("the provided threshold must be between 0 and 1")

    imputed = []
    for tensors in post:
        # Only the first entry of transform_batch is conditioned on.
        get_generative_input_kwargs = dict(transform_batch=transform_batch[0])
        generative_kwargs = dict(use_z_mean=use_z_mean)
        inference_outputs, generative_outputs = self.module.forward(
            tensors=tensors,
            get_generative_input_kwargs=get_generative_input_kwargs,
            generative_kwargs=generative_kwargs,
            compute_loss=False,
        )
        p = generative_outputs["p"].cpu()

        if normalize_cells:
            # Reintroduce the per-cell library size factor.
            p *= inference_outputs["d"].cpu()
        if normalize_regions:
            # Reintroduce the per-region bias factor.
            p *= torch.sigmoid(self.module.region_factors).cpu()
        if threshold:
            # Sparsify: zero small probabilities, store as CSR.
            p[p < threshold] = 0
            p = csr_matrix(p.numpy())
        if region_list is not None:
            p = p[:, region_mask]
        imputed.append(p)

    if threshold:  # imputed is a list of csr_matrix objects
        imputed = vstack(imputed, format="csr")
    else:  # imputed is a list of tensors
        imputed = torch.cat(imputed).numpy()

    if return_numpy:
        return imputed
    elif threshold:
        return pd.DataFrame.sparse.from_spmatrix(
            imputed,
            index=adata.obs_names[indices],
            columns=adata.var_names[region_mask],
        )
    else:
        return pd.DataFrame(
            imputed,
            index=adata.obs_names[indices],
            columns=adata.var_names[region_mask],
        )
def posterior_predictive_sample(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    gene_list: Optional[Sequence[str]] = None,
    protein_list: Optional[Sequence[str]] = None,
) -> np.ndarray:
    r"""
    Generate observation samples from the posterior predictive distribution.

    The posterior predictive distribution is written as
    :math:`p(\hat{x}, \hat{y} \mid x, y)`.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of required samples for each cell
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    gene_list
        Names of genes of interest
    protein_list
        Names of proteins of interest

    Returns
    -------
    x_new : :class:`~numpy.ndarray`
        tensor with shape (n_cells, n_genes, n_samples)
    """
    if self.model.gene_likelihood not in ["nb"]:
        raise ValueError("Invalid gene_likelihood")

    adata = self._validate_anndata(adata)
    # Boolean masks for the requested feature subsets; full slice keeps all.
    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [True if gene in gene_list else False for gene in all_genes]
    if protein_list is None:
        protein_mask = slice(None)
    else:
        all_proteins = adata.uns["scvi_protein_names"]
        protein_mask = [True if p in protein_list else False for p in all_proteins]

    post = self._make_scvi_dl(adata=adata, indices=indices, batch_size=batch_size)

    scdl_list = []
    for tensors in post:
        x = tensors[_CONSTANTS.X_KEY]
        batch_idx = tensors[_CONSTANTS.BATCH_KEY]
        labels = tensors[_CONSTANTS.LABELS_KEY]
        y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
        with torch.no_grad():
            outputs = self.model.inference(
                x, y, batch_index=batch_idx, label=labels, n_samples=n_samples
            )
        px_ = outputs["px_"]
        py_ = outputs["py_"]

        # Sigmoid of the mixing logits: probability of the background
        # protein component; sample which component each value comes from.
        pi = 1 / (1 + torch.exp(-py_["mixing"]))
        mixing_sample = torch.distributions.Bernoulli(pi).sample()
        protein_rate = (
            py_["rate_fore"] * (1 - mixing_sample)
            + py_["rate_back"] * mixing_sample
        )
        rate = torch.cat(
            (px_["rate"][..., gene_mask], protein_rate[..., protein_mask]), dim=-1
        )
        # Broadcast shared (non per-cell) dispersions to the data shape.
        if len(px_["r"].size()) == 2:
            px_dispersion = px_["r"]
        else:
            px_dispersion = torch.ones_like(x) * px_["r"]
        if len(py_["r"].size()) == 2:
            py_dispersion = py_["r"]
        else:
            py_dispersion = torch.ones_like(y) * py_["r"]

        dispersion = torch.cat(
            (px_dispersion[..., gene_mask], py_dispersion[..., protein_mask]),
            dim=-1,
        )

        # This gamma is really l*w using scVI manuscript notation
        p = rate / (rate + dispersion)
        r = dispersion
        l_train = torch.distributions.Gamma(r, (1 - p) / p).sample()
        data = torch.distributions.Poisson(l_train).sample().cpu().numpy()
        # NOTE: in numpy (shape, scale) => (concentration, rate), with
        # scale = p / (1 - p); torch's Gamma takes rate = (1 - p) / p,
        # which is what is used above.
        scdl_list += [data]
        if n_samples > 1:
            scdl_list[-1] = np.transpose(scdl_list[-1], (1, 2, 0))

    scdl_list = np.concatenate(scdl_list, axis=0)
    return scdl_list
def posterior_predictive_sample(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    gene_list: Optional[Sequence[str]] = None,
    protein_list: Optional[Sequence[str]] = None,
) -> np.ndarray:
    r"""
    Generate observation samples from the posterior predictive distribution.

    The posterior predictive distribution is written as
    :math:`p(\hat{x}, \hat{y} \mid x, y)`. RNA counts are sampled from a
    negative binomial and protein counts from the model's
    background/foreground negative binomial mixture.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of required samples for each cell
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    gene_list
        Names of genes of interest
    protein_list
        Names of proteins of interest

    Returns
    -------
    x_new : :class:`~numpy.ndarray`
        tensor with shape (n_cells, n_genes, n_samples)
    """
    # Only the negative binomial likelihood is supported by this sampler.
    if self.model.gene_likelihood not in ["nb"]:
        raise ValueError("Invalid gene_likelihood")

    adata = self._validate_anndata(adata)

    # Boolean feature masks; a full slice means "keep everything".
    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in gene_list for gene in all_genes]
    if protein_list is None:
        protein_mask = slice(None)
    else:
        all_proteins = adata.uns["scvi_protein_names"]
        protein_mask = [p in protein_list for p in all_proteins]

    dataloader = self._make_scvi_dl(
        adata=adata, indices=indices, batch_size=batch_size
    )

    minibatch_draws = []
    for tensors in dataloader:
        x = tensors[_CONSTANTS.X_KEY]
        y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
        batch_idx = tensors[_CONSTANTS.BATCH_KEY]
        labels = tensors[_CONSTANTS.LABELS_KEY]
        with torch.no_grad():
            outputs = self.model.inference(
                x, y, batch_index=batch_idx, label=labels, n_samples=n_samples
            )
        px_ = outputs["px_"]
        py_ = outputs["py_"]

        rna_dist = NegativeBinomial(mu=px_["rate"], theta=px_["r"])
        protein_dist = NegativeBinomialMixture(
            mu1=py_["rate_back"],
            mu2=py_["rate_fore"],
            theta1=py_["r"],
            mixture_logits=py_["mixing"],
        )
        rna_draw = rna_dist.sample().cpu()[..., gene_mask]
        protein_draw = protein_dist.sample().cpu()[..., protein_mask]
        joint = torch.cat([rna_draw, protein_draw], dim=-1).numpy()
        if n_samples > 1:
            # (n_samples, cells, features) -> (cells, features, n_samples)
            joint = np.transpose(joint, (1, 2, 0))
        minibatch_draws.append(joint)

    return np.concatenate(minibatch_draws, axis=0)
def get_feature_correlation_matrix(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 10,
    batch_size: int = 64,
    rna_size_factor: int = 1000,
    transform_batch: Optional[Union[int, List[int]]] = None,
    correlation_type: Literal["spearman", "pearson"] = "spearman",
) -> pd.DataFrame:
    """
    Generate gene-gene correlation matrix using scvi uncertainty and expression.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    rna_size_factor
        size factor for RNA prior to sampling gamma distribution.
    transform_batch
        Batches to condition on.
        If transform_batch is:

        - None, then real observed batch is used.
        - int, then batch transform_batch is used.
        - list of int, then values are averaged over provided batches.
    correlation_type
        One of "pearson", "spearman".

    Returns
    -------
    Gene-gene correlation matrix
    """
    from scipy.stats import spearmanr

    adata = self._validate_anndata(adata)

    # Normalize to a list so a single (or absent) batch is handled uniformly.
    if (transform_batch is None) or (isinstance(transform_batch, int)):
        transform_batch = [transform_batch]

    per_batch_matrices = []
    for batch in transform_batch:
        denoised = self._get_denoised_samples(
            adata=adata,
            indices=indices,
            n_samples=n_samples,
            batch_size=batch_size,
            rna_size_factor=rna_size_factor,
            transform_batch=batch,
        )
        # Stack the posterior draws along the cell axis:
        # (cells, features, samples) -> (cells * samples, features)
        stacked = np.vstack([denoised[:, :, i] for i in range(n_samples)])
        if correlation_type == "pearson":
            corr = np.corrcoef(stacked, rowvar=False)
        elif correlation_type == "spearman":
            corr, _ = spearmanr(stacked)
        else:
            raise ValueError(
                "Unknown correlation type. Choose one of 'spearman', 'pearson'."
            )
        per_batch_matrices.append(corr)

    # Average the correlation matrices over the conditioned batches.
    mean_corr = np.mean(np.stack(per_batch_matrices), axis=0)
    names = _get_var_names_from_setup_anndata(adata)
    return pd.DataFrame(mean_corr, index=names, columns=names)
def get_normalized_expression(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    transform_batch: Optional[Sequence[Union[str, int]]] = None,
    gene_list: Optional[Sequence[str]] = None,
    library_size: Union[float, Literal["latent"]] = 1,
    n_samples: int = 1,
    batch_size: Optional[int] = None,
    return_mean: bool = True,
    return_numpy: Optional[bool] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    r"""
    Returns the normalized (decoded) gene expression.

    This is denoted as :math:`\rho_n` in the scVI paper.

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
        AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    transform_batch
        Batch to condition on.
        If transform_batch is:

        - None, then real observed batch is used.
        - int, then batch transform_batch is used.
    gene_list
        Return frequencies of expression for a subset of genes.
        This can save memory when working with large datasets and few genes are
        of interest.
    library_size
        Scale the expression frequencies to a common library size.
        This allows gene expression levels to be interpreted on a common scale of relevant
        magnitude. If set to `"latent"`, use the latent library size.
    n_samples
        Number of posterior samples to use for estimation.
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
    return_mean
        Whether to return the mean of the samples.
    return_numpy
        Return a :class:`~numpy.ndarray` instead of a :class:`~pandas.DataFrame`. DataFrame includes
        gene names as columns. If either `n_samples=1` or `return_mean=True`, defaults to `False`.
        Otherwise, it defaults to `True`.

    Returns
    -------
    If `n_samples` > 1 and `return_mean` is False, then the shape is `(samples, cells, genes)`.
    Otherwise, shape is `(cells, genes)`. In this case, return type is :class:`~pandas.DataFrame`
    unless `return_numpy` is True.
    """
    adata = self._validate_anndata(adata)
    scdl = self._make_scvi_dl(adata=adata, indices=indices, batch_size=batch_size)
    if transform_batch is not None:
        # Map user-facing batch categories to the model's integer codes.
        transform_batch = _get_batch_code_from_category(adata, transform_batch)
    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in gene_list for gene in all_genes]

    if n_samples > 1 and return_mean is False:
        if return_numpy is False:
            logger.warning(
                "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
            )
        # A (samples, cells, genes) array cannot be wrapped in a 2-D
        # DataFrame, so force the ndarray return regardless of the
        # (None or False) value the caller passed.
        return_numpy = True
    if indices is None:
        indices = np.arange(adata.n_obs)

    # "latent" uses the model's unscaled sample rate; a numeric library
    # size rescales the normalized (scale) output instead.
    if library_size == "latent":
        model_fn = self.model.get_sample_rate
        scaling = 1
    else:
        model_fn = self.model.get_sample_scale
        scaling = library_size

    exprs = []
    for tensors in scdl:
        x = tensors[_CONSTANTS.X_KEY]
        batch_idx = tensors[_CONSTANTS.BATCH_KEY]
        labels = tensors[_CONSTANTS.LABELS_KEY]
        output = model_fn(
            x,
            batch_index=batch_idx,
            y=labels,
            n_samples=n_samples,
            transform_batch=transform_batch,
        )[..., gene_mask]
        exprs.append(np.array((output * scaling).cpu()))

    if n_samples > 1:
        # The -2 axis corresponds to cells.
        exprs = np.concatenate(exprs, axis=-2)
    else:
        exprs = np.concatenate(exprs, axis=0)
    if n_samples > 1 and return_mean:
        exprs = exprs.mean(0)

    if return_numpy is None or return_numpy is False:
        return pd.DataFrame(
            exprs,
            columns=adata.var_names[gene_mask],
            index=adata.obs_names[indices],
        )
    return exprs
def posterior_predictive_sample(
    self,
    adata: Optional[AnnData] = None,
    indices: Optional[Sequence[int]] = None,
    n_samples: int = 1,
    gene_list: Optional[Sequence[str]] = None,
    batch_size: Optional[int] = None,
) -> np.ndarray:
    r"""
    Generate observation samples from the posterior predictive distribution.

    The posterior predictive distribution is written as
    :math:`p(\hat{x} \mid x)`. Counts are drawn from the configured gene
    likelihood (Poisson, negative binomial, or zero-inflated negative
    binomial).

    Parameters
    ----------
    adata
        AnnData object with equivalent structure to initial AnnData. If `None`,
        defaults to the AnnData object used to initialize the model.
    indices
        Indices of cells in adata to use. If `None`, all cells are used.
    n_samples
        Number of samples for each cell.
    gene_list
        Names of genes of interest.
    batch_size
        Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.

    Returns
    -------
    x_new : :py:class:`torch.Tensor`
        tensor with shape (n_cells, n_genes, n_samples)
    """
    # Guard: only these likelihoods have a sampler below.
    if self.model.gene_likelihood not in ["zinb", "nb", "poisson"]:
        raise ValueError("Invalid gene_likelihood.")

    adata = self._validate_anndata(adata)
    scdl = self._make_scvi_dl(adata=adata, indices=indices, batch_size=batch_size)
    if indices is None:
        indices = np.arange(adata.n_obs)

    if gene_list is None:
        gene_mask = slice(None)
    else:
        all_genes = _get_var_names_from_setup_anndata(adata)
        gene_mask = [gene in gene_list for gene in all_genes]

    collected = []
    for tensors in scdl:
        x = tensors[_CONSTANTS.X_KEY]
        batch_idx = tensors[_CONSTANTS.BATCH_KEY]
        labels = tensors[_CONSTANTS.LABELS_KEY]
        outputs = self.model.inference(
            x, batch_index=batch_idx, y=labels, n_samples=n_samples
        )
        px_r = outputs["px_r"]
        px_rate = outputs["px_rate"]
        px_dropout = outputs["px_dropout"]

        likelihood = self.model.gene_likelihood
        if likelihood == "poisson":
            # Clamp the rate to avoid numerical issues with extreme values.
            dist = torch.distributions.Poisson(torch.clamp(px_rate, max=1e8))
        elif likelihood == "nb":
            dist = NegativeBinomial(mu=px_rate, theta=px_r)
        elif likelihood == "zinb":
            dist = ZeroInflatedNegativeBinomial(
                mu=px_rate, theta=px_r, zi_logits=px_dropout
            )
        else:
            raise ValueError(
                "{} reconstruction error not handled right now".format(
                    self.model.gene_likelihood
                )
            )

        draws = dist.sample()
        if n_samples > 1:
            # (n_samples, cells_batch, genes) -> (cells_batch, genes, n_samples)
            draws = draws.permute([1, 2, 0])
        if gene_list is not None:
            draws = draws[:, gene_mask, ...]
        collected.append(draws.cpu())

    # Concatenate minibatches along the cell axis.
    return torch.cat(collected).numpy()