# Imports reconstructed for this excerpt; the scvi-tools-internal module paths
# are approximate and may differ between releases.
from functools import partial
from typing import Iterable, Optional, Sequence, Union

import numpy as np
import pandas as pd
import torch
from anndata import AnnData
from scipy.sparse import csr_matrix, vstack

from scvi._compat import Literal
from scvi._docs import doc_differential_expression
from scvi._utils import _doc_params
from scvi.model._utils import (
    _get_batch_code_from_category,
    _get_var_names_from_setup_anndata,
    scatac_raw_counts_properties,
)
from scvi.model.base import (
    ArchesMixin,
    BaseModelClass,
    UnsupervisedTrainingMixin,
    VAEMixin,
)
from scvi.model.base._utils import _de_core
from scvi.module import PEAKVAE
from scvi.train import SaveBestState

class PEAKVI(ArchesMixin, VAEMixin, UnsupervisedTrainingMixin, BaseModelClass):
    """
    Peak Variational Inference [Ashuach21]_

    Parameters
    ----------
    adata
        AnnData object that has been registered via :func:`~scvi.data.setup_anndata`.
    n_hidden
        Number of nodes per hidden layer. If `None`, defaults to square root of the number
        of regions.
    n_latent
        Dimensionality of the latent space. If `None`, defaults to square root of `n_hidden`.
    n_layers_encoder
        Number of hidden layers used for the encoder NN.
    n_layers_decoder
        Number of hidden layers used for the decoder NN.
    dropout_rate
        Dropout rate for neural networks.
    model_depth
        Model sequencing depth / library size (default: True).
    region_factors
        Include region-specific factors in the model (default: True).
    use_batch_norm
        Which of the encoder and decoder networks, if any, apply batch normalization.
    use_layer_norm
        Which of the encoder and decoder networks, if any, apply layer normalization.
    latent_distribution
        One of

        * ``'normal'`` - Normal distribution (Default)
        * ``'ln'`` - Logistic normal distribution (Normal(0, I) transformed by softmax)
    deeply_inject_covariates
        Whether to deeply inject covariates into all layers of the decoder. If False (default),
        covariates will only be included in the input layer.
    encode_covariates
        Whether to also provide the covariates to the encoder network.
    **model_kwargs
        Keyword args for :class:`~scvi.module.PEAKVAE`

    Examples
    --------
    >>> adata = anndata.read_h5ad(path_to_anndata)
    >>> scvi.data.setup_anndata(adata, batch_key="batch")
    >>> vae = scvi.model.PEAKVI(adata)
    >>> vae.train()

    Notes
    -----
    See further usage examples in the following tutorials:

    1. :doc:`/user_guide/notebooks/PeakVI`
    """

    def __init__(
        self,
        adata: AnnData,
        n_hidden: Optional[int] = None,
        n_latent: Optional[int] = None,
        n_layers_encoder: int = 2,
        n_layers_decoder: int = 2,
        dropout_rate: float = 0.1,
        model_depth: bool = True,
        region_factors: bool = True,
        use_batch_norm: Literal["encoder", "decoder", "none", "both"] = "none",
        use_layer_norm: Literal["encoder", "decoder", "none", "both"] = "both",
        latent_distribution: Literal["normal", "ln"] = "normal",
        deeply_inject_covariates: bool = False,
        encode_covariates: bool = False,
        **model_kwargs,
    ):
        super().__init__(adata)

        n_cats_per_cov = (
            self.scvi_setup_dict_["extra_categoricals"]["n_cats_per_key"]
            if "extra_categoricals" in self.scvi_setup_dict_
            else []
        )

        self.module = PEAKVAE(
            n_input_regions=self.summary_stats["n_vars"],
            n_batch=self.summary_stats["n_batch"],
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers_encoder,
            n_layers_decoder=n_layers_decoder,
            n_continuous_cov=self.summary_stats["n_continuous_covs"],
            n_cats_per_cov=n_cats_per_cov,
            dropout_rate=dropout_rate,
            model_depth=model_depth,
            region_factors=region_factors,
            use_batch_norm=use_batch_norm,
            use_layer_norm=use_layer_norm,
            latent_distribution=latent_distribution,
            deeply_inject_covariates=deeply_inject_covariates,
            encode_covariates=encode_covariates,
            **model_kwargs,
        )
        self._model_summary_string = (
            "PeakVI Model with params: \nn_hidden: {}, n_latent: {}, n_layers_encoder: {}, "
            "n_layers_decoder: {}, dropout_rate: {}, latent_distribution: {}, deep injection: {}, "
            "encode_covariates: {}"
        ).format(
            self.module.n_hidden,
            self.module.n_latent,
            n_layers_encoder,
            n_layers_decoder,
            dropout_rate,
            latent_distribution,
            deeply_inject_covariates,
            encode_covariates,
        )
        self.n_latent = n_latent
        self.init_params_ = self._get_init_params(locals())
    def train(
        self,
        max_epochs: int = 500,
        lr: float = 1e-4,
        use_gpu: Optional[Union[str, int, bool]] = None,
        train_size: float = 0.9,
        validation_size: Optional[float] = None,
        batch_size: int = 128,
        weight_decay: float = 1e-3,
        eps: float = 1e-08,
        early_stopping: bool = True,
        early_stopping_patience: int = 50,
        save_best: bool = True,
        check_val_every_n_epoch: Optional[int] = None,
        n_steps_kl_warmup: Optional[int] = None,
        n_epochs_kl_warmup: Optional[int] = 50,
        plan_kwargs: Optional[dict] = None,
        **kwargs,
    ):
        """
        Trains the model using amortized variational inference.

        Parameters
        ----------
        max_epochs
            Number of passes through the dataset.
        lr
            Learning rate for optimization.
        use_gpu
            Use default GPU if available (if None or True), or index of GPU to use (if int),
            or name of GPU (if str, e.g., `'cuda:0'`), or use CPU (if False).
        train_size
            Size of training set in the range [0.0, 1.0].
        validation_size
            Size of the test set. If `None`, defaults to 1 - `train_size`. If
            `train_size + validation_size < 1`, the remaining cells belong to a test set.
        batch_size
            Minibatch size to use during training.
        weight_decay
            Weight decay regularization term for optimization.
        eps
            Optimizer eps.
        early_stopping
            Whether to perform early stopping with respect to the validation set.
        early_stopping_patience
            How many epochs to wait for improvement before early stopping.
        save_best
            Save the best model state with respect to the validation loss (default),
            or use the final state in the training procedure.
        check_val_every_n_epoch
            Check val every n train epochs. By default, val is not checked, unless
            `early_stopping` is `True`. If so, val is checked every epoch.
        n_steps_kl_warmup
            Number of training steps (minibatches) to scale weight on KL divergences from
            0 to 1. Only activated when `n_epochs_kl_warmup` is set to None. If `None`,
            defaults to `floor(0.75 * adata.n_obs)`.
        n_epochs_kl_warmup
            Number of epochs to scale weight on KL divergences from 0 to 1. Overrides
            `n_steps_kl_warmup` when both are not `None`.
        plan_kwargs
            Keyword args for :class:`~scvi.train.TrainingPlan`. Keyword arguments passed to
            `train()` will overwrite values present in `plan_kwargs`, when appropriate.
        **kwargs
            Other keyword args for :class:`~scvi.train.Trainer`.
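
        Examples
        --------
        A minimal sketch of a typical call (argument values here are
        illustrative, not prescriptive):

        >>> model = scvi.model.PEAKVI(adata)
        >>> model.train(max_epochs=500, early_stopping=True)
        >>> model.history["reconstruction_loss_validation"].tail()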
""" update_dict = dict( lr=lr, weight_decay=weight_decay, eps=eps, n_epochs_kl_warmup=n_epochs_kl_warmup, n_steps_kl_warmup=n_steps_kl_warmup, optimizer="AdamW", ) if plan_kwargs is not None: plan_kwargs.update(update_dict) else: plan_kwargs = update_dict if save_best: if "callbacks" not in kwargs.keys(): kwargs["callbacks"] = [] kwargs["callbacks"].append( SaveBestState(monitor="reconstruction_loss_validation")) super().train( max_epochs=max_epochs, train_size=train_size, use_gpu=use_gpu, validation_size=validation_size, early_stopping=early_stopping, early_stopping_monitor="reconstruction_loss_validation", early_stopping_patience=early_stopping_patience, plan_kwargs=plan_kwargs, check_val_every_n_epoch=check_val_every_n_epoch, batch_size=batch_size, **kwargs, ) @torch.no_grad() def get_library_size_factors( self, adata: Optional[AnnData] = None, indices: Sequence[int] = None, batch_size: int = 128, ): adata = self._validate_anndata(adata) scdl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size) library_sizes = [] for tensors in scdl: inference_inputs = self.module._get_inference_input(tensors) outputs = self.module.inference(**inference_inputs) library_sizes.append(outputs["d"].cpu()) return torch.cat(library_sizes).numpy().squeeze() @torch.no_grad() def get_region_factors(self): if self.module.region_factors is None: raise RuntimeError( "region factors were not included in this model") return torch.sigmoid(self.module.region_factors).cpu().numpy() @torch.no_grad() def get_accessibility_estimates( self, adata: Optional[AnnData] = None, indices: Sequence[int] = None, region_indices: Sequence[int] = None, transform_batch: Optional[Union[str, int]] = None, use_z_mean: bool = True, threshold: Optional[float] = None, normalize_cells: bool = False, normalize_regions: bool = False, batch_size: int = 128, ) -> Union[np.ndarray, csr_matrix]: """ Impute the full accessibility matrix. Returns a matrix of accessibility probabilities for each cell and genomic region in the input (for return matrix A, A[i,j] is the probability that region j is accessible in cell i). Parameters ---------- adata AnnData object that has been registered with scvi. If `None`, defaults to the AnnData object used to initialize the model. indices Indices of cells in adata to use. If `None`, all cells are used. region_indices Indices of regions to use. if `None`, all regions are used. transform_batch Batch to condition on. If transform_batch is: - None, then real observed batch is used - int, then batch transform_batch is used use_z_mean If True (default), use the distribution mean. Otherwise, sample from the distribution. threshold If provided, values below the threshold are replaced with 0 and a sparse matrix is returned instead. This is recommended for very large matrices. Must be between 0 and 1. normalize_cells Whether to reintroduce library size factors to scale the normalized probabilities. This makes the estimates closer to the input, but removes the library size correction. False by default. normalize_regions Whether to reintroduce region factors to scale the normalized probabilities. This makes the estimates closer to the input, but removes the region-level bias correction. False by default. 
        """
        adata = self._validate_anndata(adata)
        post = self._make_data_loader(
            adata=adata, indices=indices, batch_size=batch_size
        )
        transform_batch = _get_batch_code_from_category(adata, transform_batch)

        if threshold is not None and (threshold < 0 or threshold > 1):
            raise ValueError("the provided threshold must be between 0 and 1")

        imputed = []
        for tensors in post:
            get_generative_input_kwargs = dict(transform_batch=transform_batch[0])
            generative_kwargs = dict(use_z_mean=use_z_mean)
            inference_outputs, generative_outputs = self.module.forward(
                tensors=tensors,
                get_generative_input_kwargs=get_generative_input_kwargs,
                generative_kwargs=generative_kwargs,
                compute_loss=False,
            )
            p = generative_outputs["p"].cpu()

            if normalize_cells:
                p *= inference_outputs["d"].cpu()
            if normalize_regions:
                p *= torch.sigmoid(self.module.region_factors).cpu()
            if threshold:
                p[p < threshold] = 0
                p = csr_matrix(p.numpy())
            if region_indices is not None:
                p = p[:, region_indices]
            imputed.append(p)

        if threshold:  # imputed is a list of csr_matrix objects
            imputed = vstack(imputed, format="csr")
        else:  # imputed is a list of tensors
            imputed = torch.cat(imputed).numpy()

        return imputed

    @_doc_params(
        doc_differential_expression=doc_differential_expression,
    )
    def differential_accessibility(
        self,
        adata: Optional[AnnData] = None,
        groupby: Optional[str] = None,
        group1: Optional[Iterable[str]] = None,
        group2: Optional[str] = None,
        idx1: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
        idx2: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
        mode: Literal["vanilla", "change"] = "change",
        delta: float = 0.05,
        batch_size: Optional[int] = None,
        all_stats: bool = True,
        batch_correction: bool = False,
        batchid1: Optional[Iterable[str]] = None,
        batchid2: Optional[Iterable[str]] = None,
        fdr_target: float = 0.05,
        silent: bool = False,
        two_sided: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        r"""
        A unified method for differential accessibility analysis.

        Implements `"vanilla"` DE [Lopez18]_ and `"change"` mode DE [Boyeau19]_.

        Parameters
        ----------
        {doc_differential_expression}
        two_sided
            Whether to perform a two-sided test, or a one-sided test.
        **kwargs
            Keyword args for :func:`scvi.utils.DifferentialComputation.get_bayes_factors`

        Returns
        -------
        Differential accessibility DataFrame with the following columns:

        prob_da
            the probability of the region being differentially accessible
        is_da_fdr
            whether the region passes a multiple hypothesis correction procedure with the
            `fdr_target` threshold
        bayes_factor
            Bayes Factor indicating the level of significance of the analysis
        effect_size
            the effect size, computed as (accessibility in population 2) - (accessibility in
            population 1)
        emp_effect
            the empirical effect, based on observed detection rates instead of the estimated
            accessibility scores from the PeakVI model
        est_prob1
            the estimated probability of accessibility in population 1
        est_prob2
            the estimated probability of accessibility in population 2
        emp_prob1
            the empirical (observed) probability of accessibility in population 1
        emp_prob2
            the empirical (observed) probability of accessibility in population 2
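
        Examples
        --------
        A sketch of a two-group comparison; ``"cell_type"`` and the group
        labels are illustrative placeholders:

        >>> da = model.differential_accessibility(
        ...     adata, groupby="cell_type", group1="B cells", group2="T cells"
        ... )
        >>> da[da.is_da_fdr].head()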
        """
        adata = self._validate_anndata(adata)
        col_names = _get_var_names_from_setup_anndata(adata)
        model_fn = partial(
            self.get_accessibility_estimates, use_z_mean=False, batch_size=batch_size
        )

        # TODO check if change_fn in kwargs and raise error if so
        def change_fn(a, b):
            return a - b

        if two_sided:

            def m1_domain_fn(samples):
                return np.abs(samples) >= delta

        else:

            def m1_domain_fn(samples):
                return samples >= delta

        result = _de_core(
            adata=adata,
            model_fn=model_fn,
            groupby=groupby,
            group1=group1,
            group2=group2,
            idx1=idx1,
            idx2=idx2,
            all_stats=all_stats,
            all_stats_fn=scatac_raw_counts_properties,
            col_names=col_names,
            mode=mode,
            batchid1=batchid1,
            batchid2=batchid2,
            delta=delta,
            batch_correction=batch_correction,
            fdr=fdr_target,
            change_fn=change_fn,
            m1_domain_fn=m1_domain_fn,
            silent=silent,
            **kwargs,
        )

        # Rename the generic DE columns to PeakVI differential accessibility results.
        result = pd.DataFrame(
            {
                "prob_da": result.proba_de,
                "is_da_fdr": result.loc[:, "is_de_fdr_{}".format(fdr_target)],
                "bayes_factor": result.bayes_factor,
                "effect_size": result.scale2 - result.scale1,
                "emp_effect": result.emp_mean2 - result.emp_mean1,
                "est_prob1": result.scale1,
                "est_prob2": result.scale2,
                "emp_prob1": result.emp_mean1,
                "emp_prob2": result.emp_mean2,
            },
        )
        return result.reindex(adata.var.index)
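
# ---------------------------------------------------------------------------
# End-to-end usage sketch (illustrative; ``path_to_anndata`` and the
# ``"batch"`` key are placeholders, and registration uses the
# ``scvi.data.setup_anndata`` API assumed throughout this module):
#
# >>> import anndata
# >>> import scvi
# >>> adata = anndata.read_h5ad(path_to_anndata)
# >>> scvi.data.setup_anndata(adata, batch_key="batch")
# >>> model = scvi.model.PEAKVI(adata)
# >>> model.train()
# >>> latent = model.get_latent_representation()  # provided by VAEMixin
# >>> region_probs = model.get_accessibility_estimates(adata)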