def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are (meta)cells and the variables are genes
    (the variable names of both must be identical and in the same order), where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    The ``weights`` of the projection, a matrix with one row per query metacell and one column per atlas metacell.
    Each value is the weight of the atlas metacell in the projection of the query metacell; the weights in each row
    are expected to sum to one.

    **Returns**

    Sets the following annotations in ``qdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected``
            The number of UMIs of each gene in the projected image of each query metacell on the atlas, scaled by
            the total UMIs of the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in each atlas metacell, out of ``atlas_total_umis`` if given, otherwise
       out of the sum of the ``what`` data of the metacell.

    2. Compute the projected image of each query metacell as the ``weights``-weighted sum of the atlas fractions.

    3. Convert this image to UMIs by multiplying by ``query_total_umis`` if given, otherwise by the sum of the
       ``what`` data of each query metacell. Note that if either total is overridden, the projected UMIs of a
       metacell need not sum to its total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_data = ut.get_vo_proper(adata, what, layout="row_major")
    query_data = ut.get_vo_proper(qdata, what, layout="row_major")

    # Fall back to the per-metacell sums only when the caller did not override the totals.
    atlas_totals = ut.sum_per(atlas_data, per="row") if atlas_total_umis is None else atlas_total_umis
    atlas_totals = ut.to_numpy_vector(atlas_totals)

    query_totals = ut.sum_per(query_data, per="row") if query_total_umis is None else query_total_umis
    query_totals = ut.to_numpy_vector(query_totals)

    atlas_fraction_of_genes = ut.fraction_by(atlas_data, by="row", sums=atlas_totals)
    atlas_fraction_of_genes = ut.to_numpy_matrix(atlas_fraction_of_genes)

    image_fractions = weights @ atlas_fraction_of_genes  # type: ignore
    image_umis = ut.scale_by(image_fractions, scale=query_totals, by="row")
    ut.set_vo_data(qdata, "projected", image_umis)
def _pick_candidates(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    max_gene_cell_fraction: float,
    min_gene_maximum: int,
    min_genes_of_modules: int,
    allowed_genes_mask: ut.NumpyVector,
) -> Optional[Tuple[AnnData, ut.NumpyVector]]:
    """
    Pick the candidate genes out of all the genes.

    A gene is a candidate if it is allowed by ``allowed_genes_mask``, it is non-zero in at most a
    ``max_gene_cell_fraction`` of the cells, and its maximal ``what`` value in any cell is at least
    ``min_gene_maximum``.

    Returns ``None`` if there are fewer than ``min_genes_of_modules`` candidates. Otherwise returns a slice of the
    data containing only the candidate genes, together with the indices of these genes in the full data.
    """
    full_adata = adata_of_all_genes_of_all_cells
    column_data = ut.get_vo_proper(full_adata, what, layout="column_major")

    nonzero_cells_per_gene = ut.nnz_per(column_data, per="column")
    nonzero_fraction_per_gene = nonzero_cells_per_gene / full_adata.n_obs
    is_rare_enough = nonzero_fraction_per_gene <= max_gene_cell_fraction

    peak_per_gene = ut.max_per(column_data, per="column")
    is_strong_enough = peak_per_gene >= min_gene_maximum

    is_candidate = is_strong_enough & is_rare_enough & allowed_genes_mask
    ut.log_calc("candidate_genes", is_candidate)

    candidate_indices = np.where(is_candidate)[0]
    if candidate_indices.size < min_genes_of_modules:
        return None

    candidates_adata = ut.slice(
        full_adata,
        name=".candidate_genes",
        vars=candidate_indices,
        top_level=False,
    )
    return candidates_adata, candidate_indices
def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high relative variance of ``what`` (default: {what}) data.

    The relative variance measures the variance / mean of each gene relative to the other genes with a similar level
    of expression. See :py:func:`metacells.utilities.computation.relative_variance_per` for details.

    Genes with a high relative variance are good candidates for being selected as "feature genes", that is, be used
    to compute the similarity between cells. Using the relative variance compensates for the bias for selecting
    higher-expression genes, whose normalized variance can be larger due to random noise alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per` (with the given ``window_size``,
       default: {window_size}) to get the relative variance of each gene.

    2. Select the genes whose relative variance is at least ``min_gene_relative_variance`` (default:
       {min_gene_relative_variance}).
    """
    column_data = ut.get_vo_proper(adata, what, layout="column_major")
    variance_per_gene = ut.relative_variance_per(column_data, per="column", window_size=window_size)
    is_high = variance_per_gene >= min_gene_relative_variance

    if not inplace:
        ut.log_return("high_relative_variance_genes", is_high)
        return ut.to_pandas_series(is_high, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", is_high)
    return None
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high normalized variance of ``what`` (default: {what}) data.

    The normalized variance measures the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "noisy", that is, have significantly different expression level in
    different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the normalized variance of
       each gene.

    2. Select the genes whose normalized variance is at least ``min_gene_normalized_variance`` (default:
       {min_gene_normalized_variance}).
    """
    column_data = ut.get_vo_proper(adata, what, layout="column_major")
    variance_per_gene = ut.normalized_variance_per(column_data, per="column")
    is_high = variance_per_gene >= min_gene_normalized_variance

    if not inplace:
        ut.log_return("high_normalized_variance_genes", is_high)
        return ut.to_pandas_series(is_high, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", is_high)
    return None
def find_biased_genes(
    adata: AnnData,
    *,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    min_metacells_fraction: float = pr.biased_min_metacells_fraction,
    abs_folds: bool = pr.project_abs_folds,
    to_property_name: str = "biased_gene",
) -> None:
    """
    Find genes that have a strong bias in the query compared to the atlas.

    **Input**

    Annotated query ``adata`` where the observations are (meta)cells and the variables are genes. This should
    contain a ``projected_fold`` per-variable-per-observation matrix with the fold factor between each query
    metacell and its projected image on the atlas.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``biased_gene`` (or ``to_property_name``):
            A boolean mask indicating whether the gene has a strong bias in the query compared to the atlas.

    **Computation Parameters**

    1. Count for each gene the number of query metacells for which the ``projected_fold`` is above
       ``max_projection_fold_factor`` (default: {max_projection_fold_factor}). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factor.

    2. Mark the gene as biased if this count is at least a ``min_metacells_fraction`` (default:
       {min_metacells_fraction}) of the metacells.
    """
    assert max_projection_fold_factor >= 0
    assert 0 <= min_metacells_fraction <= 1

    folds = ut.get_vo_proper(adata, "projected_fold", layout="column_major")
    compared_folds = np.abs(folds) if abs_folds else folds  # type: ignore

    is_high_fold = ut.to_numpy_matrix(compared_folds > max_projection_fold_factor)  # type: ignore
    ut.log_calc("high_projection_folds", is_high_fold)

    high_count_per_gene = ut.sum_per(is_high_fold, per="column")
    required_count = adata.n_obs * min_metacells_fraction
    is_biased = high_count_per_gene >= required_count

    ut.set_v_data(adata, to_property_name, is_biased)
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,  # pylint: disable=invalid-name
    min_gene_topN: int,  # pylint: disable=invalid-name
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high top-Nth value of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling depth does not
    affect the result.

    Genes with too-low expression are typically excluded from computations. In particular, genes may have all-zero
    expression, in which case including them just slows the computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.rank_per` to get, for each gene, the value at rank
       ``max(n_obs - topN - 1, 1)`` across the cells.

    2. Select the genes for which this value is at least ``min_gene_topN``.
    """
    column_data = ut.get_vo_proper(adata, what, layout="column_major")

    # NOTE(review): whether this rank is exactly the "top-Nth" element depends on the (not visible here) indexing
    # convention of ``rank_per`` - confirm.
    wanted_rank = max(adata.n_obs - topN - 1, 1)
    value_per_gene = ut.rank_per(column_data, per="column", rank=wanted_rank)
    is_high = value_per_gene >= min_gene_topN

    if not inplace:
        ut.log_return(f"high_top{topN}_genes", is_high)
        return ut.to_pandas_series(is_high, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", is_high)
    return None
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high fraction of the total ``what`` (default: {what}) data of the cells.

    Genes with too-low expression are typically excluded from computations. In particular, genes may have all-zero
    expression, in which case including them just slows the computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high fraction.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default: {min_gene_fraction}).
    """
    column_data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_per_gene = ut.fraction_per(column_data, per="column")
    is_high = fraction_per_gene >= min_gene_fraction

    if not inplace:
        ut.log_return("high_fraction_genes", is_high)
        return ut.to_pandas_series(is_high, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", is_high)
    return None
def compute_similar_query_metacells(
    adata: AnnData,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Mark query metacells that are similar to their projection on the atlas.

    This does not guarantee the query metacell is "the same as" its projection on the atlas; rather, it means the
    two are sufficiently similar that one can be "reasonably confident" in applying atlas metadata to the query
    metacell based on the projection, which is a much lower bar.

    **Input**

    Annotated query ``adata``, where the observations are (meta)cells and the variables are genes. The data should
    contain a per-variable-per-observation ``projected_fold`` annotation with the significant projection fold
    factors, as computed by :py:func:`compute_significant_projected_fold_factors`.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating the query metacell is similar to its projection in the atlas.

    **Computation Parameters**

    1. Mark as dissimilar any query metacell which has even one gene whose projection fold is above
       ``max_projection_fold_factor``. If ``abs_folds``, consider the absolute fold factors.
    """
    assert max_projection_fold_factor >= 0

    folds = ut.get_vo_proper(adata, "projected_fold", layout="row_major")
    compared_folds = np.abs(folds) if abs_folds else folds  # type: ignore

    is_high_fold = compared_folds > max_projection_fold_factor  # type: ignore
    high_count_per_metacell = ut.sum_per(is_high_fold, per="row")  # type: ignore

    # A metacell is similar only if none of its genes exceeds the maximal fold factor.
    ut.set_o_data(adata, "similar", high_count_per_metacell == 0)
def find_distinct_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "distinct_fold",
    *,
    distinct_genes_count: int = pr.distinct_genes_count,
    distinct_abs_folds: bool = pr.distinct_abs_folds,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Find for each observation (cell) the genes in which its ``what`` (default: {what}) value is most distinct from
    the general population. This is typically applied to the metacells data rather than to the cells data.

    **Input**

    Annotated ``adata``, where the observations are (meta)cells and the variables are genes, including a
    per-observation-per-variable annotated folds data, ``what`` (default: {what}), e.g. as computed by
    :py:func:`compute_distinct_folds`.

    **Returns**

    Observation-Any (Cell) Annotations
        ``cell_distinct_gene_indices``
            For each cell, the indices of its top ``distinct_genes_count`` genes.
        ``cell_distinct_gene_folds``
            For each cell, the fold factors of its top ``distinct_genes_count`` genes.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as two pandas frames (indexed by the observation names, with a column per distinct gene rank).

    **Computation Parameters**

    1. Fetch the previously computed per-observation-per-variable ``what`` data.

    2. Keep the ``distinct_genes_count`` (default: {distinct_genes_count}) top fold factors. If
       ``distinct_abs_folds`` (default: {distinct_abs_folds}), keep the top absolute fold factors.
    """
    assert 0 < distinct_genes_count < adata.n_vars

    # Output buffers, filled in place by the extension function below.
    result_shape = (adata.n_obs, distinct_genes_count)
    top_indices = np.empty(result_shape, dtype="int32")
    top_folds = np.empty(result_shape, dtype="float32")

    row_folds = ut.get_vo_proper(adata, what, layout="row_major")
    row_folds = ut.mustbe_numpy_matrix(row_folds)

    # The extension function is looked up by the data type of the folds matrix.
    top_distinct = getattr(xt, f"top_distinct_{row_folds.dtype}_t")
    top_distinct(top_indices, top_folds, row_folds, distinct_abs_folds)

    if not inplace:
        indices_frame = ut.to_pandas_frame(top_indices, index=adata.obs_names)
        folds_frame = ut.to_pandas_frame(top_folds, index=adata.obs_names)
        return indices_frame, folds_frame

    ut.set_oa_data(adata, "cell_distinct_gene_indices", top_indices)
    ut.set_oa_data(adata, "cell_distinct_gene_folds", top_folds)
    return None
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they belong to, based on
    ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the cell is not
            deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as two pandas series: the cell votes (indexed by the observation names) followed by the gene
    votes (indexed by the variable names).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared to the mean
    expression in the candidate metacells. We then mark as deviants some fraction of the cells whose expression of
    these genes was least predictable compared to the mean expression in the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene. Scale this by each
       cell's total UMIs to compute the expected number of UMIs for each cell. Compute the fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)) for each gene for each cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default: {min_gene_fold_factor}). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors. Count the number of genes which
       have a fold factor above this minimum in at least one cell. If the fraction of such genes is above
       ``max_gene_fraction`` (default: {max_gene_fraction}), then raise the minimal gene fold factor such that at
       most this fraction of genes remain. A value of ``None`` is treated as a fraction of 1.

    3. For each remaining gene, rank all the cells where it is expressed above the min fold factor. Give an
       artificial maximum rank to all cells with fold factor 0, that is, below the minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell has a rank of 1, it
       means that it has at least one gene whose expression fold factor is the worst (highest) across all cells
       (and is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that is, which contain
       at least one gene whose expression fold factor is high relative to the rest of the cells. If the fraction of
       such cells is higher than ``max_cell_fraction`` (default: {max_cell_fraction}), reduce the maximal rank such
       that at most this fraction of cells are selected as deviants. A value of ``None`` is treated as a fraction
       of 1.
    """
    if max_gene_fraction is None:
        max_gene_fraction = 1

    if max_cell_fraction is None:
        max_cell_fraction = 1

    assert min_gene_fold_factor > 0
    # The upper bounds are inclusive: 1 is the value substituted above for ``None``, so a strict ``< 1`` check
    # would reject every call that passes ``None``.
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        # No fold factors were constructed, so nothing votes for anything.
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve too-small metacells based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no particular order. Cells
            with no metacell assignment are given a metacell index of ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with non-zero ``deviants`` (default: {deviants}) as "outliers". This can be the name of a
       per-observation (cell) annotation, or an explicit boolean mask of cells, or ``None`` if there are no deviant
       cells to mark.

    2. Any metacell which has less cells than the ``min_metacell_cells`` (default: {min_metacell_cells}) is
       dissolved.

    3. We are trying to create metacells of size ``target_metacell_size`` (default: {target_metacell_size}).
       Compute the sizes of the resulting metacells by summing the ``cell_sizes`` (default: {cell_sizes}). If it is
       ``None``, each has a size of one. These parameters are typically identical to these passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any metacell whose
       total size is at least ``target_metacell_size * min_robust_size_factor`` is preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is specified, then any remaining
       metacells whose size is at least ``target_metacell_size * min_convincing_size_factor`` are preserved, given
       they contain at least one gene whose fold factor (log2((actual + 1) / (expected + 1))) is at least
       ``min_convincing_gene_fold_factor`` (default: {min_convincing_gene_fold_factor}). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factors. That is, we only preserve these smaller metacells if there
       is at least one gene whose expression is significantly different from the mean of the population.

    6. Any remaining metacell is dissolved into "outlier" cells.
    """
    dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

    # Work on a private copy, since the candidate indices are modified below.
    candidate_of_cells = np.copy(ut.get_o_numpy(adata, candidates, formatter=ut.groups_description))

    deviant_votes_of_cells = ut.maybe_o_numpy(adata, deviants, formatter=ut.mask_description)
    cell_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)

    if deviant_votes_of_cells is not None:
        # Deviant cells are outliers, so they do not belong to any candidate.
        candidate_of_cells[deviant_votes_of_cells > 0] = -1
    candidate_of_cells = ut.compress_indices(candidate_of_cells)
    candidates_count = np.max(candidate_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    min_robust_size = None
    if min_robust_size_factor is not None:
        min_robust_size = target_metacell_size * min_robust_size_factor
    ut.log_calc("min_robust_size", min_robust_size)

    min_convincing_size = None
    if min_convincing_size_factor is not None:
        min_convincing_size = target_metacell_size * min_convincing_size_factor
    ut.log_calc("min_convincing_size", min_convincing_size)

    did_dissolve = False
    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]
        is_kept = _keep_candidate(
            adata,
            candidate_index,
            data=data,
            cell_sizes=cell_sizes,
            fraction_of_genes=fraction_of_genes,
            min_metacell_cells=min_metacell_cells,
            min_robust_size=min_robust_size,
            min_convincing_size=min_convincing_size,
            min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
            abs_folds=abs_folds,
            candidates_count=candidates_count,
            candidate_cell_indices=candidate_cell_indices,
        )
        if is_kept:
            continue

        dissolved_of_cells[candidate_cell_indices] = True
        candidate_of_cells[candidate_cell_indices] = -1
        did_dissolve = True

    # Dissolving leaves gaps in the indices, so they are compressed again in that case.
    metacell_of_cells = ut.compress_indices(candidate_of_cells) if did_dissolve else candidate_of_cells

    if not inplace:
        ut.log_return("dissolved", dissolved_of_cells)
        ut.log_return("metacell", metacell_of_cells, formatter=ut.groups_description)

        obs_frame = ut.to_pandas_frame(index=adata.obs_names)
        obs_frame["dissolved"] = dissolved_of_cells
        obs_frame["metacell"] = metacell_of_cells
        return obs_frame

    ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)
    ut.set_o_data(adata, "metacell", metacell_of_cells, formatter=ut.groups_description)
    return None
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    """
    Extend each rare genes module with additional related genes.

    Modules with fewer than ``min_genes_of_modules`` genes are ignored. For each remaining module:

    1. The "expressed" cells are the cells whose total ``what`` value of the module's genes is positive. If their
       number is above ``max_cells_of_modules`` or below ``min_cells_of_modules``, the module is skipped.

    2. The candidate related genes are the genes allowed by ``allowed_genes_mask``, whose maximal value in the
       expressed cells is at least ``min_gene_maximum``, and whose fraction out of the total of the expressed
       cells is at least ``2 ** min_related_gene_fold_factor`` times their fraction out of the total of the rest
       of the cells. The module's own rare genes are asserted to satisfy this.

    3. A candidate gene is skipped if it is a rare gene of any (large enough) module. Otherwise it is added to the
       module, unless the number of cells whose total of the module's rare genes plus this gene is at least
       ``min_cell_module_total`` is more than ``max_related_gene_increase_factor`` times the number of such cells
       for the module's rare genes alone.

    Returns a list with, for each module that was not skipped, the indices of its rare genes followed by the
    indices of the related genes added to it.
    """
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(
        adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    # Collect the rare genes of all the large enough modules, so a gene owned by one module is never added as a
    # related gene of another.
    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(list(rare_gene_indices_of_module))

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        # This counts every large enough module, including the ones skipped below, so it may differ from the
        # position of the module in the returned list.
        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[rare_gene_indices_of_module]))

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )

            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True)

            # The "expressed" cells are the ones with any value in the module's rare genes.
            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0

            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) + " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) + " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )

            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module, what, sum=True)

            data = ut.get_vo_proper(
                adata_of_all_genes_of_expressed_cells_of_module, what, layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data, per="column")

            # The "background" is everything that is not in the expressed cells.
            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes - total_expressed_cells_umis_of_all_genes)

            expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / sum(
                total_expressed_cells_umis_of_all_genes)

            background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / sum(
                total_background_cells_umis_of_all_genes)

            # The fold factor is in log2 units, hence the comparison against a ratio of 2 ** fold.
            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (expressed_cells_fraction_of_all_genes >=
                   background_cells_fraction_of_all_genes * (2**min_related_gene_fold_factor)))

            related_gene_indices = np.where(mask_of_related_genes)[0]
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            # The baseline: the number of cells that are "strong" using just the module's rare genes.
            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module)
            total_base_genes_of_all_cells = ut.get_o_numpy(
                base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names",
                    sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices]))
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                # The module's own rare genes are already in the result list.
                if gene_index in rare_gene_indices_of_module:
                    continue

                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"belongs to another module")
                    continue

                # NOTE(review): this condition always holds here, because of the ``continue`` above.
                if gene_index not in rare_gene_indices_of_module:
                    related_gene_of_all_cells_adata = ut.slice(
                        adata_of_all_genes_of_all_cells,
                        name=f".{adata_of_all_genes_of_all_cells.var_names[gene_index]}",
                        vars=np.array([gene_index]),
                    )
                    assert related_gene_of_all_cells_adata.n_vars == 1
                    total_related_genes_of_all_cells = ut.get_o_numpy(
                        related_gene_of_all_cells_adata, what, sum=True)
                    total_related_genes_of_all_cells += total_base_genes_of_all_cells
                    mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total
                    count_of_strong_related_cells = np.sum(
                        mask_of_strong_related_cells)
                    # NOTE(review): the logged factor divides by ``count_of_strong_base_cells``, which is zero if
                    # no cell reaches ``min_cell_module_total`` with the rare genes alone - confirm this is benign.
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"strong cells: {count_of_strong_related_cells} "
                        f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                    )
                    # Reject genes that would add too many strong cells relative to the baseline.
                    if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                        continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(
                related_gene_indices_of_module)

    # NOTE(review): the guard below reads as commented out in the (whitespace-collapsed) source, so the summary
    # is logged unconditionally - confirm against the original formatting.
    # if ut.logging_calc():
    ut.log_calc("related genes for modules:")
    for module_index, related_gene_indices_of_module in enumerate(
            related_gene_indices_of_modules):
        ut.log_calc(
            f"- module {module_index} related_gene_names",
            sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices_of_module]),
        )

    return related_gene_indices_of_modules
def find_systematic_genes(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    low_gene_quantile: float = pr.systematic_low_gene_quantile,
    high_gene_quantile: float = pr.systematic_high_gene_quantile,
    to_property_name: str = "systematic_gene",
) -> None:
    """
    Find genes that are systematically higher or lower in the query compared to the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names.

    **Returns**

    Nothing. Sets the following annotations in ``qdata``:

    Variable (Gene) Annotations
        ``systematic_gene`` (or ``to_property_name``)
            A boolean mask indicating whether the gene is systematically higher or lower in the query compared to the
            atlas.

    **Computation Parameters**

    1. Compute the fraction of each gene out of the total UMIs in both the atlas and the query. The
       ``atlas_total_umis`` and/or ``query_total_umis``, if given, are used as the basis instead of the sum of the
       UMIs.

    2. Compute for each gene its ``low_gene_quantile`` (default: {low_gene_quantile}) and its ``high_gene_quantile``
       (default: {high_gene_quantile}) fraction, in both the query and the atlas.

    3. Mark as systematic the genes for which the low quantile value in the query is strictly above the atlas high
       quantile value.

    4. Mark as systematic the genes for which the low quantile value in the atlas is at least the query high quantile
       value.
    """
    assert 0 <= low_gene_quantile <= 1
    assert 0 <= high_gene_quantile <= 1
    assert np.all(adata.var_names == qdata.var_names)

    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")
    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # The quantiles are computed per gene (column), so switch both matrices to column-major layout first.
    query_fractions = ut.to_layout(query_fractions, layout="column_major")
    atlas_fractions = ut.to_layout(atlas_fractions, layout="column_major")

    query_low_gene_values = ut.quantile_per(query_fractions, low_gene_quantile, per="column")
    atlas_low_gene_values = ut.quantile_per(atlas_fractions, low_gene_quantile, per="column")
    query_high_gene_values = ut.quantile_per(query_fractions, high_gene_quantile, per="column")
    atlas_high_gene_values = ut.quantile_per(atlas_fractions, high_gene_quantile, per="column")

    # NOTE(review): the two directions are not symmetric - the query-above-atlas test is strict (``>``) while the
    # atlas-above-query test is not (``>=``). Kept as-is to preserve results; confirm whether this is intentional.
    query_above_atlas = query_low_gene_values > atlas_high_gene_values
    atlas_above_query = atlas_low_gene_values >= query_high_gene_values

    systematic = query_above_atlas | atlas_above_query
    ut.set_v_data(qdata, to_property_name, systematic)
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A compressed (CSR) matrix whose rows are query metacells and columns are atlas metacells, where each entry is the
    weight of the atlas metacell in the projection of the query metacells. The sum of weights in each row (that is, for
    a single query metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image
    of the query metacell onto the atlas.

    This function itself does not set any annotations in ``qdata`` or ``adata``.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor". If
       ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacells onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # Compute the logs on a copy rather than adding and then subtracting the normalization in-place on the fractions:
    # in floating point ``(x + n) - n`` is not exactly ``x``, so the round trip would perturb the fractions used below.
    # The in-place add on the copy keeps the dtype and layout of the fractions, at the same single allocation cost.
    atlas_log_fractions = atlas_fractions.copy(order="K")
    atlas_log_fractions += fold_normalization
    np.log2(atlas_log_fractions, out=atlas_log_fractions)

    query_log_fractions = query_fractions.copy(order="K")
    query_log_fractions += fold_normalization
    np.log2(query_log_fractions, out=query_log_fractions)

    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    # Each result is a pair of (used atlas metacell indices, their weights) for one query metacell, in order.
    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    # Assemble the CSR matrix directly: one row per query metacell, holding the entries of its result.
    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)  # The row pointers start at zero.
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: float = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The resulting per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contain "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold
       factor log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor``
       (default: {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    # One vector of per-gene folds for each metacell, stacked into a dense metacells-by-genes matrix.
    folds_of_metacells = list(ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    folds_by_row = np.array(folds_of_metacells)
    folds_by_column = ut.to_layout(folds_by_row, "column_major")

    # The values used for the threshold tests; this is the very same matrix unless absolute folds were requested.
    tested_folds_by_column = np.abs(folds_by_column) if inner_abs_folds else folds_by_column

    max_fold_per_gene = ut.max_per(tested_folds_by_column, per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)

    # Clear whole genes that are never significant, then clear the individual too-low entries.
    folds_by_column[:, ~significant_genes_mask] = 0
    folds_by_column[tested_folds_by_column < min_entry_inner_fold_factor] = 0

    folds_by_row = ut.to_layout(folds_by_column, layout="row_major")
    ut.set_vo_data(gdata, "inner_fold", sparse.csr_matrix(folds_by_row))
def group_obs_data(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    groups: Union[str, ut.Vector],
    name: Optional[str] = None,
) -> Optional[AnnData]:
    """
    Compute new data which has the ``what`` (default: {what}) sum of the observations (cells) for each group.

    For example, having computed a metacell index for each cell, compute the per-metacell data for further analysis.

    If ``groups`` is a string, it is expected to be the name of a per-observation vector annotation. Otherwise it
    should be a vector. The group indices should be integers, where negative values indicate "no group" and
    non-negative values indicate the index of the group to which each observation (cell) belongs to.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    An annotated data where each observation is the sum of the group of original observations (cells). Observations
    with a negative group index are discarded. If all observations are discarded, return ``None``.

    The new data will contain only:

    * An ``X`` member holding the summed-per-group data.

    * A new ``grouped`` per-observation data which counts, for each group, the number of grouped observations summed
      into it.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``, it will be appended
    to the current name (if any). Otherwise, ``name`` is the new name.
    """
    group_of_cells = ut.get_o_numpy(adata, groups, formatter=ut.groups_description)
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")

    grouped = ut.sum_groups(cells_data, group_of_cells, per="row")
    if grouped is None:
        # Nothing was summed (no observation belongs to any group).
        return None

    summed_data, cell_counts = grouped

    gdata = AnnData(summed_data)
    gdata.var_names = adata.var_names

    # Start from the name of the source data, then apply the requested (possibly relative) name on top of it.
    ut.set_name(gdata, ut.get_name(adata))
    ut.set_name(gdata, name)

    ut.set_o_data(gdata, "grouped", cell_counts, formatter=ut.sizes_description)
    return gdata
def collect_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    name: str = "metacells",
    top_level: bool = True,
) -> AnnData:
    """
    Collect computed metacells ``what`` (default: {what}) data.

    **Input**

    Annotated (presumably "clean") ``adata``, where the observations are cells and the variables are genes, and where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Each cell is expected to carry its ``metacell`` group index.

    **Returns**

    Annotated metacell data containing for each observation the sum of the data of the cells of each metacell, which
    contains the following annotations (each is copied only if it exists in ``adata``):

    Variable (Gene) Annotations
        ``excluded_gene``
            A mask of the genes which were excluded by name.

        ``clean_gene``
            A boolean mask of the clean genes.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based on their name. This
            is ``False`` for non-"clean" genes.

        If directly computing metacells:

        ``feature_gene``
            A boolean mask of the "feature" genes. This is ``False`` for non-"clean" genes.

        If using divide-and-conquer:

        ``pre_feature_gene``, ``feature_gene``
            The number of times the gene was used as a feature when computing the preliminary and final metacells. This
            is zero for non-"clean" genes.

    Observations (Cell) Annotations
        ``grouped``
            The number of ("clean") cells grouped into each metacell.

        ``pile``
            The index of the pile used to compute the metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for non-"clean" cells.

    **Computation Parameters**

    1. Compute the cell's scale factors by invoking :py:func:`compute_effective_cell_sizes` using the
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default: {max_cell_size_factor}) and
       ``cell_sizes`` (default: {cell_sizes}).

    2. Scale the cell's data using these factors, if needed.

    3. Invoke :py:func:`metacells.tools.group.group_obs_data` to sum the cells into metacells.

    4. Pass all relevant per-gene and per-cell annotations to the result.
    """
    _, _, cell_scale_factors = compute_effective_cell_sizes(
        adata,
        max_cell_size=max_cell_size,
        max_cell_size_factor=max_cell_size_factor,
        cell_sizes=cell_sizes,
    )

    if cell_scale_factors is not None:
        # Sum the scaled matrix instead of the raw one.
        unscaled_data = ut.get_vo_proper(adata, what, layout="row_major")
        what = ut.scale_by(unscaled_data, cell_scale_factors, by="row")

    mdata = tl.group_obs_data(adata, what, groups="metacell", name=name)
    assert mdata is not None

    if top_level:
        ut.top_level(mdata)

    # The genes are shared between the cells and the metacells, so per-gene annotations are copied verbatim.
    per_gene_annotations = ("excluded_gene", "clean_gene", "forbidden_gene", "pre_feature_gene", "feature_gene")
    for gene_annotation in per_gene_annotations:
        if ut.has_data(adata, gene_annotation):
            value_per_gene = ut.get_v_numpy(adata, gene_annotation, formatter=ut.mask_description)
            ut.set_v_data(mdata, gene_annotation, value_per_gene, formatter=ut.mask_description)

    # Per-cell annotations must be collapsed to a single value per metacell.
    for cell_annotation in ("pile", "candidate"):
        if ut.has_data(adata, cell_annotation):
            tl.group_obs_annotation(
                adata,
                mdata,
                groups="metacell",
                formatter=ut.groups_description,
                name=cell_annotation,
                method="unique",
            )

    return mdata
def _compute_elements_similarity(  # pylint: disable=too-many-branches
    adata: AnnData,
    elements: str,
    per: str,
    what: Union[str, ut.Matrix],
    *,
    method: str,
    reproducible: bool,
    logistics_location: float,
    logistics_slope: float,
    top: Optional[int],
    bottom: Optional[int],
    inplace: bool,
) -> Optional[ut.PandasFrame]:
    """
    Compute the similarity between each pair of ``elements`` (``obs`` or ``var``) of the ``what`` data.

    The data is fetched in ``per``-major layout. The ``method`` is one of ``pearson``, ``repeated_pearson``,
    ``logistics`` or ``logistics_pearson``: the ``logistics`` methods use one minus the result of ``ut.logistics``,
    the others use ``ut.corrcoef``, and the ``..._pearson`` methods correlate the result once more.

    If ``top`` and/or ``bottom`` are specified, only the result of ``ut.top_per`` for the ``top`` highest and/or the
    ``bottom`` lowest similarities of each row is kept.

    If ``inplace``, store the result in ``adata`` as ``obs_similarity`` or ``var_similarity`` and return ``None``.
    Otherwise, return it as a pandas frame indexed (in both rows and columns) by the names of the elements.
    """
    assert elements in ("obs", "var")
    assert method in ("pearson", "repeated_pearson", "logistics", "logistics_pearson")

    dense = ut.to_numpy_matrix(ut.get_vo_proper(adata, what, layout=f"{per}_major"))

    similarity: ut.ProperMatrix
    if method.startswith("logistics"):
        similarity = ut.logistics(dense, location=logistics_location, slope=logistics_slope, per=per)
        # Convert to one-minus-the-value, in place.
        similarity *= -1
        similarity += 1
    else:
        similarity = ut.corrcoef(dense, per=per, reproducible=reproducible)

    if method.endswith("_pearson"):
        similarity = ut.corrcoef(similarity, per=None, reproducible=reproducible)

    keep_top = top is not None
    keep_bottom = bottom is not None

    if keep_top:
        strongest = ut.top_per(similarity, top, per="row")

    if keep_bottom:
        # The lowest values are the highest values of the negated matrix; negate the picked ones back afterwards.
        similarity *= -1
        weakest = ut.top_per(similarity, bottom, per="row")
        weakest *= -1  # type: ignore

    if keep_top and keep_bottom:
        similarity = strongest + weakest  # type: ignore
    elif keep_top:
        similarity = strongest
    elif keep_bottom:
        similarity = weakest

    if inplace:
        store = ut.set_oo_data if elements == "obs" else ut.set_vv_data
        store(adata, elements + "_similarity", similarity)
        return None

    names = adata.obs_names if elements == "obs" else adata.var_names
    return ut.to_pandas_frame(similarity, index=names, columns=names)
def compute_knn_by_features(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_top_feature_genes: int = pr.max_top_feature_genes,
    similarity_value_normalization: float = pr.umap_similarity_value_normalization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    k: int,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    reproducible: bool = pr.reproducible,
) -> ut.PandasFrame:
    """
    Compute KNN graph between metacells based on feature genes.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but reproducible algorithm
    will be used to compute pearson correlations.

    **Input**

    Annotated ``adata`` where each observation is a metacell and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Sets the following in ``adata`` (by invoking :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph`):

    Observations-Pair (Metacells) Annotations
        ``obs_outgoing_weights``
            A sparse square matrix where each non-zero entry is the weight of an edge between a pair of cells or
            genes, where the sum of the weights of the outgoing edges for each element is 1 (there is always at least
            one such edge).

    Also return a pandas data frame of the similarities between the observations (metacells).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.tools.high.find_top_feature_genes` using ``max_top_feature_genes`` (default:
       {max_top_feature_genes}) to pick the feature genes to use to compute similarities between the metacells.

    2. Compute the fractions of each gene in each cell, and add the ``similarity_value_normalization`` (default:
       {similarity_value_normalization}) to it.

    3. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using ``similarity_method`` (default:
       {similarity_method}), ``logistics_location`` (default: {logistics_location}) and ``logistics_slope`` (default:
       {logistics_slope}).

    5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the similarities, ``k`` (no
       default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}), ``incoming_degree_factor`` (default:
       {incoming_degree_factor}), ``outgoing_degree_factor`` (default: {outgoing_degree_factor}) to compute a
       "skeleton" graph to overlay on top of the UMAP graph.
    """
    tl.find_top_feature_genes(adata, max_genes=max_top_feature_genes)

    # The fractions are out of the total of all the genes; only then restrict to the top feature genes.
    all_data = ut.get_vo_proper(adata, what, layout="row_major")
    all_fractions = ut.fraction_by(all_data, by="row")

    top_feature_genes_mask = ut.get_v_numpy(adata, "top_feature_gene")

    top_feature_genes_fractions = all_fractions[:, top_feature_genes_mask]
    top_feature_genes_fractions = ut.to_layout(top_feature_genes_fractions, layout="row_major")
    top_feature_genes_fractions = ut.to_numpy_matrix(top_feature_genes_fractions)

    top_feature_genes_fractions += similarity_value_normalization

    if similarity_log_data:
        top_feature_genes_fractions = ut.log_data(top_feature_genes_fractions, base=2)

    # The similarity is computed on a slice holding just the top feature genes, matching the fractions matrix.
    tdata = ut.slice(adata, vars=top_feature_genes_mask)
    similarities = tl.compute_obs_obs_similarity(
        tdata,
        top_feature_genes_fractions,
        method=similarity_method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        inplace=False,
    )
    assert similarities is not None

    # The graph is stored in the full data (not in the slice).
    tl.compute_obs_obs_knn_graph(
        adata,
        similarities,
        k=k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
    )

    return similarities
def compute_deviant_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: Union[str, ut.Vector] = "similar",
    significant_gene_fold_factor: float = pr.significant_gene_fold_factor,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells) or, if an outlier, to the most similar groups,
    compute for each observation and gene the fold factor relative to its group for the purpose of detecting deviant
    cells.

    Ideally, all grouped cells would have no genes with high enough fold factors to be considered deviants, and all
    outlier cells would. In practice grouped cells might have a (few) such genes due to the restriction on the
    fraction of deviants.

    It is important not to read too much into the results for a single cell, but looking at which genes appear for
    cell populations (e.g., cells with specific metadata such as batch identification) might be instructive.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``deviant_folds``
            The fold factor between the cell's UMIs and the expected number of UMIs for the purpose of computing
            deviant cells, as a sparse matrix.

    **Computation Parameters**

    1. For each cell, compute the expected UMIs for each gene given the fraction of the gene in the metacells
       associated with the cell (the one it belongs to, or the most similar one for outliers). If the fold factor is
       less than ``significant_gene_fold_factor`` (default: {significant_gene_fold_factor}), set it to zero so the
       result will be sparse.
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")
    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    similar_of_cells = ut.get_o_numpy(adata, similar, formatter=ut.groups_description)

    @ut.timed_call("compute_cell_deviant_certificates")
    def _compute_cell_deviant_certificates(cell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _compute_cell_certificates(
            cell_index=cell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            similar_of_cells=similar_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
            significant_gene_fold_factor=significant_gene_fold_factor,
        )

    # Each result is a pair of (gene indices, fold factors) for one cell, in cell order.
    results = list(ut.parallel_map(_compute_cell_deviant_certificates, adata.n_obs))

    # The row (cell) index of each entry: repeat each cell index once per gene reported for that cell.
    entries_per_cell = np.array([len(result[0]) for result in results], dtype="int64")
    cell_indices = np.repeat(np.arange(len(results), dtype="int32"), entries_per_cell)
    gene_indices = np.concatenate([result[0] for result in results])
    fold_factors = np.concatenate([result[1] for result in results])

    deviant_folds = sparse.csr_matrix((fold_factors, (cell_indices, gene_indices)), shape=adata.shape)
    ut.set_vo_data(adata, "deviant_folds", deviant_folds)
def compute_outliers_matches(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: str = "similar",
    value_normalization: float = pr.outliers_value_normalization,
    reproducible: bool,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells), compute for each outlier the "most similar"
    group.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations
        ``similar`` (default: {similar})
            For each outlier observation (cell), the index of the "most similar" group. This is ``-1`` for the
            observations (cells) that belong to a group.

    **Computation Parameters**

    1. Compute the log2 of the fraction of each gene in each of the outlier cells and the group metacells using
       the ``value_normalization`` (default: {value_normalization}).

    2. Cross-correlate each of the outlier cells with each of the group metacells, in a ``reproducible`` manner, and
       pick for each outlier cell the group metacell with the highest correlation.
    """
    group_of_cells = ut.get_o_numpy(adata, group)
    outliers_mask = group_of_cells < 0

    cells_similar_group_indices = np.full(adata.n_obs, -1, dtype="int32")

    if not np.any(outliers_mask):
        # There is nothing to match; skip slicing to an empty data set and correlating empty matrices, which would
        # at best produce this same all-minus-one result.
        ut.set_o_data(adata, similar, cells_similar_group_indices)
        return

    odata = ut.slice(adata, obs=outliers_mask)

    outliers_data = ut.get_vo_proper(odata, what, layout="row_major")
    groups_data = ut.get_vo_proper(gdata, what, layout="row_major")

    outliers_fractions = ut.fraction_by(outliers_data, by="row")
    groups_fractions = ut.fraction_by(groups_data, by="row")

    outliers_fractions = ut.to_numpy_matrix(outliers_fractions)
    groups_fractions = ut.to_numpy_matrix(groups_fractions)

    outliers_fractions += value_normalization
    groups_fractions += value_normalization

    # The logs overwrite the (no longer needed) normalized fractions.
    outliers_log_fractions = np.log2(outliers_fractions, out=outliers_fractions)
    groups_log_fractions = np.log2(groups_fractions, out=groups_fractions)

    outliers_groups_correlation = ut.cross_corrcoef_rows(
        outliers_log_fractions, groups_log_fractions, reproducible=reproducible
    )
    outliers_similar_group_indices = np.argmax(outliers_groups_correlation, axis=1)
    assert len(outliers_similar_group_indices) == odata.n_obs

    cells_similar_group_indices[outliers_mask] = outliers_similar_group_indices
    ut.set_o_data(adata, similar, cells_similar_group_indices)
def compute_subset_distinct_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    prefix: Optional[str] = None,
    scale: Optional[Union[bool, str, ut.NumpyVector]],
    subset: Union[str, ut.NumpyVector],
    normalization: float,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Given a subset of the observations (cells), compute for each gene how distinct its ``what`` (default: {what})
    value is in the subset compared to the overall population.

    This is the area-under-curve of the receiver operating characteristic (AUROC) for the gene, that is, the
    probability that a randomly selected observation (cell) in the subset will have a higher value than a randomly
    selected observation (cell) outside the subset.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``<prefix>_fold``
            Store the ratio of the expression of the gene in the subset as opposed to the rest of the population.
        ``<prefix>_auroc``
            Store the distinctiveness of the gene in the subset as opposed to the rest of the population.

    If ``prefix`` (default: {prefix}), is specified, this is written to the data and ``None`` is returned. Otherwise
    this is returned as two pandas series (indexed by the gene names), the folds first and the AUROC second.

    **Computation Parameters**

    1. Use the ``subset`` to assign a boolean label to each observation (cell). The ``subset`` can be a vector of
       integer observation indices, or a boolean mask, or the string name of a per-observation annotation containing
       either of these.

    2. If ``scale`` is ``False``, use the data as-is. If it is ``True``, divide the data by the sum of each
       observation (cell). If it is a string, it should be the name of a per-observation annotation to use.
       Otherwise, it should be a vector of the scale factor for each observation (cell).

    3. Compute the fold ratios using the ``normalization`` (no default!) and the AUROC for each gene, for the scaled
       data based on this mask.
    """
    if isinstance(subset, str):
        subset = ut.get_o_numpy(adata, subset)

    if subset.dtype != "bool":
        # Convert the indices of the selected observations into a mask over all the observations.
        selected_mask: ut.NumpyVector = np.full(adata.n_obs, False)
        selected_mask[subset] = True
        subset = selected_mask

    scale_of_cells: Optional[ut.NumpyVector]
    if isinstance(scale, bool):
        scale_of_cells = ut.get_o_numpy(adata, what, sum=True) if scale else None
    else:
        scale_of_cells = ut.maybe_o_numpy(adata, scale, formatter=ut.sizes_description)

    # The helper works on the rows of its matrix, so hand it the transposed (genes by cells) data.
    genes_by_cells = ut.get_vo_proper(adata, what, layout="column_major").transpose()
    fold_of_genes, auroc_of_genes = ut.matrix_rows_folds_and_aurocs(
        genes_by_cells,
        columns_subset=subset,
        columns_scale=scale_of_cells,
        normalization=normalization,
    )

    if prefix is None:
        return (
            ut.to_pandas_series(fold_of_genes, index=adata.var_names),
            ut.to_pandas_series(auroc_of_genes, index=adata.var_names),
        )

    ut.set_v_data(adata, f"{prefix}_auroc", auroc_of_genes)
    ut.set_v_data(adata, f"{prefix}_fold", fold_of_genes)
    return None
def compute_distinct_folds(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    normalization: float = 0,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Compute, for each observation (cell) and each variable (gene), how different the ``what``
    (default: {what}) value is from the overall population.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Per-Observation-Per-Variable (Cell-Gene) Annotations:
        ``distinct_fold``
            For each gene in each cell, the log (base 2) of the ratio between the fraction of the gene in
            the cell and the fraction of the gene in the overall population (sum of cells).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas frame (rows are the observations, columns are the genes).

    **Computation Parameters**

    1. Compute, for each gene, the fraction of the gene's values out of the total sum of the values (that
       is, the mean fraction of the gene's expression in the population).

    2. Compute, for each cell, for each gene, the fraction of the gene's value out of the sum of the values
       in the cell (that is, the fraction of the gene's expression in the cell).

    3. Divide the two to get the distinct ratio, first adding the ``normalization``
       (default: {normalization}) to both.

    4. Compute the log (base 2) of the result and use it as the fold factor.
    """
    # Step 1: the (normalized) fraction of each gene in the whole population.
    population_fractions = ut.fraction_per(ut.get_vo_proper(adata, what, layout="column_major"), per="column")
    population_fractions += normalization

    # Step 2: the (normalized) fraction of each gene in each cell. Totals of zero are replaced by one so
    # that the division is well-defined for empty cells.
    cell_totals = ut.get_o_numpy(adata, what, sum=True)
    cell_totals[cell_totals == 0] = 1
    cells_matrix = ut.to_numpy_matrix(ut.get_vo_proper(adata, what, layout="row_major"))
    folds = cells_matrix / cell_totals[:, np.newaxis]
    folds += normalization

    # Genes without any population fraction get -1 on both sides, so their ratio is exactly 1 (and their
    # fold factor is exactly 0) instead of a division by zero.
    absent_genes = population_fractions <= 0
    population_fractions[absent_genes] = -1
    folds[:, absent_genes] = -1

    # Steps 3 and 4, reusing the same buffer for the ratio and for its log.
    folds /= population_fractions
    assert np.min(folds) > 0
    folds = np.log2(folds, out=folds)

    if not inplace:
        return ut.to_pandas_frame(folds, index=adata.obs_names, columns=adata.var_names)

    ut.set_vo_data(adata, "distinct_fold", folds)
    return None
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    The total number of UMIs varies from cell to cell, due to both technical effects and natural variance.
    We typically want to analyze only cells that contain enough UMIs for a meaningful analysis, and we
    sometimes also want to exclude cells that contain "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless it is
       ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless it is
       ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must not be
       ``None`` and should contain just the excluded genes data for each cell. Exclude all cells whose sum
       of the excluded data divided by the total data is more than the specified threshold.
    """
    # The threshold and the excluded data must be given together (or not at all).
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    cell_totals = ut.get_o_numpy(adata, what, sum=True)
    proper_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        proper_mask = proper_mask & (cell_totals >= min_cell_total)

    if max_cell_total is not None:
        proper_mask = proper_mask & (cell_totals <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_totals = ut.sum_per(ut.get_vo_proper(excluded_adata, layout="row_major"), per="row")

        # Avoid dividing by zero for empty cells; patch a private copy rather than the fetched vector.
        if np.min(cell_totals) == 0:
            cell_totals = np.copy(cell_totals)
            cell_totals[cell_totals == 0] = 1

        proper_mask = proper_mask & (excluded_totals / cell_totals <= max_excluded_genes_fraction)

    if not inplace:
        ut.log_return("properly_sampled_cell", proper_mask)
        return ut.to_pandas_series(proper_mask, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", proper_mask)
    return None
def _first_index_of_names(names: Any) -> Dict[str, int]:
    """
    Map each name to the position of its first occurrence in ``names``.

    This gives the same answer as ``list(names).index(name)`` for every contained name, using a single
    pass instead of one linear scan per lookup.
    """
    index_of_name: Dict[str, int] = {}
    for index, name in enumerate(names):
        index_of_name.setdefault(name, index)
    return index_of_name


def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered
    out many genes.

    This renormalizes the gene fractions in the query to fit the atlas, in case the query has aggressively
    filtered out a significant amount of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are
    genes. The ``what`` data is used for computing the total UMIs; the extended copy is built from the
    ``X`` data of the query.

    **Returns**

    ``None`` if no normalization is needed (or possible). Otherwise, a copy of the query metacells data
    with an additional variable (gene) called ``ATLASNORM``, such that the total number of UMIs of each
    query metacell is as expected given the total number of UMIs of the genes common to the query and the
    atlas.

    This is skipped if the query and the atlas have exactly the same list of genes, if they have no genes
    in common, or if the query already contains enough UMIs in genes missing from the atlas so that the
    total number of UMIs of the query metacells is already at least the expected one based on the common
    genes.

    Raises ``RuntimeError`` if a per-variable annotation (whose name does not contain ``|``), a layer or a
    per-variable-per-variable annotation of the query has no matching default value.

    **Computation Parameters**

    1. Compute how many UMIs should be added to each query metacell so that its (total UMIs / total common
       gene UMIs) would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero
       (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable
       (gene) annotation, add the value specified in ``var_annotations``, whose keys must cover the set of
       per-variable annotations in the query data. For each per-observation-per-variable layer, add the
       value specified in ``layers``, whose keys must cover the existing layers. For each
       per-variable-per-variable annotation, add the value specified in ``varp_annotations``.
    """
    # Validate up-front that every annotation we will need to extend has a value for the new gene.
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations:
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers:
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations:
            raise RuntimeError(f"missing default value for variable-variable {name}")

    if list(qdata.var_names) == list(adata.var_names):
        return None

    # Locate the common genes in each data set using dictionaries, instead of a linear ``list.index``
    # scan per gene (which is quadratic in the number of genes).
    query_index_of_gene = _first_index_of_names(qdata.var_names)
    atlas_index_of_gene = _first_index_of_names(adata.var_names)
    common_genes_list = sorted(set(query_index_of_gene) & set(atlas_index_of_gene))

    # Without any common gene the common UMIs are zero and the ratios below are undefined, so no
    # normalization is possible.
    if not common_genes_list:
        return None

    query_gene_indices = np.array([query_index_of_gene[gene] for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_index_of_gene[gene] for gene in common_genes_list])

    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")
    assert list(common_qdata.var_names) == list(common_adata.var_names)

    # How many UMIs the atlas has outside the common genes, relative to the common genes.
    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    # The same for the query.
    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    # The query already has at least as many "extra" UMIs as the atlas: nothing to add.
    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    # Build the extended main matrix: the original genes followed by the ATLASNORM column. The result is
    # compressed again if (and only if) the original was compressed.
    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)
    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)
    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    # Data that does not depend on the genes is carried over as-is.
    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    # Per-gene annotations get one more entry for the new gene (names containing ``|`` are skipped).
    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    # Layers get one more (constant) column for the new gene.
    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)
        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        new_column = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, new_column[:, np.newaxis]], axis=1)
        if compressed is not None:
            added = sp.csr_matrix(added)
        ut.set_vo_data(ndata, name, added)

    # Gene-gene matrices get one more (constant) column and then one more (constant) row.
    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)
        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        new_column = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, new_column[:, np.newaxis]], axis=1)

        # The new row must be shaped (1, n_vars + 1) to be stacked below the (n_vars, n_vars + 1) matrix;
        # a column-shaped vector here makes ``np.concatenate`` fail on mismatching dimensions.
        new_row = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, new_row[np.newaxis, :]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)
        ut.set_vv_data(ndata, name, added)

    return ndata
def compute_inner_normalized_variance(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    compatible_size: Optional[str] = None,
    downsample_min_samples: int = pr.downsample_min_samples,
    downsample_min_cell_quantile: float = pr.downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.downsample_max_cell_quantile,
    min_gene_total: int = pr.quality_min_gene_total,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    random_seed: int = pr.random_seed,
) -> None:
    """
    Compute the inner normalized variance (variance / mean) for each gene in each group.

    This is also known as the "index of dispersion" and can serve as a quality measure for the groups. An
    ideal group would contain only cells with "the same" biological state, so that all the remaining inner
    variance would be due to technical sampling noise.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and to use the same genes as
    ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``inner_variance``
            For each gene and group, the variance of the gene in the group.

        ``inner_normalized_variance``
            For each gene and group, the normalized variance (variance over mean) of the gene in the group.

    **Computation Parameters**

    For each group (metacell):

    1. If ``compatible_size`` (default: {compatible_size}) is specified, it should be an integer
       per-observation annotation of the groups, whose value is at most the number of grouped cells in the
       group. Pick a random subset of the cells of this size. If ``compatible_size`` is ``None``, use all
       the cells of the group.

    2. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the surviving cells to
       the same total number of UMIs, using the ``downsample_min_samples``
       (default: {downsample_min_samples}), ``downsample_min_cell_quantile``
       (default: {downsample_min_cell_quantile}), ``downsample_max_cell_quantile``
       (default: {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    3. Compute the normalized variance of each gene based on the downsampled data. Set the result to
       ``nan`` for genes with less than ``min_gene_total`` (default: {min_gene_total}).
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")

    sizes_of_groups: Optional[ut.NumpyVector] = None
    if compatible_size is not None:
        sizes_of_groups = ut.get_o_numpy(gdata, compatible_size, formatter=ut.sizes_description)

    # The group indices are expected to run from zero to the number of groups in ``gdata`` (minus one).
    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    groups_count = np.max(group_of_cells) + 1
    assert groups_count > 0
    assert gdata.n_obs == groups_count

    # The output matrices, filled by ``_collect_group_data`` one group at a time (NumPy stores the
    # ``None`` fill value of a float array as NaN).
    variances = np.full(gdata.shape, None, dtype="float32")
    normalized_variances = np.full(gdata.shape, None, dtype="float32")

    def describe_progress(index: int) -> str:
        return ut.progress_description(groups_count, index, "group")

    for group_index in range(groups_count):
        with ut.log_step("- group", group_index, formatter=describe_progress):
            size_of_group = None if sizes_of_groups is None else sizes_of_groups[group_index]
            _collect_group_data(
                group_index,
                group_of_cells=group_of_cells,
                cells_data=cells_data,
                compatible_size=size_of_group,
                downsample_min_samples=downsample_min_samples,
                downsample_min_cell_quantile=downsample_min_cell_quantile,
                downsample_max_cell_quantile=downsample_max_cell_quantile,
                min_gene_total=min_gene_total,
                random_seed=random_seed,
                variance_per_gene_per_group=variances,
                normalized_variance_per_gene_per_group=normalized_variances,
            )

    ut.set_vo_data(gdata, "inner_variance", variances)
    ut.set_vo_data(gdata, "inner_normalized_variance", normalized_variances)
def compute_groups_self_consistency(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    group: str = "metacell",
    genes_mask: Optional[ut.NumpyVector] = None,
    self_similarity_log_data: bool = pr.self_similarity_log_data,
    self_similarity_value_normalization: float = pr.self_similarity_value_normalization,
    self_similarity_method: str = pr.self_similarity_method,
    reproducible: bool = pr.reproducible,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
) -> ut.NumpyVector:
    """
    Compute the self consistency (similarity between two halves) of some groups.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation annotation named ``group``
    (default: {group}) which identifies the group (metacells) each observation (cell) belongs to, and
    ``half_<group>`` which identifies the half-group each observation belongs to (e.g. as computed by
    :py:func:`split_groups`). Specifically, the indices of the halves of group index ``i`` are ``i`` and
    ``i + groups_count``.

    **Returns**

    A Numpy vector holding, for each group, the similarity between its two halves.

    **Computation Parameters**

    1. For each group, compute the sum of values in each half and normalize it to fractions (sum of 1).

    2. If ``genes_mask`` is specified, select only the genes specified in it. Note the sum of the fractions
       of the genes of each group in the result will be less than or equal to 1.

    3. If ``self_similarity_log_data`` (default: {self_similarity_log_data}), log2 the values using
       ``self_similarity_value_normalization`` (default: {self_similarity_value_normalization}).

    4. Compute the ``self_similarity_method`` (default: {self_similarity_method}) between the two halves.
       If this is the ``logistics`` similarity, then this will use ``logistics_location``
       (default: {logistics_location}) and ``logistics_slope`` (default: {logistics_slope}). If this is
       ``pearson``, and if ``reproducible`` (default: {reproducible}) is ``True``, a slower (still
       parallel) but reproducible algorithm will be used to compute Pearson correlations.
    """
    hdata = tl.group_obs_data(adata, what, groups=f"half_{group}", name=".halves")
    assert hdata is not None

    sum_of_halves = ut.get_o_numpy(hdata, f"{what}|sum")
    halves_values = ut.to_numpy_matrix(ut.get_vo_proper(hdata, what, layout="row_major"))

    # Step 1: divide each half by its sum to get fractions. This must be ``fraction_by`` (division by the
    # sums) rather than ``scale_by`` (which multiplies each row by the given factor, and would therefore
    # square the magnitude of each half instead of normalizing it).
    # NOTE(review): confirm how ``ut.fraction_by`` treats halves whose sum is zero.
    halves_data = ut.to_numpy_matrix(ut.fraction_by(halves_values, by="row", sums=sum_of_halves))

    # Step 3 is applied before the genes selection of step 2; both are per-entry operations so the order
    # does not affect the result.
    if self_similarity_value_normalization > 0:
        halves_data += self_similarity_value_normalization

    if self_similarity_log_data:
        halves_data = ut.log_data(halves_data, base=2)

    if genes_mask is not None:
        halves_data = halves_data[:, genes_mask]

    # The halves of group ``i`` are the rows ``i`` and ``i + groups_count``.
    assert hdata.n_obs % 2 == 0
    groups_count = hdata.n_obs // 2
    low_half_indices = np.arange(groups_count)
    high_half_indices = low_half_indices + groups_count

    low_half_data = halves_data[low_half_indices, :]
    high_half_data = halves_data[high_half_indices, :]

    assert self_similarity_method in ("logistics", "pearson")
    if self_similarity_method == "logistics":
        similarity = ut.pairs_logistics_rows(
            low_half_data, high_half_data, location=logistics_location, slope=logistics_slope
        )
        # Report ``1 - logistics`` as the similarity.
        similarity *= -1
        similarity += 1
    else:
        similarity = ut.pairs_corrcoef_rows(low_half_data, high_half_data, reproducible=reproducible)

    return similarity
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    This computes, for each metacell of the query, the fold factors between the actual query UMIs and the
    UMIs of the projection of the metacell onto the atlas (see
    :py:func:`metacells.tools.project.project_query_onto_atlas`). The resulting per-metacell-per-gene
    matrix is then made sparse by discarding too-low values (setting them to zero). Ideally, this matrix
    should be "very" sparse. If it contains "too many" non-zero values, more genes need to be ignored by
    the projection, or somehow corrected for batch effects prior to computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    In addition, the ``projected`` UMIs of each query metacell onto the atlas, and the ``total_umis`` used
    as the row sums when converting both matrices to fractions.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and its
            projection (unless the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor log2((actual fraction +
       ``fold_normalization``) / (expected fraction + ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default ``fold_normalization`` is
       {fold_normalization}).

    2. Set the fold factor to zero for every case where the total UMIs in the query metacell and the
       projected image is not at least ``min_significant_gene_value``
       (default: {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_fold_factor``
       (default: {min_gene_fold_factor}), then set all the gene's fold factors to zero (too low to be of
       interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_fold_factor``
       (default: {min_entry_fold_factor}), set the fold factor to zero (too low to be of interest). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    actual_umis = ut.get_vo_proper(adata, what, layout="row_major")
    expected_umis = ut.get_vo_proper(adata, projected, layout="row_major")

    # Step 1: the log2 of the ratio between the normalized actual and expected fractions.
    actual_fractions = ut.fraction_by(actual_umis, by="row", sums=total_umis)
    expected_fractions = ut.fraction_by(expected_umis, by="row", sums=total_umis)
    actual_fractions += fold_normalization  # type: ignore
    expected_fractions += fold_normalization  # type: ignore

    folds = actual_fractions / expected_fractions  # type: ignore
    folds = np.log2(folds, out=folds)

    # Step 2: zero the entries that are backed by too few UMIs (actual and projected combined).
    combined_umis = ut.to_numpy_matrix(actual_umis + expected_umis)  # type: ignore
    too_few_umis_mask = combined_umis < min_significant_gene_value
    ut.log_calc("insignificant entries", too_few_umis_mask)
    folds[too_few_umis_mask] = 0.0

    # Steps 3 and 4: keep only the fold factors that are high enough to be of interest.
    ut.set_vo_data(
        adata,
        "projected_fold",
        significant_folds_matrix(folds, min_gene_fold_factor, min_entry_fold_factor, abs_folds),
    )
def downsample_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    downsample_min_cell_quantile: float = pr.downsample_min_cell_quantile,
    downsample_min_samples: float = pr.downsample_min_samples,
    downsample_max_cell_quantile: float = pr.downsample_max_cell_quantile,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Downsample the values of ``what`` (default: {what}) data.

    Downsampling is an effective way to get the same number of samples in multiple cells (that is, the same
    number of total UMIs in multiple cells), and serves as an alternative to normalization (e.g., working
    with UMI fractions instead of raw UMI counts).

    Downsampling is especially important when computing correlations between cells. When there is high
    variance between the total UMI count in different cells, then normalization will return higher
    correlation values between cells with a higher total UMI count, which will result in an inflated
    estimation of their similarity to other cells. Downsampling avoids this effect.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix. The data is asserted to be of type ``float32``.

    **Returns**

    Variable-Observation (Gene-Cell) Annotations
        ``downsampled``
            The downsampled data where the total number of samples in each cell is at most ``samples``.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas data frame (indexed by the cell and gene names).

    **Computation Parameters**

    1. Compute the total samples in each cell.

    2. Decide on the value to downsample to. We would like all cells to end up with at least some
       reasonable number of samples (total UMIs) ``downsample_min_samples``
       (default: {downsample_min_samples}). We'd also like all (most) cells to end up with the highest
       reasonable downsampled total number of samples, so if possible we increase the number of samples,
       as long as at most ``downsample_min_cell_quantile`` (default: {downsample_min_cell_quantile}) cells
       will have a lower number of samples. We'd also like all (most) cells to end up with the same
       downsampled total number of samples, so if we have to we decrease the number of samples to ensure
       at most ``downsample_max_cell_quantile`` (default: {downsample_max_cell_quantile}) cells will have
       a lower number of samples.

    3. Downsample each cell so that it has at most the selected number of samples. Use the ``random_seed``
       to allow making this replicable.
    """
    # Step 1.
    total_per_cell = ut.get_o_numpy(adata, what, sum=True)

    # Step 2: raise the target up to the low quantile (but no lower than the minimum), then cap it by the
    # high quantile.
    low_quantile_total = np.quantile(total_per_cell, downsample_min_cell_quantile)
    high_quantile_total = np.quantile(total_per_cell, downsample_max_cell_quantile)
    wanted_samples = max(downsample_min_samples, low_quantile_total)
    samples = int(round(min(wanted_samples, high_quantile_total)))
    ut.log_calc("samples", samples)

    # Step 3.
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    assert ut.shaped_dtype(cells_data) == "float32"
    downsampled = ut.downsample_matrix(cells_data, per="row", samples=samples, random_seed=random_seed)

    if not inplace:
        return ut.to_pandas_frame(downsampled, index=adata.obs_names, columns=adata.var_names)

    ut.set_vo_data(adata, "downsampled", downsampled)
    return None
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at the whole data at
    once, the amount of CPU and memory needed becomes unreasonable when the data size grows. Above O(10,000) you are
    much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation matrix, and only then
        extracts the sparse graph out of it. We actually need two copies where each requires 4 bytes per entry, so for
        O(100,000) cells, we have storage of O(100,000,000,000). In addition, the implementation is serial for the
        graph clustering phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation phase, parallelizing
        the result, and also (somehow) parallelizing the graph clustering phase. This might increase the "reasonable"
        size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to O(1,000,000) and
        above. Instead we provide the divide-and-conquer method, which easily scales to O(1,000,000) on a single
        multi-core server, and to "unlimited" size if we further enhance the implementation to use a distributed
        compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a very small number
        of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    **Returns**

    The feature data computed in step 1 (an annotated data object), and sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a similar expression
            level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell as deviant). This
            will be zero for non-"feature" genes.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for non-"clean" cells. This is
            only written when the K-Nearest-Neighbors graph is actually computed (see step 6).

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the cell is not deviant).

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no particular order. Cells
            with no metacell assignment ("outliers") are given a metacell index of ``-1``.

        ``outlier``
            A boolean mask of the cells contained in no metacell.

    Observations-Pair (Cell-Cell) Annotations
        ``obs_similarity``, ``obs_outgoing_weights``
            Copied from the feature data. These are only written when the K-Nearest-Neighbors graph is actually
            computed (see step 6).

    Raises a ``ValueError`` if step 1 yields no feature data.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data from the clean
       data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}),
       ``feature_min_gene_top3`` (default: {feature_min_gene_top3}),
       ``feature_min_gene_relative_variance`` (default: {feature_min_gene_relative_variance}),
       ``feature_gene_names`` (default: {feature_gene_names}),
       ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}),
       ``forbidden_gene_patterns`` (default: {forbidden_gene_patterns})
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable.

    2. Take the downsampled feature data and, if it is positive, add the ``cells_similarity_value_normalization``
       (default: {cells_similarity_value_normalization}) to it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the similarity between each
       pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}).

    5. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}),
       ``max_cell_size_factor`` (default: {max_cell_size_factor})
       and
       ``cell_sizes`` (default: {cell_sizes})
       to get the effective cell sizes to use. If this yields a maximal cell size, the target metacell size is raised
       (if needed) to be at least this maximal cell size times ``candidates_min_metacell_cells`` and times
       ``dissolve_min_metacell_cells``.

    6. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a K-Nearest-Neighbors graph,
       using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).
       If ``knn_k`` (default: {knn_k}) is not specified, then it is chosen to be the median number of cells required to
       reach the target metacell size, but at least ``min_knn_k`` (default: {min_knn_k}). If the resulting ``knn_k``
       is zero, or is at least the number of cells, the similarity, graph and candidates computations (steps 4, 6 and
       7) are skipped and all the cells are placed in a single candidate metacell.

    7. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute the candidate metacells,
       using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       ``candidates_max_split_min_cut_strength`` (default: {candidates_max_split_min_cut_strength}),
       ``candidates_min_cut_seed_cells`` (default: {candidates_min_cut_seed_cells}),
       ``must_complete_cover`` (default: {must_complete_cover}),
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate metacells, using
       the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing metacells, using the same
       ``target_metacell_size`` (default: {target_metacell_size}),
       and the effective cell sizes and the
       ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
       ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
       ``dissolve_min_convincing_gene_fold_factor`` (default: {dissolve_min_convincing_gene_fold_factor})
       and
       ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).
       If ``must_complete_cover``, the candidate metacells are used as the final metacells, and the deviant votes and
       the dissolved mask are all zero.
    """
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    # Note this rebinds ``max_cell_size`` to the value reported by the helper.
    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    # Make sure the target size can hold the minimal number of cells even if all of them are of the maximal size.
    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    # Work on a private dense copy, since the normalization below modifies the data in-place.
    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    if knn_k is None:
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    if knn_k == 0:
        # No neighbors to look at, so skip the graph and place all the cells in a single candidate metacell.
        ut.log_calc("knn_k: 0 (too small, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    elif knn_k >= fdata.n_obs:
        # There are not enough cells for this many neighbors, so likewise use a single candidate metacell.
        ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    else:
        ut.log_calc("knn_k", knn_k)
        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        # These are read back from the feature data, and are only available there on this (graph) path.
        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    # All three paths above set the candidates in the feature data; copy them to the full data.
    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        # Every cell must be covered, so the candidates are final; write "nothing happened" values for the
        # annotations the deviants and dissolve steps would otherwise have written.
        assert np.min(candidate_of_cells) == 0

        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)

        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

    # Derive the outlier mask from the final assignment on both paths, so the documented ``outlier`` annotation
    # always exists. When a complete cover is required this is all-False, since every cell has a metacell.
    metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

    outlier_of_cells = metacell_of_cells < 0
    ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata