Пример #1
0
def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes whose ``what`` (default: {what}) data shows a high relative variance.

    The relative variance is the variance / mean of a gene, measured relative to the other genes
    with a similar level of expression. See
    :py:func:`metacells.utilities.computation.relative_variance_per` for details.

    Genes with a high relative variance are good candidates for being selected as "feature genes",
    that is, be used to compute the similarity between cells. Using the relative variance
    compensates for the bias for selecting higher-expression genes, whose normalized variance can
    be larger due to random noise alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per`, passing it the
       ``window_size``, to get the relative variance of each gene.

    2. Select the genes whose relative variance is at least
       ``min_gene_relative_variance`` (default: {min_gene_relative_variance}).
    """
    # Genes are the columns of the matrix, hence the column-major layout.
    gene_relative_variance = ut.relative_variance_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
        window_size=window_size,
    )
    is_high_gene = gene_relative_variance >= min_gene_relative_variance

    if not inplace:
        ut.log_return("high_relative_variance_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", is_high_gene)
    return None
Пример #2
0
def find_top_feature_genes(
    adata: AnnData,
    *,
    max_genes: int = pr.max_top_feature_genes,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes with the highest ``feature_gene`` values.

    This is applied after computing metacells to pick the "strongest" feature genes. If using the
    direct algorithm (:py:func:`metacells.pipeline.direct.compute_direct_metacells`) then all
    feature genes are equally "strong"; however, if using the divide-and-conquer algorithm
    (:py:func:`metacells.pipeline.divide_and_conquer.divide_and_conquer_pipeline`,
    :py:func:`metacells.pipeline.divide_and_conquer.compute_divide_and_conquer_metacells`) then this
    will pick the genes which were most commonly used as features across all the piles.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``feature_gene`` is a per-variable (gene) annotation counting how many times each gene was used
    as a feature.

    **Returns**

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask indicating whether each gene was found to be a top feature gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Raise an integer threshold one step at a time, starting at 1, until at most ``max_genes``
       genes have a ``feature_gene`` value of at least the threshold, or the threshold reaches the
       maximal ``feature_gene`` value. In the latter case more than ``max_genes`` may be picked,
       for example when using the direct algorithm, where all feature genes are returned as there's
       no way to distinguish between them using the ``feature_gene`` data.
    """
    usage_of_gene = ut.get_v_numpy(adata, "feature_gene", formatter=ut.mask_description)
    highest_usage = np.max(usage_of_gene)
    assert highest_usage > 0

    threshold = 0
    # Start above the limit so that the first iteration always runs.
    picked_count = max_genes + 1
    while threshold < highest_usage and picked_count > max_genes:
        threshold += 1
        genes_mask = usage_of_gene >= threshold
        picked_count = np.sum(genes_mask)
        ut.log_calc(f"threshold: {threshold} selected: {picked_count}")

    if not inplace:
        ut.log_return("top_feature_gene", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "top_feature_gene", genes_mask)
    return None
Пример #3
0
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes whose ``what`` (default: {what}) data shows a high normalized variance.

    The normalized variance is the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "noisy", that is, have significantly different
    expression level in different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the normalized
       variance of each gene.

    2. Select the genes whose normalized variance is at least
       ``min_gene_normalized_variance`` (default: {min_gene_normalized_variance}).
    """
    # Genes are the columns of the matrix, hence the column-major layout.
    gene_normalized_variance = ut.normalized_variance_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
    )
    is_high_gene = gene_normalized_variance >= min_gene_normalized_variance

    if not inplace:
        ut.log_return("high_normalized_variance_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", is_high_gene)
    return None
Пример #4
0
def find_biased_genes(
    adata: AnnData,
    *,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    min_metacells_fraction: float = pr.biased_min_metacells_fraction,
    abs_folds: bool = pr.project_abs_folds,
    to_property_name: str = "biased_gene",
) -> None:
    """
    Find genes that have a strong bias in the query compared to the atlas.

    **Input**

    Annotated query ``adata`` where the observations are (query) metacells and the variables are genes.

    This should contain a ``projected_fold`` per-variable-per-observation matrix with the fold factor between each query
    metacell and its projected image on the atlas.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``biased_gene`` (or ``to_property_name``):
            A boolean mask indicating whether the gene has a strong bias in the query compared to the atlas.

    **Computation Parameters**

    1. Count for each gene the number of query metacells for which the ``projected_fold`` is above
       ``max_projection_fold_factor``. If ``abs_folds`` (default: {abs_folds}), consider the absolute fold factor.

    2. Mark the gene as biased if this count is at least a ``min_metacells_fraction`` (default:
       {min_metacells_fraction}) of the metacells.
    """
    assert max_projection_fold_factor >= 0
    assert 0 <= min_metacells_fraction <= 1

    folds = ut.get_vo_proper(adata, "projected_fold", layout="column_major")
    folds = np.abs(folds) if abs_folds else folds  # type: ignore

    is_high_fold = ut.to_numpy_matrix(folds > max_projection_fold_factor)  # type: ignore
    ut.log_calc("high_projection_folds", is_high_fold)

    # Per gene (column): the number of metacells whose fold exceeds the limit.
    high_count_of_genes = ut.sum_per(is_high_fold, per="column")
    required_count = adata.n_obs * min_metacells_fraction

    ut.set_v_data(adata, to_property_name, high_count_of_genes >= required_count)
Пример #5
0
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,  # pylint: disable=invalid-name
    min_gene_topN: int,  # pylint: disable=invalid-name
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes whose top-Nth value of the ``what`` (default: {what}) data is high.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use ``rank_per`` with a rank of ``n_obs - topN - 1`` (but no less than 1) to get the top-Nth
       value of each gene.

    2. Select the genes whose top-Nth value is at least ``min_gene_topN``.
    """
    # NOTE(review): whether this rank is exactly the Nth-highest entry depends on the indexing
    # convention of ``ut.rank_per``, which is not visible here - confirm.
    rank = adata.n_obs - topN - 1
    if rank < 1:
        rank = 1

    # Genes are the columns of the matrix, hence the column-major layout.
    matrix = ut.get_vo_proper(adata, what, layout="column_major")
    nth_value_of_genes = ut.rank_per(matrix, per="column", rank=rank)
    is_high_gene = nth_value_of_genes >= min_gene_topN

    if not inplace:
        ut.log_return(f"high_top{topN}_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", is_high_gene)
    return None
Пример #6
0
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies
    greatly between cells. This is exactly the information we are trying to analyze. We often would
    like to work on genes that have a sufficient level of expression for meaningful analysis.
    Specifically, it doesn't make sense to analyze genes that have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default:
       {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)

    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask has one entry per gene, so it must be indexed by the variable (gene) names; indexing
    # it by the observation (cell) names mislabels the entries (or fails on a length mismatch).
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
Пример #7
0
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which have a high total of the ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high total.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Sum the data of each gene to get its total.

    2. Select the genes whose total is at least ``min_gene_total``.
    """
    is_high_gene = ut.get_v_numpy(adata, what, sum=True) >= min_gene_total

    if not inplace:
        ut.log_return("high_total_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_total_gene", is_high_gene)
    return None
Пример #8
0
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which hold a high fraction of the total ``what`` (default: {what}) data of the
    cells.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high fraction.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    # Genes are the columns of the matrix, hence the column-major layout.
    gene_fraction = ut.fraction_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
    )
    is_high_gene = gene_fraction >= min_gene_fraction

    if not inplace:
        ut.log_return("high_fraction_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", is_high_gene)
    return None
Пример #9
0
def _results(
    *,
    adata: AnnData,
    rare_module_of_cells: ut.NumpyVector,
    list_of_rare_gene_indices_of_modules: List[List[int]],
    inplace: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Store (or return) the rare gene modules results.

    ``rare_module_of_cells`` holds, per cell, the index of the rare gene module the cell belongs
    to; negative values are treated as "not a rare cell". ``list_of_rare_gene_indices_of_modules``
    holds, per module, the indices of the genes of the module.

    The results are:

    Variable (Gene) Annotations
        ``rare_gene_module_<N>``
            A boolean mask of the genes of the module with index ``N``.

        ``rare_gene``
            A boolean mask of the genes that belong to any module.

    Observation (Cell) Annotations
        ``cells_rare_gene_module``
            The ``rare_module_of_cells`` as given.

        ``rare_cell``
            A boolean mask of the cells whose module index is non-negative.

    If ``inplace``, these are written to ``adata`` and the function returns ``None``. Otherwise
    ``adata`` is left untouched and they are returned as a tuple of two pandas frames, the first
    indexed by the observation names and the second by the variable names.
    """
    # The highest module index assigned to a cell must be the last module in the list.
    assert np.max(rare_module_of_cells) == len(list_of_rare_gene_indices_of_modules) - 1

    if not inplace:
        var_metrics = ut.to_pandas_frame(index=adata.var_names)

    rare_gene_mask = np.zeros(adata.n_vars, dtype="bool")
    for module_index, rare_gene_indices_of_module in enumerate(list_of_rare_gene_indices_of_modules):
        rare_module_gene_mask = np.zeros(adata.n_vars, dtype="bool")
        rare_module_gene_mask[rare_gene_indices_of_module] = True
        property_name = f"rare_gene_module_{module_index}"
        if inplace:
            ut.set_v_data(adata, property_name, rare_module_gene_mask)
        else:
            var_metrics[property_name] = rare_module_gene_mask
            ut.log_return(property_name, rare_module_gene_mask)
        # A gene is rare if it belongs to any of the modules.
        rare_gene_mask |= rare_module_gene_mask

    rare_cell_mask = rare_module_of_cells >= 0

    if inplace:
        ut.set_v_data(adata, "rare_gene", rare_gene_mask)
        ut.set_o_data(adata, "cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
        ut.set_o_data(adata, "rare_cell", rare_cell_mask)
        return None

    var_metrics["rare_gene"] = rare_gene_mask
    ut.log_return("rare_gene", rare_gene_mask)

    # Store the per-cell results in the returned frame (they used to be only logged, leaving the
    # returned frame without any columns).
    obs_metrics = ut.to_pandas_frame(index=adata.obs_names)
    obs_metrics["cells_rare_gene_module"] = rare_module_of_cells
    ut.log_return("cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
    obs_metrics["rare_cell"] = rare_cell_mask
    ut.log_return("rare_cell", rare_cell_mask)

    return obs_metrics, var_metrics
Пример #10
0
def find_named_genes(
    adata: AnnData,
    *,
    names: Optional[Collection[str]] = None,
    patterns: Optional[Collection[Union[str, Pattern]]] = None,
    to: Optional[str] = None,
    invert: bool = False,
) -> Optional[ut.PandasSeries]:
    """
    Find genes by their name.

    This creates a mask of all the genes whose name appears in ``names`` (ignoring case) or matches
    any of the ``patterns``. If ``invert`` (default: {invert}), invert the resulting mask.

    If ``to`` (default: {to}) is specified, this is stored as a per-variable (gene) annotation with
    that name, and returns ``None``. This is useful to fill gene masks such as ``excluded_genes``
    (genes which should be excluded from the rest of the processing) and ``forbidden_genes`` (genes
    which must not be chosen as feature genes).

    Otherwise, it returns it as a pandas series (indexed by the variable, that is gene, names).
    """
    if names is not None:
        # Compare lower-cased names so the lookup ignores case.
        wanted_lower_names = {name.lower() for name in names}
        names_mask = np.array([gene_name.lower() in wanted_lower_names for gene_name in adata.var_names])
    else:
        names_mask = np.zeros(adata.n_vars, dtype="bool")

    if patterns is not None:
        patterns_mask = ut.patterns_matches(patterns, adata.var_names)
    else:
        patterns_mask = np.zeros(adata.n_vars, dtype="bool")

    matched_mask = names_mask | patterns_mask
    genes_mask = ~matched_mask if invert else matched_mask

    if to is None:
        ut.log_return("named_genes", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, to, genes_mask)
    return None
Пример #11
0
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    # NOTE(review): the two quantile defaults below look swapped - the "min" parameter defaults to
    # the ``..._max_cell_quantile`` setting and the "max" parameter to the ``..._min_cell_quantile``
    # one. Left untouched since changing defaults alters the results of existing callers; confirm.
    downsample_min_cell_quantile: float = pr.
    noisy_lonely_downsample_max_cell_quantile,
    downsample_max_cell_quantile: float = pr.
    noisy_lonely_downsample_min_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.
    noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Compute a mask of the genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_gene``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: {min_gene_normalized_variance}).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    # Step 1: work on a random subset of the cells if there are too many of them. Note this
    # re-seeds NumPy's global random state.
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    # Name of the per-gene annotation that is read back below to map the surviving genes to their
    # positions in ``adata``. It is passed only to the first ``filter_data`` call that runs
    # (presumably that call records each gene's index under this name - verify), and is reset to
    # ``None`` afterwards.
    track_var: Optional[str] = "sampled_gene_index"

    # Step 2: drop the explicitly excluded genes, if any.
    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    # Step 3: downsample the cells; the result is read below under the name "downsampled".
    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    # Step 4 (first half): keep only the genes with a high total in the downsampled data. This
    # relies on the in-place mode writing the "high_total_gene" mask used by the filter below.
    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    # The result mask covers all the genes of the original data; it stays all-False unless some
    # genes survive all the filters below.
    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    # Naming convention below: "ht" = high total genes, "htv" = high total and high normalized
    # variance ("noisy") genes, "htvl" = "htv" genes which are also "lonely".
    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        # Step 5: cross-correlate the high total genes.
        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        # Overwrite the diagonal so the similarity of a gene to itself is never the maximal one.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        # Step 4 (second half): of the high total genes, find the "noisy" ones.
        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            # Rows are the noisy genes, columns are all the high total genes.
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            # Step 6: a noisy gene is "lonely" if even its most similar gene is not similar enough.
            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                # Map the surviving genes back to their positions in the original data.
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                # The rest of this block only serves the detailed logging below.
                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    # Log, for each noisy lonely gene, its share of the downsampled UMIs and the
                    # genes it is most correlated with.
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    # Apply the same chain of masks that led from the included genes to the noisy
                    # lonely genes.
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])  #
                        assert len(similar_ht_values) == ht_genes_count
                        # Only positive entries are reported.
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        # The similar genes are listed in decreasing order of similarity.
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
Пример #12
0
def find_systematic_genes(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    low_gene_quantile: float = pr.systematic_low_gene_quantile,
    high_gene_quantile: float = pr.systematic_high_gene_quantile,
    to_property_name: str = "systematic_gene",
) -> None:
    """
    Find genes that are systematically higher or lower in the query compared to the atlas, based on ``what``
    (default: {what}) data.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must contain exactly the same genes, in the same order.

    **Returns**

    Nothing. Sets the following annotations in ``qdata``:

    Variable (Gene) Annotations
        ``systematic_gene`` (or ``to_property_name``)
            A boolean mask indicating whether the gene is systematically higher or lower in the query compared to the
            atlas.

    **Computation Parameters**

    1. Compute the fraction of each gene out of the total UMIs in both the atlas and the query. If ``atlas_total_umis``
       and/or ``query_total_umis`` are given, use them as the basis instead of the sum of the UMIs.

    2. Compute for each gene its ``low_gene_quantile`` (default: {low_gene_quantile}) fraction and its
       ``high_gene_quantile`` (default: {high_gene_quantile}) fraction, in both the query and the atlas.

    3. Mark as systematic the genes for which the low quantile value in the query is (strictly) above the atlas high
       quantile value.

    4. Mark as systematic the genes for which the low quantile value in the atlas is at least the query high quantile
       value.
    """
    assert 0 <= low_gene_quantile <= 1
    assert 0 <= high_gene_quantile <= 1
    assert np.all(adata.var_names == qdata.var_names)

    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")
    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")

    # Per-observation fractions, optionally relative to externally supplied totals.
    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # The quantiles are computed per gene (column), so switch to a column-major layout first.
    query_fractions = ut.to_layout(query_fractions, layout="column_major")
    atlas_fractions = ut.to_layout(atlas_fractions, layout="column_major")

    query_low_gene_values = ut.quantile_per(query_fractions, low_gene_quantile, per="column")
    atlas_low_gene_values = ut.quantile_per(atlas_fractions, low_gene_quantile, per="column")

    query_high_gene_values = ut.quantile_per(query_fractions, high_gene_quantile, per="column")
    atlas_high_gene_values = ut.quantile_per(atlas_fractions, high_gene_quantile, per="column")

    # NOTE(review): the two directions use different comparisons (strict ``>`` versus ``>=``); this is kept as-is
    # since it is not clear from here which one is intended - confirm and unify if appropriate.
    query_above_atlas = query_low_gene_values > atlas_high_gene_values
    atlas_above_query = atlas_low_gene_values >= query_high_gene_values

    systematic = query_above_atlas | atlas_above_query

    ut.set_v_data(qdata, to_property_name, systematic)
Пример #13
0
def relate_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_sampled_cells: int = pr.related_max_sampled_cells,
    downsample_min_samples: float = pr.related_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.related_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.related_downsample_max_cell_quantile,
    min_gene_relative_variance: float = pr.related_min_gene_relative_variance,
    min_gene_total: int = pr.related_min_gene_total,
    min_gene_top3: int = pr.related_min_gene_top3,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    genes_similarity_method: str = pr.related_genes_similarity_method,
    genes_cluster_method: str = pr.related_genes_cluster_method,
    min_genes_of_modules: int = pr.related_min_genes_of_modules,
    random_seed: int = 0,
) -> None:
    """
    Detect coarse relations between genes based on ``what`` (default: {what}) data.

    This is a quick-and-dirty way to group genes together and should only be used as a starting
    point for more precise forms of gene relationship analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Nothing. Sets the following annotations in ``adata``:

    Variable-pair (Gene) Annotations
        ``related_genes_similarity``
            The similarity between each two related genes.

    Variable (Gene) Annotations
        ``related_genes_module``
            The index of the gene module for each gene (``-1`` for genes which do not belong to any
            module).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. Pick candidate genes using :py:func:`metacells.pipeline.feature.extract_feature_data`.

    3. Compute the similarity between the feature genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    4. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    5. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` (default: {min_genes_of_modules}) genes.
    """
    # Work on a (possibly sub-sampled) private copy of the data.
    if adata.n_obs > max_sampled_cells:
        np.random.seed(random_seed)
        sampled_cell_indices = np.random.choice(
            np.arange(adata.n_obs), size=max_sampled_cells, replace=False
        )
        sdata = ut.slice(adata, obs=sampled_cell_indices, name=".sampled", top_level=False)
    else:
        sdata = ut.copy_adata(adata, top_level=False)

    fdata = extract_feature_data(
        sdata,
        what,
        top_level=False,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        min_gene_relative_variance=min_gene_relative_variance,
        min_gene_total=min_gene_total,
        min_gene_top3=min_gene_top3,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )
    assert fdata is not None

    similarity_frame = tl.compute_var_var_similarity(
        fdata,
        what,
        method=genes_similarity_method,
        reproducible=(random_seed != 0),
        inplace=False,
    )
    assert similarity_frame is not None
    similarity = ut.to_layout(ut.to_numpy_matrix(similarity_frame), layout="row_major")

    linkage = _cluster_genes(similarity, genes_cluster_method)
    modules = _linkage_to_clusters(linkage, min_genes_of_modules, fdata.n_vars)

    # Genes which are not in any module keep the -1 marker.
    module_of_genes = pd.Series(np.full(adata.n_vars, -1, dtype="int32"), index=adata.var_names)
    for module_index, module_gene_indices in enumerate(modules):
        module_of_genes[fdata.var_names[module_gene_indices]] = module_index

    ut.set_v_data(adata, "related_genes_module", module_of_genes, formatter=ut.groups_description)

    # Scatter the dense feature-by-feature similarities into a sparse all-genes-by-all-genes matrix. The flattened
    # data is in row-major order, so the row index is repeated and the column index is tiled.
    full_gene_indices = ut.get_v_numpy(fdata, "full_gene_index")
    features_count = len(full_gene_indices)
    flat_similarity = similarity.flatten(order="C")
    row_indices = np.repeat(full_gene_indices, features_count)
    column_indices = np.tile(full_gene_indices, features_count)
    full_similarity = sp.csr_matrix(
        (flat_similarity, (row_indices, column_indices)),
        shape=(adata.n_vars, adata.n_vars),
    )

    ut.set_vv_data(adata, "related_genes_similarity", full_similarity)
Пример #14
0
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too unreliable to be used on
    cells.

    Find genes which have a high maximal expression in at least one metacell, and a wide range of expression across the
    metacells. Such genes are good candidates for being used as marker genes and/or to compute distances between
    metacells.

    **Input**

    Annotated ``adata``, where the observations are metacells and the variables are genes, where
    ``what`` (default: {what}) is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level (fraction) of each gene.

    2. Select the genes whose fold factor (log2 of maximal over minimal value, using the ``normalization``
       (default: {normalization})) is at least ``min_gene_range_fold`` (default: {min_gene_range_fold}).

    3. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default: {min_gene_fraction}).
    """
    assert normalization >= 0

    umis = ut.get_vo_proper(adata, what, layout="row_major")
    fractions = ut.to_layout(ut.fraction_by(umis, by="row"), layout="column_major")

    lowest_fraction_of_genes = ut.min_per(fractions, per="column")
    highest_fraction_of_genes = ut.max_per(fractions, per="column")

    # This test must use the raw maximal fractions, so it is done before the normalization is added below.
    high_max_fraction_genes_mask = highest_fraction_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", high_max_fraction_genes_mask)

    # Compute log2((max + normalization) / (min + normalization)), re-using the per-gene vectors in-place.
    lowest_fraction_of_genes += normalization
    highest_fraction_of_genes += normalization
    highest_fraction_of_genes /= lowest_fraction_of_genes
    range_fold_of_genes = np.log2(highest_fraction_of_genes, out=highest_fraction_of_genes)

    high_range_genes_mask = range_fold_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", high_range_genes_mask)

    significant_genes_mask = high_max_fraction_genes_mask & high_range_genes_mask

    if not inplace:
        ut.log_return("significant_genes", significant_genes_mask)
        return ut.to_pandas_series(significant_genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", significant_genes_mask)
    return None
Пример #15
0
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they
    belong to based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as two pandas series (the first indexed by the observation
    names, the second indexed by the variable names).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared to
    the mean expression in the candidate metacells. We then mark as deviants some fraction of the
    cells whose expression of these genes was least predictable compared to the mean expression in
    the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene.
       Scale this by each cell's total UMIs to compute the expected number of UMIs for each cell.
       Compute the fold factor log2((actual UMIs + 1) / (expected UMIs + 1)) for each gene for each
       cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default: {min_gene_fold_factor}). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors. Count the number of genes which have a
       fold factor above this minimum in at least one cell. If the fraction of such genes is above ``max_gene_fraction``
       (default: {max_gene_fraction}), then raise the minimal gene fold factor such that at most this fraction of genes
       remain. If ``max_gene_fraction`` is ``None``, it is taken to be 1 (that is, no limit).

    3. For each remaining gene, rank all the cells where it is expressed above the min fold
       factor. Give an artificial maximum rank to all cells with fold factor 0, that is, below the
       minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell has
       a rank of 1, it means that it has at least one gene whose expression fold factor is the worst
       (highest) across all cells (and is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that
       is, which contain at least one gene whose expression fold factor is high relative to the rest
       of the cells. If the fraction of such cells is higher than ``max_cell_fraction`` (default:
       {max_cell_fraction}), reduce the maximal rank such that at most this fraction of cells are
       selected as deviants. If ``max_cell_fraction`` is ``None``, it is taken to be 1 (that is, no limit).
    """
    # ``None`` means "no limit", which is expressed as allowing the whole (100%) of the genes/cells.
    if max_gene_fraction is None:
        max_gene_fraction = 1

    if max_cell_fraction is None:
        max_cell_fraction = 1

    assert min_gene_fold_factor > 0
    # The upper bound is inclusive, otherwise the ``None`` (no limit) values above could never be used.
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        # There are no fold factors to consider, so nothing votes for anything.
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
Пример #16
0
def compute_subset_distinct_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    prefix: Optional[str] = None,
    scale: Optional[Union[bool, str, ut.NumpyVector]],
    subset: Union[str, ut.NumpyVector],
    normalization: float,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Given a subset of the observations (cells), compute for each gene how distinct its ``what``
    (default: {what}) value is in the subset compared to the overall population.

    This is the area-under-curve of the receiver operating characteristic (AUROC) for the gene, that
    is, the probability that a randomly selected observation (cell) in the subset will have a higher
    value than a randomly selected observation (cell) outside the subset.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``<prefix>_fold``
            Store the ratio of the expression of the gene in the subset as opposed to the rest of
            the population.
        ``<prefix>_auroc``
            Store the distinctiveness of the gene in the subset as opposed to the rest of the
            population.

    If ``prefix`` (default: {prefix}) is specified, this is written to the data and the function
    returns ``None``. Otherwise this is returned as two pandas series (indexed by the gene names),
    the folds first and the AUROC second.

    **Computation Parameters**

    1. Use the ``subset`` to assign a boolean label to each observation (cell). The ``subset`` can
       be a vector of integer observation indices, or a boolean mask, or the string name of a
       per-observation annotation containing either of these.

    2. If ``scale`` is ``False``, use the data as-is. If it is ``True``, divide the data by the
       sum of each observation (cell). If it is a string, it should be the name of a per-observation
       annotation to use. Otherwise, it should be a vector of the scale factor for each observation
       (cell).

    3. Compute the fold ratios using the ``normalization`` (no default!) and the AUROC for each gene,
       for the scaled data based on this mask.
    """
    if isinstance(subset, str):
        subset = ut.get_o_numpy(adata, subset)

    # Anything which isn't a boolean mask is used as indices of the selected observations.
    if subset.dtype != "bool":
        subset_mask: ut.NumpyVector = np.full(adata.n_obs, False)
        subset_mask[subset] = True
        subset = subset_mask

    scale_of_cells: Optional[ut.NumpyVector]
    if isinstance(scale, bool):
        scale_of_cells = ut.get_o_numpy(adata, what, sum=True) if scale else None
    else:
        scale_of_cells = ut.maybe_o_numpy(adata, scale, formatter=ut.sizes_description)

    # The computation works on the rows of the matrix, so present the genes as rows.
    genes_by_cells = ut.get_vo_proper(adata, what, layout="column_major").transpose()
    fold_of_genes, auroc_of_genes = ut.matrix_rows_folds_and_aurocs(
        genes_by_cells,
        columns_subset=subset,
        columns_scale=scale_of_cells,
        normalization=normalization,
    )

    if prefix is None:
        return (
            ut.to_pandas_series(fold_of_genes, index=adata.var_names),
            ut.to_pandas_series(auroc_of_genes, index=adata.var_names),
        )

    ut.set_v_data(adata, f"{prefix}_auroc", auroc_of_genes)
    ut.set_v_data(adata, f"{prefix}_fold", fold_of_genes)
    return None
Пример #17
0
def collect_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    name: str = "metacells",
    top_level: bool = True,
) -> AnnData:
    """
    Collect computed metacells ``what`` (default: {what}) data.

    **Input**

    Annotated (presumably "clean") ``adata``, where the observations are cells and the variables are
    genes, and where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix. The cells are grouped
    according to their ``metacell`` annotation.

    **Returns**

    Annotated metacell data (named ``name``) containing for each observation the sum of the data of
    the cells of each metacell, which contains the following annotations (each is only passed if it
    exists in ``adata``):

    Variable (Gene) Annotations
        ``excluded_gene``
            A mask of the genes which were excluded by name.

        ``clean_gene``
            A boolean mask of the clean genes.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        If directly computing metacells:

        ``feature_gene``
            A boolean mask of the "feature" genes.

        If using divide-and-conquer:

        ``pre_feature_gene``, ``feature_gene``
            The number of times the gene was used as a feature when computing the preliminary and
            final metacells.

    Observations (Metacell) Annotations
        ``grouped``
            The number of ("clean") cells grouped into each metacell.

        ``pile``
            The index of the pile used to compute the metacell.

        ``candidate``
            The index of the candidate metacell the cells of the metacell were assigned to.

    **Computation Parameters**

    1. Compute the cell's scale factors by invoking :py:func:`compute_effective_cell_sizes` using the
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}).

    2. Scale the cell's data using these factors, if needed.

    3. Invoke :py:func:`metacells.tools.group.group_obs_data` to sum the cells into
       metacells.

    4. Pass all relevant per-gene and per-cell annotations to the result.
    """
    _cell_sizes, _max_cell_size, cell_scale_factors = compute_effective_cell_sizes(
        adata,
        max_cell_size=max_cell_size,
        max_cell_size_factor=max_cell_size_factor,
        cell_sizes=cell_sizes,
    )

    # When scaling is needed, group the scaled matrix instead of the named (or given) data.
    if cell_scale_factors is not None:
        unscaled = ut.get_vo_proper(adata, what, layout="row_major")
        what = ut.scale_by(unscaled, cell_scale_factors, by="row")

    mdata = tl.group_obs_data(adata, what, groups="metacell", name=name)
    assert mdata is not None
    if top_level:
        ut.top_level(mdata)

    # Per-gene annotations are identical for the cells and the metacells, so they are copied as-is.
    gene_annotation_names = ("excluded_gene", "clean_gene", "forbidden_gene", "pre_feature_gene", "feature_gene")
    for gene_annotation_name in gene_annotation_names:
        if ut.has_data(adata, gene_annotation_name):
            gene_values = ut.get_v_numpy(adata, gene_annotation_name, formatter=ut.mask_description)
            ut.set_v_data(mdata, gene_annotation_name, gene_values, formatter=ut.mask_description)

    # Per-cell annotations are grouped per metacell using the "unique" method.
    for cell_annotation_name in ("pile", "candidate"):
        if not ut.has_data(adata, cell_annotation_name):
            continue
        tl.group_obs_annotation(
            adata,
            mdata,
            groups="metacell",
            formatter=ut.groups_description,
            name=cell_annotation_name,
            method="unique",
        )

    return mdata
Пример #18
0
def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes.

    This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressively filtered a
    significant amount of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are metacells and the variables are genes,
    where ``what`` is the name of a per-variable-per-observation annotation containing the UMIs matrix. Note that the
    pseudo-gene column is appended to the ``X`` matrix of the query.

    **Returns**

    None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional
    variable (gene) called ``ATLASNORM``, such that the total number of UMIs for each query metacell is as expected
    given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the query and the
    atlas have exactly the same list of genes, or if the query already contains a high number of genes missing from the
    atlas so that the total number of UMIs for the query metacells is already at least the expected based on the common
    genes.

    Raises a ``RuntimeError`` if ``var_annotations``, ``layers`` or ``varp_annotations`` do not provide a default value
    for some existing query annotation.

    **Computation Parameters**

    1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs)
       would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene)
       annotation, add the value specified in ``var_annotations``, whose list of keys must cover the set of
       per-variable annotations in the query data (except for these containing ``|`` in their name, which are not
       copied). For each per-observation-per-variable layer, add the value specified in ``layers``, whose list of keys
       must cover the existing layers. For each per-variable-per-variable annotation, add the value specified in
       ``varp_annotations``.
    """
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations.keys():
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers.keys():
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations.keys():
            raise RuntimeError(f"missing default value for variable-variable {name}")

    if list(qdata.var_names) == list(adata.var_names):
        return None

    # Map each gene name to its (first) index, so locating the common genes is linear instead of quadratic.
    query_index_of_gene: Dict[str, int] = {}
    for gene_index, gene_name in enumerate(qdata.var_names):
        query_index_of_gene.setdefault(gene_name, gene_index)
    atlas_index_of_gene: Dict[str, int] = {}
    for gene_index, gene_name in enumerate(adata.var_names):
        atlas_index_of_gene.setdefault(gene_name, gene_index)

    common_genes_list = sorted(set(query_index_of_gene) & set(atlas_index_of_gene))
    query_gene_indices = np.array([query_index_of_gene[gene] for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_index_of_gene[gene] for gene in common_genes_list])
    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")

    assert list(common_qdata.var_names) == list(common_adata.var_names)

    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    # The query already has (relatively) at least as many non-common UMIs as the atlas, nothing to add.
    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)

    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
    # Append the pseudo-gene as the last column.
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)

    # Keep the result sparse if the original data was sparse.
    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vo_data(ndata, name, added)

    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        # Grow the square genes-by-genes matrix by one column and then by one row for the pseudo-gene.
        extra_column = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, extra_column[:, np.newaxis]], axis=1)
        # The extra row must have the shape (1, n_vars + 1) to be appended along the rows axis.
        extra_row = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, extra_row[np.newaxis, :]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vv_data(ndata, name, added)

    return ndata
Пример #19
0
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at
    the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
    size grows. Above O(10,000) you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation
        matrix, and only then extracts the sparse graph out of it. We actually need two copies where
        each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
        O(100,000,000,000). In addition, the implementation is serial for the graph clustering
        phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation
        phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
        phase. This might increase the "reasonable" size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to
        O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
        scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
        very small number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables
    are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Returns the "feature" data extracted in the first step below (which also holds the per-cell
    ``candidate`` annotation). Raises a ``ValueError`` if the feature data extraction returned
    ``None`` (empty feature data).

    Also sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a
            similar expression level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant). This will be zero for non-"feature" genes. If ``must_complete_cover``, this
            is all zeros.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells. This is only set if the K-Nearest-Neighbors graph was actually
            computed (see the degenerate ``knn_k`` cases below).

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant). If ``must_complete_cover``, this is all zeros.

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell. If
            ``must_complete_cover``, this is all ``False``.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment ("outliers") are given a metacell
            index of ``-1``. If ``must_complete_cover``, this is identical to ``candidate``.

        ``outlier``
            A boolean mask of the cells contained in no metacell. This is only set by this function
            if ``must_complete_cover`` is ``False``.

    Observation-Observation (Cell-Cell) Annotations
        ``obs_similarity``, ``obs_outgoing_weights``
            Copied from the feature data, only if the K-Nearest-Neighbors graph was actually
            computed (see the degenerate ``knn_k`` cases below).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data
       from the clean data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}), ``feature_min_gene_top3``
       (default: {feature_min_gene_top3}), ``feature_min_gene_relative_variance`` (default:
       {feature_min_gene_relative_variance}), ``feature_gene_names`` (default:
       {feature_gene_names}), ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}), ``forbidden_gene_patterns``
       (default: {forbidden_gene_patterns}) and ``random_seed`` (default: {random_seed}) to make
       this replicable.

    2. Take the ``downsampled`` data of the feature data, and (if it is positive) add the
       ``cells_similarity_value_normalization`` (default: {cells_similarity_value_normalization}) to
       it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
       similarity between each pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}).

    5. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}) to get the effective cell
       sizes to use. If this yields a maximal cell size, the ``target_metacell_size`` is increased
       (if needed) to be at least this maximal cell size times ``candidates_min_metacell_cells`` and
       times ``dissolve_min_metacell_cells``.

    6. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a
       K-Nearest-Neighbors graph, using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).
       If ``knn_k`` (default: {knn_k}) is not specified, then it is
       chosen to be the median number of cells required to reach the target metacell size,
       but at least ``min_knn_k`` (default: {min_knn_k}).
       If the resulting ``knn_k`` is zero, or is at least the number of cells, then the similarity,
       graph and candidates computations are skipped, and all the cells are placed in a single
       candidate metacell.

    7. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
       the candidate metacells, using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       ``candidates_max_split_min_cut_strength`` (default: {candidates_max_split_min_cut_strength}),
       ``candidates_min_cut_seed_cells`` (default: {candidates_min_cut_seed_cells}),
       ``must_complete_cover`` (default: {must_complete_cover})
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
       metacells, using the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
       metacells, using the same
       ``target_metacell_size`` (default: {target_metacell_size}),
       and the effective cell sizes
       and the
       ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
       ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
       ``dissolve_min_convincing_gene_fold_factor`` (default:
       {dissolve_min_convincing_gene_fold_factor})
       and
       ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).
    """
    # Step 1: extract the "feature" data; everything up to the candidates is computed on it.
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    # Note this rebinds ``max_cell_size`` to the value returned by the helper.
    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    # Make sure the target size is large enough to hold the minimal number of maximal-size cells.
    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    # Work on a private dense copy, since the data is modified in-place below.
    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    # Default ``knn_k``: the number of median-sized cells needed to reach the target size.
    if knn_k is None:
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    # Degenerate ``knn_k``: skip the graph and place all the cells in a single candidate (0).
    # In these cases no "seed", "obs_similarity" or "obs_outgoing_weights" is set in ``adata``.
    if knn_k == 0:
        ut.log_calc("knn_k: 0 (too small, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
    elif knn_k >= fdata.n_obs:
        ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    else:
        ut.log_calc("knn_k", knn_k)

        # A zero ``random_seed`` requests a non-reproducible similarity computation.
        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        # Convey the intermediate results from the feature data to the full data.
        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        # Every cell must belong to some candidate, so the candidates are the final metacells:
        # there are no deviants and nothing is dissolved. Note "outlier" is not set in this branch.
        assert np.min(candidate_of_cells) == 0

        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)

        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

        # The "metacell" annotation is read back here, so it is expected to have been set on
        # ``adata`` by the calls above; cells with a negative metacell index are the outliers.
        metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

        outlier_of_cells = metacell_of_cells < 0
        ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata
Пример #20
0
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes. The
    ``masks`` must be a non-empty list of names of per-observation or per-variable annotations
    (all of the same kind), each optionally decorated with the prefixes and suffix described below.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask as a pandas series (indexed by
    the observation or variable names). Otherwise, sets the mask as an annotation (per-variable or
    per-observation depending on the type of the combined masks) and returns ``None``.

    Raises a ``KeyError`` if a mask which is not marked as optional does not exist (this includes a
    name which is empty once its prefixes and suffix are removed), and a ``ValueError`` if
    per-observation and per-variable masks are mixed, or if no mask at all was found to combine.

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or if
       it has a ``&`` prefix), bitwise-AND the mask into the AND mask. Any positive value in the
       fetched data is taken to be ``True``.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None

    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for log_mask_name in masks:
        mask_name = log_mask_name

        # Parse the decorations using ``startswith`` / ``endswith`` rather than by indexing, so a
        # name that is (or becomes) empty, such as ``"|"`` or ``"~"``, is reported below as an
        # unknown mask instead of failing here with an unrelated ``IndexError``.
        is_or = mask_name.startswith("|")
        if is_or or mask_name.startswith("&"):
            mask_name = mask_name[1:]

        invert_mask = mask_name.startswith("~")
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = not mask_name.endswith("?")
        if not must_exist:
            mask_name = mask_name[:-1]

        # Per-observation data takes precedence if the same name exists in both.
        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # NOTE(review): the ``> 0`` comparison above normally already yields a boolean array, so
        # this is a purely defensive check.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    # A missing AND (or OR) mask places no restriction on the result.
    if and_mask is None and or_mask is None:
        raise ValueError("no masks to combine")
    if and_mask is None:
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None
Пример #21
0
def _apply_annotations(  # pylint: disable=too-many-branches
    adata: AnnData,
    sdata: AnnData,
    per: str,
    annotations: Dict[str, DefaultValues],
    indices: Union[str, ut.Vector],
) -> None:
    """
    Write per-observation (``per`` is ``"o"``) or per-variable (``per`` is ``"v"``) annotations of
    the slice data ``sdata`` into the full data ``adata``.

    The ``indices`` is (the name of) the per-observation or per-variable data of ``sdata`` which
    holds, for each entry of the slice, its position in the full data.

    For each name in ``annotations``, the values to write are the slice annotation if it exists,
    otherwise the ``slice`` default value. The values are written into the existing full
    annotation if it exists, otherwise into a new array filled with the ``full`` default value (a
    ``None`` default creates a ``float32`` array filled with ``None``, which NumPy stores as NaN).
    In both cases, a ``Skip`` default silently skips the annotation and a ``Raise`` default raises
    a ``KeyError``.
    """
    full_name = ut.get_name(adata)
    slice_name = ut.get_name(sdata)

    assert per in ("o", "v")

    per_observation = per == "o"
    if per_observation:
        full_data, full_size = adata.obs, adata.n_obs
        slice_data, slice_size = sdata.obs, sdata.n_obs
        full_indices = ut.get_o_numpy(sdata, indices)
    else:
        full_data, full_size = adata.var, adata.n_vars
        slice_data, slice_size = sdata.var, sdata.n_vars
        full_indices = ut.get_v_numpy(sdata, indices)

    for name, default_values in annotations.items():
        # Only a default slice value gets a dedicated log formatter.
        formatter: Optional[Callable[[Any], str]] = None

        slice_value = slice_data.get(name)
        if slice_value is None:
            slice_default = default_values.slice

            if slice_default == Skip or isinstance(slice_default, Skip):
                continue

            if slice_default == Raise or isinstance(slice_default, Raise):
                if slice_name is None:
                    raise KeyError(f"unknown slice data name: {name}")
                raise KeyError(
                    f"unknown slice data: {slice_name} name: {name}")

            slice_value = slice_default

            def describe_default(_: Any) -> str:
                # pylint: disable=cell-var-from-loop
                return f"{slice_size} <- {slice_value}"

            # pylint: enable=cell-var-from-loop

            formatter = describe_default

        full_value = full_data.get(name)
        if full_value is None:
            full_default = default_values.full

            if full_default == Skip or isinstance(full_default, Skip):
                continue

            if full_default == Raise or isinstance(full_default, Raise):
                if full_name is None:
                    raise KeyError(f"unknown full data name: {name}")
                raise KeyError(f"unknown full data: {full_name} name: {name}")

            if full_default is None:
                full_value = np.full(full_size, None, dtype="float32")
            else:
                full_value = np.full(full_size, full_default)
        else:
            # The existing full annotation is modified in-place below.
            ut.unfreeze(full_value)

        full_value[full_indices] = slice_value

        set_data = ut.set_o_data if per_observation else ut.set_v_data
        set_data(adata, name, full_value, formatter=formatter)