Пример #1
0
def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are (meta)cells and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names.

    The ``weights`` of the projection, where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas metacell for projecting the query metacell, such that the sum of weights in each
    row is one. Its shape must therefore be the number of query observations by the number of atlas observations.

    **Returns**

    Nothing is returned. Sets the following annotations in ``qdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected``
            The number of UMIs of each gene in the projected image of each query metacell, scaled so that (by default)
            the total number of UMIs in the projection is equal to the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas based on the total UMIs of each atlas metacell, unless
       ``atlas_total_umis`` is specified, in which case it is used as the basis instead.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMIs count based on the total UMIs of each query metacell, unless ``query_total_umis`` is
       specified, in which case it is used instead. Note that if overriding the total atlas or query UMIs, the result
       need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)
    # Fail early (and clearly) if the weights do not map query observations to atlas observations.
    assert weights.shape == (qdata.n_obs, adata.n_obs)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    # Default the totals to the per-observation sum of the data itself.
    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    # Project in fractions space, then scale each row back to the UMIs of the matching query observation.
    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
Пример #2
0
def _test_per(rows_matrix: ut.Matrix) -> None:
    """
    Verify the per-row and per-column reductions of ``ut`` against hand-computed values.

    The expected values are consistent with ``rows_matrix`` holding ``[[0, 1, 2], [3, 4, 5]]`` in row-major layout;
    a column-major copy is derived from it for the per-column checks.
    """
    columns_matrix = ut.to_layout(rows_matrix, layout="column_major")

    def check(reduce, per, expected) -> None:
        # Each reduction is applied to the matrix whose layout matches the requested direction.
        matrix = rows_matrix if per == "row" else columns_matrix
        assert np.allclose(reduce(matrix, per=per), np.array(expected))

    check(ut.nnz_per, "row", [2, 3])
    check(ut.nnz_per, "column", [1, 2, 2])

    check(ut.sum_per, "row", [3, 12])
    check(ut.sum_per, "column", [3, 5, 7])

    check(ut.max_per, "row", [2, 5])
    check(ut.max_per, "column", [3, 4, 5])

    check(ut.min_per, "row", [0, 3])
    check(ut.min_per, "column", [0, 1, 2])

    check(ut.sum_squared_per, "row", [5, 50])
    check(ut.sum_squared_per, "column", [9, 17, 29])

    check(ut.fraction_per, "row", [3 / 15, 12 / 15])
    check(ut.fraction_per, "column", [3 / 15, 5 / 15, 7 / 15])

    check(ut.mean_per, "row", [3 / 3, 12 / 3])
    check(ut.mean_per, "column", [3 / 2, 5 / 2, 7 / 2])

    # Variance is expected to be mean-of-squares minus squared-mean.
    check(ut.variance_per, "row", [5 / 3 - (3 / 3) ** 2, 50 / 3 - (12 / 3) ** 2])
    check(
        ut.variance_per,
        "column",
        [9 / 2 - (3 / 2) ** 2, 17 / 2 - (5 / 2) ** 2, 29 / 2 - (7 / 2) ** 2],
    )

    # Normalized variance is expected to be the variance divided by the mean (checked per column only).
    check(
        ut.normalized_variance_per,
        "column",
        [(9 / 2 - (3 / 2) ** 2) / (3 / 2), (17 / 2 - (5 / 2) ** 2) / (5 / 2), (29 / 2 - (7 / 2) ** 2) / (7 / 2)],
    )

    fractions_by_row = ut.to_numpy_matrix(ut.fraction_by(rows_matrix, by="row"))
    assert np.allclose(fractions_by_row, np.array([[0 / 3, 1 / 3, 2 / 3], [3 / 12, 4 / 12, 5 / 12]]))

    fractions_by_column = ut.to_numpy_matrix(ut.fraction_by(columns_matrix, by="column"))
    assert np.allclose(fractions_by_column, np.array([[0 / 3, 1 / 5, 2 / 7], [3 / 3, 4 / 5, 5 / 7]]))
Пример #3
0
def find_systematic_genes(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    low_gene_quantile: float = pr.systematic_low_gene_quantile,
    high_gene_quantile: float = pr.systematic_high_gene_quantile,
    to_property_name: str = "systematic_gene",
) -> None:
    """
    Find genes that are systematically higher or lower in the query compared to the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names.

    **Returns**

    Nothing is returned. Sets the following annotations in ``qdata``:

    Variable (Gene) Annotations
        ``systematic_gene`` (or ``to_property_name``)
            A boolean mask indicating whether the gene is systematically higher or lower in the query compared to the
            atlas.

    **Computation Parameters**

    1. Compute the fraction of each gene out of the total UMIs in both the atlas and the query. If ``atlas_total_umis``
       and/or ``query_total_umis`` are given, use them as the basis instead of the sum of the UMIs.

    2. Compute for each gene its ``low_gene_quantile`` (default: {low_gene_quantile}) fraction and its
       ``high_gene_quantile`` (default: {high_gene_quantile}) fraction, in both the query and the atlas.

    3. Mark as systematic the genes for which the low quantile value in the query is (strictly) above the atlas high
       quantile value.

    4. Mark as systematic the genes for which the low quantile value in the atlas is at least the query high quantile
       value.
    """
    assert 0 <= low_gene_quantile <= 1
    assert 0 <= high_gene_quantile <= 1
    assert np.all(adata.var_names == qdata.var_names)

    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")
    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")

    # The totals are passed through as-is; when ``None``, ``fraction_by`` is left to pick its own basis.
    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # The quantiles below are computed per gene (column), so switch to a column-major layout first.
    query_fractions = ut.to_layout(query_fractions, layout="column_major")
    atlas_fractions = ut.to_layout(atlas_fractions, layout="column_major")

    query_low_gene_values = ut.quantile_per(query_fractions, low_gene_quantile, per="column")
    atlas_low_gene_values = ut.quantile_per(atlas_fractions, low_gene_quantile, per="column")

    query_high_gene_values = ut.quantile_per(query_fractions, high_gene_quantile, per="column")
    atlas_high_gene_values = ut.quantile_per(atlas_fractions, high_gene_quantile, per="column")

    # NOTE(review): the two directions are not symmetric - ``>`` here but ``>=`` on the next line, so ties (e.g. both
    # values being zero) only count in the atlas-above-query direction. Confirm whether this is intended.
    query_above_atlas = query_low_gene_values > atlas_high_gene_values
    atlas_above_query = atlas_low_gene_values >= query_high_gene_values

    systematic = query_above_atlas | atlas_above_query

    ut.set_v_data(qdata, to_property_name, systematic)
Пример #4
0
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the atlas
    metacell in the projection of the query metacells. The sum of weights in each row (that is, for a single query
    metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the query
    metacell onto the atlas.

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating whether the query metacell is similar to its projection onto the atlas. If
            ``False`` the metacell is said to be "dissimilar", which may indicate the query contains cell states that
            do not appear in the atlas.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacells onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    # NOTE(review): nothing in the visible body of this function writes the ``similar`` annotation described above
    # (``qdata`` is only read here and is not passed to the per-metacell helper) - confirm where it is set.
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    # Default the totals to the per-observation sum of the data itself.
    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # Temporarily add the normalization in place, take the log into new arrays, then subtract it again, so both the
    # plain fractions and the log fractions are available without allocating a third copy of each matrix.
    atlas_fractions += fold_normalization
    query_fractions += fold_normalization

    atlas_log_fractions = np.log2(atlas_fractions)
    query_log_fractions = np.log2(query_fractions)

    atlas_fractions -= fold_normalization
    query_fractions -= fold_normalization

    # Choose which representation of the data is correlated and matched.
    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    # Per-query-metacell worker; returns a pair of vectors which are used below as the column indices and the values
    # of the matching row of the result.
    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    # Assemble the per-row results into the three arrays of a CSR matrix.
    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    # The row pointers are the running total of the per-row entry counts, starting from zero.
    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
Пример #5
0
def compute_knn_by_features(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_top_feature_genes: int = pr.max_top_feature_genes,
    similarity_value_normalization: float = pr.umap_similarity_value_normalization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    k: int,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    reproducible: bool = pr.reproducible,
) -> ut.PandasFrame:
    """
    Compute KNN graph between metacells based on feature genes.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute pearson correlations.

    **Input**

    Annotated ``adata`` where each observation is a metacell and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Sets the following in ``adata``:

    Observations-Pair (Metacells) Annotations
        ``obs_outgoing_weights``
            A sparse square matrix where each non-zero entry is the weight of an edge between a pair
            of cells or genes, where the sum of the weights of the outgoing edges for each element
            is 1 (there is always at least one such edge).

    Also return a pandas data frame of the similarities between the observations (metacells).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.tools.high.find_top_feature_genes` using ``max_top_feature_genes``
       (default: {max_top_feature_genes}) to pick the feature genes to use to compute similarities
       between the metacells.

    2. Compute the fractions of each gene in each metacell (out of all the genes), keep only the
       top feature genes, and add the ``similarity_value_normalization`` (default:
       {similarity_value_normalization}) to them.

    3. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using
       ``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default:
       {logistics_location}) and ``logistics_slope`` (default: {logistics_slope}).

    5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the similarities,
       ``k`` (no default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}),
       ``incoming_degree_factor`` (default: {incoming_degree_factor}), ``outgoing_degree_factor``
       (default: {outgoing_degree_factor}) to compute a "skeleton" graph to overlay on top of the
       UMAP graph.
    """
    tl.find_top_feature_genes(adata, max_genes=max_top_feature_genes)

    # Fractions are computed out of all the genes, and only then restricted to the feature genes.
    all_data = ut.get_vo_proper(adata, what, layout="row_major")
    all_fractions = ut.fraction_by(all_data, by="row")

    top_feature_genes_mask = ut.get_v_numpy(adata, "top_feature_gene")

    top_feature_genes_fractions = all_fractions[:, top_feature_genes_mask]
    top_feature_genes_fractions = ut.to_layout(top_feature_genes_fractions, layout="row_major")
    top_feature_genes_fractions = ut.to_numpy_matrix(top_feature_genes_fractions)

    top_feature_genes_fractions += similarity_value_normalization

    if similarity_log_data:
        top_feature_genes_fractions = ut.log_data(top_feature_genes_fractions, base=2)

    # The similarity is computed on a slice holding just the feature genes, matching the data's columns.
    tdata = ut.slice(adata, vars=top_feature_genes_mask)
    similarities = tl.compute_obs_obs_similarity(
        tdata,
        top_feature_genes_fractions,
        method=similarity_method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        inplace=False,
    )
    assert similarities is not None

    # The graph itself is stored in the full data, not in the feature genes slice.
    tl.compute_obs_obs_knn_graph(
        adata,
        similarities,
        k=k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
    )

    return similarities
Пример #6
0
def compute_outliers_matches(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: str = "similar",
    value_normalization: float = pr.outliers_value_normalization,
    reproducible: bool,
) -> None:
    """
    Match each outlier observation (cell) with the group (metacell) it is "most similar" to.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix. The ``group`` holds the group index of each cell; cells with a negative group index are the outliers.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations

        ``similar`` (default: {similar})
            For each outlier observation (cell), the index of the "most similar" group. This is ``-1`` for all the
            observations which are not outliers.

    **Computation Parameters**

    1. Compute the log2 of the fraction of each gene in each of the outlier cells and the group metacells using
       the ``value_normalization`` (default: {value_normalization}).

    2. Cross-correlate each of the outlier cells with each of the group metacells, in a ``reproducible`` manner, and
       pick for each outlier the group with the highest correlation.
    """
    groups_of_cells = ut.get_o_numpy(adata, group)
    is_outlier = groups_of_cells < 0
    outliers_adata = ut.slice(adata, obs=is_outlier)

    def _log_fractions(data: AnnData) -> np.ndarray:
        # Dense per-observation gene fractions, normalized and converted to log2 in place.
        fractions = ut.to_numpy_matrix(ut.fraction_by(ut.get_vo_proper(data, what, layout="row_major"), by="row"))
        fractions += value_normalization
        np.log2(fractions, out=fractions)
        return fractions

    correlations = ut.cross_corrcoef_rows(
        _log_fractions(outliers_adata),
        _log_fractions(gdata),
        reproducible=reproducible,
    )
    best_group_of_outliers = np.argmax(correlations, axis=1)
    assert len(best_group_of_outliers) == outliers_adata.n_obs

    # Scatter the per-outlier results back to all the cells; the rest keep the -1 marker.
    best_group_of_cells = np.full(adata.n_obs, -1, dtype="int32")
    best_group_of_cells[is_outlier] = best_group_of_outliers
    ut.set_o_data(adata, similar, best_group_of_cells)
Пример #7
0
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    This computes, for each metacell of the query, the fold factors between the actual query UMIs and the UMIs of the
    projection of the metacell onto the atlas (see :py:func:`metacells.tools.project.project_query_onto_atlas`). The
    result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, more genes need to
    be ignored by the projection, or somehow corrected for batch effects prior to computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, the ``projected`` UMIs of each query metacells onto the atlas, and the ``total_umis`` to use as the
    basis for computing gene fractions (there is no default; pass ``None`` explicitly to use the sum of the data of
    each metacell).

    **Returns**

    Nothing is returned. Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and its projection (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor
       log2((actual fraction + ``fold_normalization``) / (projected fraction + ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default ``fold_normalization`` is
       {fold_normalization}).

    2. Set the fold factor to zero for every case where the total UMIs in the query metacell and the projected image is
       not at least ``min_significant_gene_value`` (default: {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_fold_factor`` (default:
       {min_gene_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_fold_factor`` (default:
       {min_entry_fold_factor}), set the fold factor to zero (too low to be of interest). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    metacells_data = ut.get_vo_proper(adata, what, layout="row_major")
    projected_data = ut.get_vo_proper(adata, projected, layout="row_major")

    # Convert to dense numpy matrices before the in-place scalar arithmetic below (as done by the other projection
    # functions), since adding a scalar in place is not something every matrix representation supports.
    metacells_fractions = ut.to_numpy_matrix(ut.fraction_by(metacells_data, by="row", sums=total_umis))
    projected_fractions = ut.to_numpy_matrix(ut.fraction_by(projected_data, by="row", sums=total_umis))

    metacells_fractions += fold_normalization
    projected_fractions += fold_normalization

    dense_folds = metacells_fractions / projected_fractions
    dense_folds = np.log2(dense_folds, out=dense_folds)

    # Per-entry sum of the actual and projected UMIs; kept under its own name rather than reusing ``total_umis``,
    # which is the (unrelated) per-metacell totals parameter.
    combined_umis = ut.to_numpy_matrix(metacells_data + projected_data)  # type: ignore
    insignificant_folds_mask = combined_umis < min_significant_gene_value
    ut.log_calc("insignificant entries", insignificant_folds_mask)
    dense_folds[insignificant_folds_mask] = 0.0

    significant_folds = significant_folds_matrix(dense_folds, min_gene_fold_factor, min_entry_fold_factor, abs_folds)
    ut.set_vo_data(adata, "projected_fold", significant_folds)
Пример #8
0
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too unreliable to be used on
    cells.

    A gene is significant if it has both a high maximal expression in at least one metacell, and a wide range of
    expression across the metacells. Such genes are good candidates for being used as marker genes and/or to compute
    distances between metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level (fraction out of the observation's total) of each gene.

    2. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default: {min_gene_fraction}).

    3. Select the genes whose fold factor (log2 of maximal over minimal value, using the ``normalization``
       (default: {normalization})) is at least ``min_gene_range_fold`` (default: {min_gene_range_fold}).

    4. The significant genes are the ones selected by both of the above.
    """
    assert normalization >= 0

    umis = ut.get_vo_proper(adata, what, layout="row_major")
    gene_fractions = ut.to_layout(ut.fraction_by(umis, by="row"), layout="column_major")

    lowest_of_genes = ut.min_per(gene_fractions, per="column")
    highest_of_genes = ut.max_per(gene_fractions, per="column")

    # This mask must be taken before the maximal values are overwritten by the in-place fold computation below.
    is_high_gene = highest_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", is_high_gene)

    # Turn the maximal values, in place, into log2((max + normalization) / (min + normalization)).
    lowest_of_genes += normalization
    highest_of_genes += normalization
    highest_of_genes /= lowest_of_genes
    np.log2(highest_of_genes, out=highest_of_genes)

    is_wide_gene = highest_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", is_wide_gene)

    is_significant_gene = is_high_gene & is_wide_gene

    if not inplace:
        ut.log_return("significant_genes", is_significant_gene)
        return ut.to_pandas_series(is_significant_gene, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", is_significant_gene)
    return None