Exemplo n.º 1
0
def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve metacells which are too small, based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    The ``candidates`` are the candidate metacell index of each cell, given either as the name of a
    per-observation (cell) annotation or as an explicit vector.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment are given a metacell index of
            ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), these are written to the data, and the function returns
    ``None``. Otherwise they are returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with non-zero ``deviants`` (default: {deviants}) as "outliers". This can be
       the name of a per-observation (cell) annotation, or an explicit boolean mask of cells, or
       ``None`` if there are no deviant cells to mark.

    2. Any metacell which has fewer cells than ``min_metacell_cells`` is dissolved.

    3. We are trying to create metacells of size ``target_metacell_size``. Compute the sizes of the
       resulting metacells by summing the ``cell_sizes`` (default: {cell_sizes}). If it is ``None``,
       each cell has a size of one. These parameters are typically identical to the ones passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any
       metacell whose total size is at least ``target_metacell_size * min_robust_size_factor`` is
       preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is specified, then
       any remaining metacell whose size is at least
       ``target_metacell_size * min_convincing_size_factor`` is preserved, given it contains at
       least one gene whose fold factor (log2((actual + 1) / (expected + 1))) is at least
       ``min_convincing_gene_fold_factor`` (default: {min_convincing_gene_fold_factor}). If
       ``abs_folds``, consider the absolute fold factors. That is, we only preserve these smaller
       metacells if there is at least one gene whose expression is significantly different from the
       mean of the population.

    6. Any remaining metacell is dissolved into "outlier" cells.
    """
    # Work on a private copy, as the candidate indices are modified below.
    candidate_of_cells = np.copy(ut.get_o_numpy(adata, candidates, formatter=ut.groups_description))

    deviant_votes_of_cells = ut.maybe_o_numpy(adata, deviants, formatter=ut.mask_description)
    sizes_of_cells = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)

    # Deviant cells are removed from their candidate before the candidates are evaluated.
    if deviant_votes_of_cells is not None:
        candidate_of_cells[deviant_votes_of_cells > 0] = -1
    candidate_of_cells = ut.compress_indices(candidate_of_cells)
    candidates_count = np.max(candidate_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    min_robust_size = (
        None if min_robust_size_factor is None else target_metacell_size * min_robust_size_factor
    )
    ut.log_calc("min_robust_size", min_robust_size)

    min_convincing_size = (
        None if min_convincing_size_factor is None else target_metacell_size * min_convincing_size_factor
    )
    ut.log_calc("min_convincing_size", min_convincing_size)

    dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")
    dissolved_candidates_count = 0
    for candidate_index in range(candidates_count):
        cell_indices_of_candidate = np.flatnonzero(candidate_of_cells == candidate_index)
        is_kept = _keep_candidate(
            adata,
            candidate_index,
            data=data,
            cell_sizes=sizes_of_cells,
            fraction_of_genes=fraction_of_genes,
            min_metacell_cells=min_metacell_cells,
            min_robust_size=min_robust_size,
            min_convincing_size=min_convincing_size,
            min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
            abs_folds=abs_folds,
            candidates_count=candidates_count,
            candidate_cell_indices=cell_indices_of_candidate,
        )
        if is_kept:
            continue
        dissolved_of_cells[cell_indices_of_candidate] = True
        candidate_of_cells[cell_indices_of_candidate] = -1
        dissolved_candidates_count += 1

    # Dissolving candidates leaves holes in the indices, so they need to be compressed again.
    if dissolved_candidates_count > 0:
        metacell_of_cells = ut.compress_indices(candidate_of_cells)
    else:
        metacell_of_cells = candidate_of_cells

    if not inplace:
        ut.log_return("dissolved", dissolved_of_cells)
        ut.log_return("metacell", metacell_of_cells, formatter=ut.groups_description)

        obs_frame = ut.to_pandas_frame(index=adata.obs_names)
        obs_frame["dissolved"] = dissolved_of_cells
        obs_frame["metacell"] = metacell_of_cells
        return obs_frame

    ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)
    ut.set_o_data(adata, "metacell", metacell_of_cells, formatter=ut.groups_description)
    return None
Exemplo n.º 2
0
def _compute_elements_knn_graph(
    adata: AnnData,
    elements: str,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    k: int,
    balanced_ranks_factor: float,
    incoming_degree_factor: float,
    outgoing_degree_factor: float,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Compute the K-nearest-neighbors graph of either the observations or the variables.

    The ``elements`` must be either ``obs`` or ``var``, and ``what`` is the per-element-per-element
    similarity data to use. The similarities are ranked, balanced, pruned and finally converted to
    edge weights by the matching private helpers.

    The intermediate ``<elements>_balanced_ranks`` and ``<elements>_pruned_ranks`` matrices are
    always stored in ``adata``. The final ``<elements>_outgoing_weights`` matrix is stored in
    ``adata`` only if ``inplace``, in which case ``None`` is returned; otherwise it is only logged,
    and is returned as a pandas data frame indexed (in both rows and columns) by the element names.
    """
    assert elements in ("obs", "var")
    assert balanced_ranks_factor > 0.0
    assert incoming_degree_factor > 0.0
    assert outgoing_degree_factor > 0.0

    is_per_obs = elements == "obs"
    get_data = ut.get_oo_proper if is_per_obs else ut.get_vv_proper
    set_data = ut.set_oo_data if is_per_obs else ut.set_vv_data

    def describe_density(matrix: ut.CompressedMatrix) -> str:
        # Describe how many of the matrix elements are actually non-zero.
        elements_count = matrix.shape[0] * matrix.shape[1]
        return ut.ratio_description(elements_count, "element", matrix.nnz, "nonzero")

    def store_matrix(matrix: ut.CompressedMatrix, name: str, when: bool) -> None:
        # Either store the matrix in the data, or at least log its description.
        full_name = f"{elements}_{name}"
        if when:
            set_data(adata, full_name, matrix, formatter=describe_density)
            return
        if ut.logging_calc():
            ut.log_calc(full_name, describe_density(matrix))

    proper_similarity = ut.to_proper_matrix(get_data(adata, what))
    row_major_similarity = ut.to_layout(proper_similarity, "row_major", symmetric=True)
    similarity = ut.to_numpy_matrix(row_major_similarity)
    ut.log_calc("similarity", similarity)

    outgoing_ranks = _rank_outgoing(similarity)

    balanced_ranks = _balance_ranks(outgoing_ranks, k, balanced_ranks_factor)
    store_matrix(balanced_ranks, "balanced_ranks", True)

    pruned_ranks = _prune_ranks(balanced_ranks, k, incoming_degree_factor, outgoing_degree_factor)
    store_matrix(pruned_ranks, "pruned_ranks", True)

    outgoing_weights = _weigh_edges(pruned_ranks)
    store_matrix(outgoing_weights, "outgoing_weights", inplace)

    if inplace:
        return None

    names = adata.obs_names if is_per_obs else adata.var_names
    return ut.to_pandas_frame(outgoing_weights, index=names, columns=names)
Exemplo n.º 3
0
def _keep_candidate(  # pylint: disable=too-many-branches
    adata: AnnData,
    candidate_index: int,
    *,
    data: ut.ProperMatrix,
    cell_sizes: Optional[ut.NumpyVector],
    fraction_of_genes: ut.NumpyVector,
    min_metacell_cells: int,
    min_robust_size: Optional[float],
    min_convincing_size: Optional[float],
    min_convincing_gene_fold_factor: float,
    abs_folds: bool,
    candidates_count: int,
    candidate_cell_indices: ut.NumpyVector,
) -> bool:
    """
    Decide whether a single candidate metacell should be kept (``True``) or dissolved (``False``).

    The tests are applied in order, and the first one that matches decides:

    1. A candidate with fewer than ``min_metacell_cells`` cells is dissolved ("little").
    2. If ``min_robust_size`` is given, a candidate whose total size (the sum of its ``cell_sizes``,
       or its number of cells if these are ``None``) is at least that is kept ("robust").
    3. If ``min_convincing_size`` is ``None``, the candidate is kept ("accepted").
    4. A candidate whose total size is below ``min_convincing_size`` is dissolved ("unconvincing").
    5. Otherwise, the candidate is kept only if at least one gene has a fold factor
       (log2((actual + 1) / (expected + 1)), or its absolute value if ``abs_folds``) of at least
       ``min_convincing_gene_fold_factor``, where the expected value of a gene is its
       ``fraction_of_genes`` times the total of the candidate's data.

    The ``adata``, ``candidate_index`` and ``candidates_count`` are only used for logging.
    """
    cells_count = candidate_cell_indices.size
    if cell_sizes is None:
        candidate_total_size = cells_count
    else:
        candidate_total_size = np.sum(cell_sizes[candidate_cell_indices])

    def verdict_message(verdict: str) -> str:
        # Only called when logging, so the progress description is not computed otherwise.
        progress = ut.progress_description(candidates_count, candidate_index, "candidate")
        return (
            f"- candidate: {progress} "
            f"cells: {cells_count} "
            f"size: {candidate_total_size:g} "
            f"is: {verdict}"
        )

    if cells_count < min_metacell_cells:
        if ut.logging_calc():
            ut.log_calc(verdict_message("little"))
        return False

    if min_robust_size is not None and candidate_total_size >= min_robust_size:
        if ut.logging_calc():
            ut.log_calc(verdict_message("robust"))
        return True

    if min_convincing_size is None:
        if ut.logging_calc():
            ut.log_calc(verdict_message("accepted"))
        return True

    if candidate_total_size < min_convincing_size:
        if ut.logging_calc():
            ut.log_calc(verdict_message("unconvincing"))
        return False

    # Compute the per-gene fold factors, in place, to avoid allocating additional vectors.
    fold_of_genes = ut.to_numpy_vector(data[candidate_cell_indices, :].sum(axis=0))
    assert fold_of_genes.size == data.shape[1]
    expected_of_genes = fraction_of_genes * np.sum(fold_of_genes)
    expected_of_genes += 1
    fold_of_genes += 1
    fold_of_genes /= expected_of_genes
    np.log2(fold_of_genes, out=fold_of_genes)

    tested_fold_of_genes = np.abs(fold_of_genes) if abs_folds else fold_of_genes
    convincing_genes_mask = tested_fold_of_genes >= min_convincing_gene_fold_factor
    keep_candidate = bool(np.any(convincing_genes_mask))

    if ut.logging_calc():
        convincing_gene_indices = np.where(convincing_genes_mask)[0]
        if not keep_candidate:
            ut.log_calc(verdict_message("not convincing"))
        else:
            ut.log_calc(verdict_message("convincing because:"))
            # List the convincing genes, from the highest (signed) fold factor to the lowest.
            folds_and_names = sorted(
                zip(fold_of_genes[convincing_gene_indices], adata.var_names[convincing_gene_indices])
            )
            for fold_factor, name in reversed(folds_and_names):
                ut.log_calc(f"    {name}: {ut.fold_description(fold_factor)}")

    return keep_candidate
Exemplo n.º 4
0
def _identify_cells(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    related_gene_indices_of_modules: List[List[int]],
    min_cell_module_total: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    rare_module_of_cells: ut.NumpyVector,
) -> None:
    """
    Associate cells with the gene modules they strongly express.

    For each module with a non-empty list of genes, the "strong" cells are the ones whose total
    ``what`` data of the genes of the module is at least ``min_cell_module_total``. If there are
    more than ``max_cells_of_modules`` or fewer than ``min_cells_of_modules`` such cells, the
    module is discarded by clearing its list of gene indices (in place).

    Otherwise, the strength of each cell in the module is its total divided by the median total of
    the strong cells of the module. A strong cell is associated with the module if this strength is
    at least as high as the strength it had in any previously associated module.

    The results are written, in place, into ``rare_module_of_cells``; nothing is returned.
    """
    max_strength_of_cells = np.zeros(adata_of_all_genes_of_all_cells.n_obs)

    ut.log_calc("cells for modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if not related_gene_indices_of_module:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            adata_of_related_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.related_genes",
                vars=related_gene_indices_of_module,
                top_level=False,
            )
            total_related_genes_of_all_cells = ut.get_o_numpy(
                adata_of_related_genes_of_all_cells, what, sum=True
            )

            mask_of_strong_cells_of_module = total_related_genes_of_all_cells >= min_cell_module_total
            strong_cells_count = np.sum(mask_of_strong_cells_of_module)

            # Reject the module before doing any further work on it. The "too many" test takes
            # precedence over the "too few" one.
            if strong_cells_count > max_cells_of_modules:
                rejection = " (too many)"
            elif strong_cells_count < min_cells_of_modules:
                rejection = " (too few)"
            else:
                rejection = None

            if rejection is not None:
                if ut.logging_calc():
                    ut.log_calc("strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + rejection)
                related_gene_indices_of_module.clear()
                continue

            ut.log_calc("strong_cells", mask_of_strong_cells_of_module)

            # Only compute the median for modules that survived the tests above; in particular,
            # this avoids computing the median of an empty selection for a module with no strong
            # cells, which would trigger a numpy warning for a module that is discarded anyway.
            median_strength_of_module = np.median(
                total_related_genes_of_all_cells[mask_of_strong_cells_of_module]
            )

            strength_of_all_cells = total_related_genes_of_all_cells / median_strength_of_module
            # A cell is only (re-)assigned if this module is at least as strong as its best so far.
            mask_of_strong_cells_of_module &= strength_of_all_cells >= max_strength_of_cells
            max_strength_of_cells[mask_of_strong_cells_of_module] = strength_of_all_cells[
                mask_of_strong_cells_of_module
            ]

            rare_module_of_cells[mask_of_strong_cells_of_module] = module_index
Exemplo n.º 5
0
def _compress_modules(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    target_metacell_size: float,
    min_modules_size_factor: float,
    related_gene_indices_of_modules: List[List[int]],
    rare_module_of_cells: ut.NumpyVector,
) -> List[List[int]]:
    """
    Discard the modules that ended up with too few or too many cells, and reindex the rest.

    A module with a non-empty list of genes is kept only if the number of cells associated with it
    (in ``rare_module_of_cells``) is between ``min_cells_of_modules`` and ``max_cells_of_modules``
    (inclusive), and the total ``what`` data of all the genes in these cells is at least
    ``target_metacell_size * min_modules_size_factor``. The cells of a discarded module are
    reset to ``-1``.

    The kept modules are given consecutive indices, in their original order; ``rare_module_of_cells``
    is updated in place accordingly.

    Returns the list of the gene indices of each of the kept modules.
    """
    list_of_rare_gene_indices_of_modules: List[List[int]] = []

    # The following two lists are only used for (and only filled when) logging the final modules.
    cell_counts_of_modules: List[int] = []
    list_of_names_of_genes_of_modules: List[List[str]] = []

    min_umis_of_modules = target_metacell_size * min_modules_size_factor
    ut.log_calc("min_umis_of_modules", min_umis_of_modules)

    total_all_genes_of_all_cells = ut.get_o_numpy(adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("compress modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if not gene_indices_of_module:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            module_cells_mask = rare_module_of_cells == module_index
            module_cells_count = np.sum(module_cells_mask)
            module_umis_count = np.sum(total_all_genes_of_all_cells[module_cells_mask])

            if module_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            if module_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too many)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("cells", module_cells_count)

            if module_umis_count < min_umis_of_modules:
                if ut.logging_calc():
                    ut.log_calc("UMIs", str(module_umis_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("UMIs", module_umis_count)

            # The new index is never larger than the old one, and the modules are visited in
            # ascending order, so it can't collide with the index of a module we have yet to visit.
            next_module_index = len(list_of_rare_gene_indices_of_modules)
            if module_index != next_module_index:
                ut.log_calc("is reindexed to", next_module_index)
                rare_module_of_cells[module_cells_mask] = next_module_index

            list_of_rare_gene_indices_of_modules.append(gene_indices_of_module)

            if ut.logging_calc():
                # The mask was not modified, so the count computed above is still valid.
                cell_counts_of_modules.append(module_cells_count)
                list_of_names_of_genes_of_modules.append(
                    sorted(adata_of_all_genes_of_all_cells.var_names[gene_indices_of_module])
                )

    if ut.logging_calc():
        ut.log_calc("final modules:")
        for module_index, (module_cells_count, module_gene_names) in enumerate(
            zip(cell_counts_of_modules, list_of_names_of_genes_of_modules)
        ):
            ut.log_calc(f"- module: {module_index} cells: {module_cells_count} genes: {module_gene_names}")

    return list_of_rare_gene_indices_of_modules
Exemplo n.º 6
0
def find_rare_gene_modules(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_gene_cell_fraction: float = pr.rare_max_gene_cell_fraction,
    min_gene_maximum: int = pr.rare_min_gene_maximum,
    genes_similarity_method: str = pr.rare_genes_similarity_method,
    genes_cluster_method: str = pr.rare_genes_cluster_method,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    min_genes_of_modules: int = pr.rare_min_genes_of_modules,
    min_cells_of_modules: int = pr.rare_min_cells_of_modules,
    target_pile_size: int = pr.min_target_pile_size,
    max_cells_factor_of_random_pile: float = pr.rare_max_cells_factor_of_random_pile,
    target_metacell_size: float = pr.target_metacell_size,
    min_modules_size_factor: float = pr.rare_min_modules_size_factor,
    min_module_correlation: float = pr.rare_min_module_correlation,
    min_related_gene_fold_factor: float = pr.rare_min_related_gene_fold_factor,
    max_related_gene_increase_factor: float = pr.rare_max_related_gene_increase_factor,
    min_cell_module_total: int = pr.rare_min_cell_module_total,
    reproducible: bool = pr.reproducible,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Detect rare genes modules based on ``what`` (default: {what}) data.

    Rare gene modules are made of genes which are weakly and rarely expressed, yet are highly
    correlated with each other, which allows for detecting them robustly. Global analysis
    algorithms (such as metacells) tend to ignore, or at least discount, such genes.

    It is therefore useful to explicitly identify, in a pre-processing step, the few cells which
    express such rare gene modules. Once identified, these cells can be exempt from the global
    algorithm, or the global algorithm can be tweaked in some way to pay extra attention to them.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute pearson correlations.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cells_rare_gene_module``
            The index of the rare gene module each cell expresses the most, or ``-1`` in the common
            case it does not express any rare genes module.

        ``rare_cell``
            A boolean mask for the (few) cells that express a rare gene module.

    Variable (Gene) Annotations
        ``rare_gene_module_<N>``
            A boolean mask for the genes in the gene module with index ``N``.

        ``rare_gene``
            A boolean mask for the genes in any of the rare gene modules.

    If ``inplace``, these are written to the data, and the function returns ``None``. Otherwise
    they are returned as a tuple containing two data frames.

    **Computation Parameters**

    1. Pick as candidates all genes that are expressed in at most ``max_gene_cell_fraction``
       (default: {max_gene_cell_fraction}) of the cells, and whose maximal value in a cell is at
       least ``min_gene_maximum`` (default: {min_gene_maximum}), as long as they do not match the
       ``forbidden_gene_names`` or the ``forbidden_gene_patterns``.

    2. Compute the similarity between the genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    3. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    4. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` genes (default: {min_genes_of_modules}), with an average gene-gene
       cross-correlation of at least ``min_module_correlation`` (default:
       {min_module_correlation}).

    5. Consider the cells whose total number of UMIs of the genes of the rare gene module is at
       least ``min_cell_module_total`` (default: {min_cell_module_total}). If the expected number of
       such cells in a random pile of size ``target_pile_size`` (default: {target_pile_size}) is
       more than ``max_cells_factor_of_random_pile`` (default: {max_cells_factor_of_random_pile})
       times the mean number of cells in a metacell, then discard the rare gene module as not that
       rare after all.

    6. Add to the gene module all genes whose fraction in cells expressing any of the genes in the
       rare gene module is at least 2^``min_related_gene_fold_factor`` (default:
       {min_related_gene_fold_factor}) times their fraction in the rest of the population, as long
       as their maximal value in one of the expressing cells is at least ``min_gene_maximum``,
       as long as this doesn't add more than ``max_related_gene_increase_factor`` times the original
       number of cells to the rare gene module, and as long as they do not match the
       ``forbidden_gene_names`` or the ``forbidden_gene_patterns``. If a gene is above the threshold
       for multiple gene modules, associate it with the gene module for which its fold factor is
       higher.

    7. Associate cells with the rare gene module if they contain at least ``min_cell_module_total``
       (default: {min_cell_module_total}) UMIs of the expanded rare gene module. If a cell meets the
       above threshold for several rare gene modules, it is associated with the one for which its
       total UMIs of the module, relative to the median such total of the cells meeting the
       threshold for that module, is the highest.

    8. Discard modules which have less than ``min_cells_of_modules`` (default:
       {min_cells_of_modules}) cells, or too many cells (as in step 5), or whose total UMIs are
       less than the ``target_metacell_size`` (default: {target_metacell_size}) times the
       ``min_modules_size_factor`` (default: {min_modules_size_factor}).
    """
    assert min_cells_of_modules > 0
    assert min_genes_of_modules > 0

    # Convert the target metacell size (in UMIs) to an expected number of cells per metacell.
    total_umis = np.sum(ut.get_v_numpy(adata, what, sum=True))
    mean_umis_per_cell = total_umis / adata.n_obs
    mean_metacells_size = target_metacell_size / mean_umis_per_cell
    ut.log_calc("mean_metacells_size", mean_metacells_size)
    max_cells_of_random_pile = mean_metacells_size * max_cells_factor_of_random_pile
    ut.log_calc("max_cells_of_random_pile", max_cells_of_random_pile)

    forbidden_genes_mask = find_named_genes(
        adata, names=forbidden_gene_names, patterns=forbidden_gene_patterns
    )
    assert forbidden_genes_mask is not None

    allowed_genes_mask = ~forbidden_genes_mask.values
    ut.log_calc("allowed_genes_mask", allowed_genes_mask)

    # Initially, no cell is associated with any rare gene module.
    rare_module_of_cells = np.full(adata.n_obs, -1, dtype="int32")

    def finish(gene_indices_of_modules: List[List[int]]) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
        # Package the (possibly empty) results, either in the data or as data frames.
        return _results(
            adata=adata,
            rare_module_of_cells=rare_module_of_cells,
            list_of_rare_gene_indices_of_modules=gene_indices_of_modules,
            inplace=inplace,
        )

    picked = _pick_candidates(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        max_gene_cell_fraction=max_gene_cell_fraction,
        min_gene_maximum=min_gene_maximum,
        min_genes_of_modules=min_genes_of_modules,
        allowed_genes_mask=allowed_genes_mask,
    )
    if picked is None:
        return finish([])
    candidate_data, candidate_genes_indices = picked

    genes_similarities = _genes_similarity(
        candidate_data=candidate_data,
        what=what,
        method=genes_similarity_method,
        reproducible=reproducible,
    )

    genes_linkage = _cluster_genes(
        similarities_between_candidate_genes=genes_similarities,
        genes_cluster_method=genes_cluster_method,
    )

    rare_gene_indices_of_modules = _identify_genes(
        candidate_genes_indices=candidate_genes_indices,
        similarities_between_candidate_genes=genes_similarities,
        linkage=genes_linkage,
        min_module_correlation=min_module_correlation,
    )

    # Scale the maximal number of cells from a single random pile to the whole data.
    max_cells_of_modules = int(max_cells_of_random_pile * adata.n_obs / target_pile_size)
    ut.log_calc("max_cells_of_modules", max_cells_of_modules)

    related_gene_indices_of_modules = _related_genes(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        rare_gene_indices_of_modules=rare_gene_indices_of_modules,
        allowed_genes_mask=allowed_genes_mask,
        min_genes_of_modules=min_genes_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        min_gene_maximum=min_gene_maximum,
        min_related_gene_fold_factor=min_related_gene_fold_factor,
        max_related_gene_increase_factor=max_related_gene_increase_factor,
    )

    # This modifies both the modules gene lists and the per-cell module indices in place.
    _identify_cells(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        rare_module_of_cells=rare_module_of_cells,
    )

    compressed_gene_indices_of_modules = _compress_modules(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        target_metacell_size=target_metacell_size,
        min_modules_size_factor=min_modules_size_factor,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        rare_module_of_cells=rare_module_of_cells,
    )

    return finish(compressed_gene_indices_of_modules)
Exemplo n.º 7
0
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    """
    Extend each large-enough rare gene module with additional "related" genes.

    Modules with fewer than ``min_genes_of_modules`` rare genes are ignored, and so are modules
    whose number of expressing cells (cells with a non-zero total of the module's rare genes) is
    above ``max_cells_of_modules`` or below ``min_cells_of_modules``.

    For each remaining module, a gene is a candidate if it is in the ``allowed_genes_mask``, its
    maximal value in the expressing cells is at least ``min_gene_maximum``, and its fraction out of
    the total of the expressing cells is at least ``2**min_related_gene_fold_factor`` times its
    fraction out of the total of all the other cells. A candidate is added to the module unless it
    is a rare gene of some (large enough) module, or unless adding it to the module's rare genes
    brings the number of cells whose total is at least ``min_cell_module_total`` above
    ``max_related_gene_increase_factor`` times the number of such cells for the rare genes alone.

    Returns, for each module that was not skipped, the list of its rare gene indices followed by
    the indices of the accepted related genes. Since modules may be skipped, the result may be
    shorter than ``rare_gene_indices_of_modules``.
    """
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    # The rare genes of every large-enough module; these are never added as "related" genes.
    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(rare_gene_indices_of_module)

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        # Modules are numbered by their order among the large-enough modules, even if skipped below.
        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[rare_gene_indices_of_module]),
            )

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )

            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True
            )

            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0

            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )

            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module, what, sum=True
            )

            data = ut.get_vo_proper(adata_of_all_genes_of_expressed_cells_of_module, what, layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data, per="column")

            # Whatever is not in the expressing cells is the "background" the module is compared to.
            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes - total_expressed_cells_umis_of_all_genes
            )

            # Use the vectorized ``np.sum`` rather than the builtin ``sum``, which would iterate over
            # the per-gene vector one element at a time in Python.
            expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / np.sum(
                total_expressed_cells_umis_of_all_genes
            )

            background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / np.sum(
                total_background_cells_umis_of_all_genes
            )

            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (
                    expressed_cells_fraction_of_all_genes
                    >= background_cells_fraction_of_all_genes * (2**min_related_gene_fold_factor)
                )
            )

            related_gene_indices = np.where(mask_of_related_genes)[0]
            # The module's own rare genes are expected to pass the candidate filters.
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module,
            )
            total_base_genes_of_all_cells = ut.get_o_numpy(base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names",
                    sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices]),
                )
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            # Build the set once so the per-candidate membership test does not scan a list.
            rare_gene_set_of_module = set(rare_gene_indices_of_module)

            for gene_index in related_gene_indices:
                if gene_index in rare_gene_set_of_module:
                    continue  # Already in the module (as one of its rare genes).

                gene_name = adata_of_all_genes_of_all_cells.var_names[gene_index]

                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(f"- candidate gene {gene_name} " f"belongs to another module")
                    continue

                related_gene_of_all_cells_adata = ut.slice(
                    adata_of_all_genes_of_all_cells,
                    name=f".{gene_name}",
                    vars=np.array([gene_index]),
                )
                assert related_gene_of_all_cells_adata.n_vars == 1
                total_related_genes_of_all_cells = ut.get_o_numpy(related_gene_of_all_cells_adata, what, sum=True)
                total_related_genes_of_all_cells += total_base_genes_of_all_cells
                mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total
                count_of_strong_related_cells = np.sum(mask_of_strong_related_cells)
                ut.log_calc(
                    f"- candidate gene {gene_name} "
                    f"strong cells: {count_of_strong_related_cells} "
                    f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                )
                # Reject a gene which, by itself, adds too many "strong" cells to the module.
                if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                    continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(related_gene_indices_of_module)

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules
Exemplo n.º 8
0
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at
    the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
    size grows. Above O(10,000) you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation
        matrix, and only then extracts the sparse graph out of it. We actually need two copies where
        each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
        O(100,000,000,000). In addition, the implementation is serial for the graph clustering
        phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation
        phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
        phase. This might increase the "reasonable" size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to
        O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
        scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
        very small number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables
    are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    The extracted "feature" data. Raises a ``ValueError`` if the feature data is empty.

    In addition, sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a
            similar expression level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant). This will be zero for non-"feature" genes.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells. This is only set if the K-Nearest-Neighbors graph was computed.

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment ("outliers") are given a metacell
            index of ``-1``.

        ``outlier``
            A boolean mask of the cells contained in no metacell.

    Observation-Observation (Cell-Cell) Annotations
        ``obs_similarity``, ``obs_outgoing_weights``
            Copied from the feature data, only if the K-Nearest-Neighbors graph was computed.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data
       from the clean data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}), ``feature_min_gene_top3``
       (default: {feature_min_gene_top3}), ``feature_min_gene_relative_variance`` (default:
       {feature_min_gene_relative_variance}), ``feature_gene_names`` (default:
       {feature_gene_names}), ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}), ``forbidden_gene_patterns``
       (default: {forbidden_gene_patterns}) and ``random_seed`` (default: {random_seed}) to make
       this replicable.

    2. Fetch the ``downsampled`` data of the feature data, and, if it is positive, add the
       ``cells_similarity_value_normalization`` (default: {cells_similarity_value_normalization}) to
       it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
       similarity between each pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}). This is requested to be
       reproducible unless the ``random_seed`` is zero.

    5. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}) to get the effective cell
       sizes to use. If this results in a maximal cell size, then the ``target_metacell_size``
       (default: {target_metacell_size}) is increased, if needed, to be at least this maximal cell
       size times each of the ``candidates_min_metacell_cells`` and the
       ``dissolve_min_metacell_cells`` (if they are not ``None``).

    6. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a
       K-Nearest-Neighbors graph, using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).
       If ``knn_k`` (default: {knn_k}) is not specified, then it is
       chosen to be the median number of cells required to reach the target metacell size,
       but at least ``min_knn_k`` (default: {min_knn_k}). If the resulting K is zero, or is at
       least the number of cells, then the graph is not computed, and instead all the cells are
       placed in a single candidate metacell (skipping the next step).

    7. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
       the candidate metacells, using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       ``candidates_max_split_min_cut_strength`` (default: {candidates_max_split_min_cut_strength}),
       ``candidates_min_cut_seed_cells`` (default: {candidates_min_cut_seed_cells}),
       ``must_complete_cover`` (default: {must_complete_cover})
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
       metacells, using the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
       metacells, using the same
       ``target_metacell_size`` (default: {target_metacell_size}),
       and the effective cell sizes
       and the
       ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
       ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
       ``dissolve_min_convincing_gene_fold_factor`` (default: {dissolve_min_convincing_gene_fold_factor})
       and
       ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).

    10. If ``must_complete_cover``, then instead of the previous two steps, every candidate
        metacell becomes a metacell, all the deviant votes are set to zero, and no cell is marked as
        dissolved or as an outlier.
    """
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    # Ensure the target size can hold the minimal number of cells even if all are of the maximal size.
    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    # Work on a private dense copy, since the data is modified in-place below.
    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    if knn_k is None:
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    if knn_k == 0 or knn_k >= fdata.n_obs:
        # A degenerate K makes the graph meaningless; place all the cells in a single candidate.
        if knn_k == 0:
            ut.log_calc("knn_k: 0 (too small, try single metacell)")
        else:
            ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    else:
        ut.log_calc("knn_k", knn_k)

        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        # Expose the graph-related results, computed on the feature data, in the full data.
        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        assert np.min(candidate_of_cells) == 0

        # Every cell is covered by a candidate, so nothing is deviant, dissolved or an outlier.
        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")
        outlier_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)

        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

        # Set the documented ``outlier`` mask in this code path as well (there are no outliers).
        ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

        metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

        outlier_of_cells = metacell_of_cells < 0
        ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata
Exemplo n.º 9
0
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the
    mask as an annotation (per-variable or per-observation depending on the type of the combined masks).

    Raises a ``KeyError`` if a required mask does not exist, and a ``ValueError`` if the masks mix
    per-observation and per-variable data, or if no mask was found at all.

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or if
       it has a ``&`` prefix), bitwise-AND the mask into the AND mask. The mask is the positive
       entries of the fetched data (that is, ``data > 0``).

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None

    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        log_mask_name = mask_name

        # Use ``startswith``/``endswith`` rather than indexing, so that an empty (or prefix-only)
        # name is reported as an unknown mask instead of raising an ``IndexError``.
        is_or = mask_name.startswith("|")
        if is_or or mask_name.startswith("&"):
            mask_name = mask_name[1:]

        invert_mask = mask_name.startswith("~")
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = not mask_name.endswith("?")
        if not must_exist:
            mask_name = mask_name[:-1]

        # Per-observation data takes precedence if the same name exists for both.
        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # Defensive check; the comparison above is expected to always give a boolean vector.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    if and_mask is None and or_mask is None:
        # All the masks were optional, and none of them exists.
        raise ValueError("no masks to combine")

    if and_mask is None:
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None
Exemplo n.º 10
0
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    This computes, for each metacell of the query, the fold factors between the actual query UMIs and the UMIs of the
    projection of the metacell onto the atlas (see :py:func:`metacells.tools.project.project_query_onto_atlas`). The
    result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, more genes need to
    be ignored by the projection, or somehow corrected for batch effects prior to computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, the ``projected`` UMIs of each query metacells onto the atlas, and the ``total_umis``, which are
    used as the per-metacell sums when converting both the query and the projected data to fractions.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and its projection (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor
       log2((actual fraction + ``fold_normalization``) / (expected fraction + ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default ``fold_normalization`` is
       {fold_normalization}).

    2. Set the fold factor to zero for every case where the total UMIs in the query metacell and the projected image is
       not at least ``min_significant_gene_value`` (default: {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_fold_factor`` (default:
       {min_gene_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_fold_factor`` (default:
       {min_entry_fold_factor}), set the fold factor to zero (too low to be of interest). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    metacells_data = ut.get_vo_proper(adata, what, layout="row_major")
    projected_data = ut.get_vo_proper(adata, projected, layout="row_major")

    # Both the query and the projected data are scaled using the same per-metacell sums.
    metacells_fractions = ut.fraction_by(metacells_data, by="row", sums=total_umis)
    projected_fractions = ut.fraction_by(projected_data, by="row", sums=total_umis)

    metacells_fractions += fold_normalization  # type: ignore
    projected_fractions += fold_normalization  # type: ignore

    dense_folds = metacells_fractions / projected_fractions  # type: ignore
    dense_folds = np.log2(dense_folds, out=dense_folds)

    # Use a separate name for the per-entry totals instead of rebinding the ``total_umis`` parameter.
    total_umis_of_entries = ut.to_numpy_matrix(metacells_data + projected_data)  # type: ignore
    insignificant_folds_mask = total_umis_of_entries < min_significant_gene_value
    ut.log_calc("insignificant entries", insignificant_folds_mask)
    dense_folds[insignificant_folds_mask] = 0.0

    significant_folds = significant_folds_matrix(dense_folds, min_gene_fold_factor, min_entry_fold_factor, abs_folds)
    ut.set_vo_data(adata, "projected_fold", significant_folds)
Exemplo n.º 11
0
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: bool = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The resulting per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contain "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``. The ``group`` (default: {group}) is the per-cell annotation (or explicit vector) holding the
    group of each cell.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless the
            value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest).

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor`` (default:
       {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).

    If ``inner_abs_folds`` (default: {inner_abs_folds}), both of the thresholds above are compared against the absolute
    fold factors; the values that are kept retain their sign.
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    # Thin closure so that the parallel map only needs to pass the metacell index.
    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(
            metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    # One result vector per metacell, stacked into a metacells-by-genes dense matrix.
    results = list(
        ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row,
                                               layout="column_major")

    # The values compared against the thresholds. With ``inner_abs_folds`` this is a separate array of absolute
    # values; otherwise it is the very same array that is being zeroed below.
    if inner_abs_folds:
        comparable_dense_inner_folds_by_column = np.abs(
            dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column

    # Per-gene threshold: zero whole columns of genes that are never significant in any metacell.
    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column,
                                   per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0

    # Per-entry threshold: zero the individual entries which are too low.
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column <
                                min_entry_inner_fold_factor] = 0

    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column,
                                            layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
Exemplo n.º 12
0
def compute_type_compatible_sizes(
    adatas: List[AnnData],
    *,
    size: str = "grouped",
    kind: str = "type",
) -> None:
    """
    Given multiple annotated data of groups, compute a "compatible" size for each one to allow for
    consistent inner normalized variance comparison.

    Since the inner normalized variance quality measure is sensitive to the group (metacell) sizes,
    it is useful to artificially shrink the groups so the sizes will be similar between the compared
    data sets. Assuming each group (metacell) has a type annotation, for each such type, we give
    each one a "compatible" size (less than or equal to its actual size) so that using this reduced
    size will give us comparable measures between all the data sets.

    The "compatible" sizes are chosen such that the density distributions of the sizes in all data
    sets would be as similar to each other as possible.

    .. note::

        This is only effective if the groups are "similar" in size. Using this to compare very coarse
        grouping (few thousands of cells) with fine-grained ones (few dozens of cells) will still
        result in very different results.

    **Input**

    Several annotated ``adatas`` where each observation is a group. Should contain per-observation
    ``size`` annotation (default: {size}) and ``kind`` annotation (default: {kind}).

    **Returns**

    Sets the following in each ``adata``:

    Per-Observation (group) Annotations:

        ``compatible_size``
            The number of grouped cells in the group to use for computing excess R^2 and inner
            normalized variance.

    **Computation**

    If only a single data set is given, the compatible size of each group is simply its actual size.
    Likewise, the groups of a type which exists in only one of the data sets keep their actual size.
    Otherwise:

    1. For each type, sort the groups (metacells) in increasing number of grouped observations (cells).

    2. Consider the maximal quantile (rank) of the next smallest group (metacell) in each data set.

    3. Compute the minimal number of grouped observations in all the metacells whose quantile is up
       to this maximal quantile.

    4. Use this as the "compatible" size for all these groups, and remove them from consideration.

    5. Loop until all groups are assigned a "compatible" size.
    """
    assert len(adatas) > 0
    if len(adatas) == 1:
        ut.set_o_data(
            adatas[0], "compatible_size",
            ut.get_o_numpy(adatas[0], size, formatter=ut.sizes_description))
        return

    group_sizes_of_data = [
        ut.get_o_numpy(adata, size, formatter=ut.sizes_description)
        for adata in adatas
    ]
    group_types_of_data = [ut.get_o_numpy(adata, kind) for adata in adatas]

    unique_types: Set[Any] = set()
    for group_types in group_types_of_data:
        unique_types.update(group_types)

    # Filled in below; -1 marks "not assigned yet" and is verified to be gone at the end.
    compatible_size_of_data = [np.full(adata.n_obs, -1) for adata in adatas]

    groups_count_of_data: List[int] = []
    for type_index, group_type in enumerate(sorted(unique_types)):
        with ut.log_step(
                f"- {group_type}",
                ut.progress_description(len(unique_types), type_index,
                                        "type")):
            # For each data set, the indices of the groups of the current type, ordered by increasing
            # group size. The type mask is in the original group order, so it must be applied BEFORE
            # sorting: first pick the groups of the type, then sort only these by their size.
            sorted_group_indices_of_data = []
            for group_sizes, group_types in zip(group_sizes_of_data,
                                                group_types_of_data):
                type_group_indices = np.where(group_types == group_type)[0]
                size_order = np.argsort(group_sizes[type_group_indices],
                                        kind="stable")
                sorted_group_indices_of_data.append(
                    type_group_indices[size_order])

            groups_count_of_data = [
                len(sorted_group_indices)
                for sorted_group_indices in sorted_group_indices_of_data
            ]

            ut.log_calc("group_counts", groups_count_of_data)

            # Keep only the entries of the data sets which contain some group(s) of the current type.
            def _for_each(value_of_data: List[T]) -> List[T]:
                return [
                    value for groups_count, value in zip(
                        groups_count_of_data, value_of_data)
                    if groups_count > 0
                ]

            groups_count_of_each = _for_each(groups_count_of_data)

            if not groups_count_of_each:
                continue

            sorted_group_indices_of_each = _for_each(
                sorted_group_indices_of_data)
            group_sizes_of_each = _for_each(group_sizes_of_data)
            compatible_size_of_each = _for_each(compatible_size_of_data)

            if len(groups_count_of_each) == 1:
                # The type exists in a single data set, so there is nothing to be compatible with:
                # each group keeps its actual size (the general loop below would compute the same).
                compatible_size_of_each[0][
                    sorted_group_indices_of_each[0]] = group_sizes_of_each[0][
                        sorted_group_indices_of_each[0]]
                continue

            # The quantile (in (0, 1]) of each of the sorted groups within its data set.
            group_quantile_of_each = [
                (np.arange(len(sorted_group_indices)) + 1) /
                len(sorted_group_indices)
                for sorted_group_indices in sorted_group_indices_of_each
            ]

            # For each data set, the position (in the sorted order) of the first unassigned group.
            next_position_of_each = np.full(len(group_quantile_of_each), 0)

            while True:
                next_quantile_of_each = [
                    group_quantile[next_position]
                    for group_quantile, next_position in zip(
                        group_quantile_of_each, next_position_of_each)
                ]
                next_quantile = max(next_quantile_of_each)

                # Advance each data set to cover all the groups up to the maximal next quantile; this
                # always consumes at least one group from each data set.
                last_position_of_each = next_position_of_each.copy()
                next_position_of_each[:] = [
                    np.sum(group_quantile <= next_quantile)
                    for group_quantile in group_quantile_of_each
                ]

                positions_of_each = [
                    range(last_position, next_position)
                    for last_position, next_position in zip(
                        last_position_of_each, next_position_of_each)
                ]

                sizes_of_each = [
                    group_sizes[sorted_group_indices[positions]]
                    for group_sizes, sorted_group_indices, positions in zip(
                        group_sizes_of_each, sorted_group_indices_of_each,
                        positions_of_each)
                ]

                # The smallest group in the current batch (across all data sets) determines the
                # compatible size of all the groups in the batch.
                min_size = min(np.min(sizes) for sizes in sizes_of_each)

                for sorted_group_indices, positions, compatible_size in zip(
                        sorted_group_indices_of_each, positions_of_each,
                        compatible_size_of_each):
                    compatible_size[sorted_group_indices[positions]] = min_size

                is_done_of_each = [
                    next_position == groups_count
                    for next_position, groups_count in zip(
                        next_position_of_each, groups_count_of_each)
                ]
                if all(is_done_of_each):
                    break

                # The last quantile of every data set is 1, so all the data sets finish together.
                assert not any(is_done_of_each)

    for adata, compatible_size in zip(adatas, compatible_size_of_data):
        assert np.min(compatible_size) > 0
        ut.set_o_data(adata, "compatible_size", compatible_size)
Exemplo n.º 13
0
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    """
    Compute the per-gene variance and normalized variance of the cells of a single group.

    The results are written into row ``group_index`` of ``variance_per_gene_per_group`` and
    ``normalized_variance_per_gene_per_group``; nothing is returned. A group with fewer than two
    cells is skipped, leaving its rows untouched.

    If ``compatible_size`` is given and is smaller than the number of cells in the group, only a
    random subset of this many cells (chosen without replacement, after seeding ``np.random`` with
    ``random_seed``) is used.

    The data of the used cells is downsampled to a common number of samples: the
    ``downsample_min_cell_quantile`` quantile of the total per cell, raised to at least
    ``downsample_min_samples``, and then capped by the ``downsample_max_cell_quantile`` quantile.
    Genes whose total in the downsampled data is below ``min_gene_total`` get ``None`` assigned to
    their entries (presumably stored as NaN in the floating point result vectors - TODO confirm).
    """
    members = np.where(group_of_cells == group_index)[0]
    cells_count = len(members)
    if cells_count < 2:
        return

    if compatible_size is None:
        ut.log_calc("  cells", cells_count)
    else:
        assert 0 < compatible_size <= cells_count
        if compatible_size < cells_count:
            # Seed the global generator first so the chosen subset is reproducible.
            np.random.seed(random_seed)
            if ut.logging_calc():
                ratio_text = ut.ratio_description(cells_count, "cell",
                                                  compatible_size,
                                                  "compatible")
                ut.log_calc("  cells: " + ratio_text)
            members = np.random.choice(members,
                                       size=compatible_size,
                                       replace=False)
            assert len(members) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    group_data = cells_data[members, :]

    # Pick the common number of samples to downsample each of the cells to.
    total_per_cell = ut.sum_per(group_data, per="row")
    low_quantile_total = np.quantile(total_per_cell,
                                     downsample_min_cell_quantile)
    high_quantile_total = np.quantile(total_per_cell,
                                      downsample_max_cell_quantile)
    raised_total = max(downsample_min_samples, low_quantile_total)
    samples = int(round(min(raised_total, high_quantile_total)))
    if ut.logging_calc():
        ut.log_calc(f"  samples: {samples}")

    downsampled_by_row = ut.downsample_matrix(group_data,
                                              per="row",
                                              samples=samples,
                                              random_seed=random_seed)
    downsampled_by_column = ut.to_layout(downsampled_by_row,
                                         layout="column_major")

    # Genes with too few samples in the downsampled data are excluded from the results.
    total_per_gene = ut.sum_per(downsampled_by_column, per="column")
    too_small_genes = total_per_gene < min_gene_total
    if ut.logging_calc():
        included_genes_count = len(too_small_genes) - np.sum(too_small_genes)
        ut.log_calc(f"  included genes: {included_genes_count}")

    variance_per_gene = ut.variance_per(downsampled_by_column, per="column")
    normalized_variance_per_gene = ut.normalized_variance_per(
        downsampled_by_column, per="column")

    variance_per_gene[too_small_genes] = None
    normalized_variance_per_gene[too_small_genes] = None

    variance_per_gene_per_group[group_index, :] = variance_per_gene
    normalized_variance_per_gene_per_group[
        group_index, :] = normalized_variance_per_gene
Exemplo n.º 14
0
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too unreliable to be used on
    cells.

    A gene is considered significant if it has both a high maximal expression in at least one metacell, and a wide
    range of expression across the metacells. Such genes are good candidates for being used as marker genes and/or to
    compute distances between metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Convert the data to the fraction of each gene out of the total of each observation, and compute the minimal and
       maximal such fraction of each gene.

    2. Select the genes whose maximal fraction is at least ``min_gene_fraction`` (default: {min_gene_fraction}).

    3. Select the genes whose fold factor (log2 of the maximal over the minimal fraction, after adding the
       ``normalization`` (default: {normalization}) to both) is at least ``min_gene_range_fold`` (default:
       {min_gene_range_fold}).

    4. The significant genes are the ones selected by both of the above.
    """
    assert normalization >= 0

    row_major_data = ut.get_vo_proper(adata, what, layout="row_major")
    fractions_by_row = ut.fraction_by(row_major_data, by="row")
    fractions_by_gene = ut.to_layout(fractions_by_row, layout="column_major")

    lowest_fraction_per_gene = ut.min_per(fractions_by_gene, per="column")
    highest_fraction_per_gene = ut.max_per(fractions_by_gene, per="column")

    # This mask must be computed here, before the maximal fractions are modified in-place below.
    high_max_fraction_genes_mask = highest_fraction_per_gene >= min_gene_fraction
    ut.log_calc("high max fraction genes", high_max_fraction_genes_mask)

    # Compute log2((max + normalization) / (min + normalization)), reusing the buffer of the maximal fractions.
    lowest_fraction_per_gene += normalization
    highest_fraction_per_gene += normalization
    highest_fraction_per_gene /= lowest_fraction_per_gene
    range_fold_per_gene = np.log2(highest_fraction_per_gene, out=highest_fraction_per_gene)

    high_range_genes_mask = range_fold_per_gene >= min_gene_range_fold
    ut.log_calc("high range genes", high_range_genes_mask)

    significant_genes_mask = high_max_fraction_genes_mask & high_range_genes_mask

    if not inplace:
        ut.log_return("significant_genes", significant_genes_mask)
        return ut.to_pandas_series(significant_genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", significant_genes_mask)
    return None