Example #1
def optimize_partitions(
    *,
    edge_weights: ut.CompressedMatrix,
    community_of_nodes: ut.NumpyVector,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    random_seed: int,
) -> float:
    """
    Optimize the partition of nodes into candidate metacells (communities) using the ``edge_weights``.

    Returns the score of the optimized partition.

    This modifies the ``community_of_nodes`` in-place.

    The goal is to maximize the "stability" goal function, which is defined to be the ratio between
    (1) the probability that, when selecting a random node and either a random outgoing edge or a
    random incoming edge (biased by their weights), the node connected to by that edge is in the
    same community (metacell) and (2) the probability that a random edge would lead to this same
    community (the fraction of its number of nodes out of the total).

    To maximize this, we repeatedly pass over a randomized permutation of the nodes, and for each
    node, move it to a random "better" community. When deciding whether a community is better, we
    consider both (1) the "local" product of the sums of the weights of the incoming and outgoing
    edges between the node and the current and candidate communities and (2) the effect on the
    "global" goal function (considering the impact on this product for all other nodes connected to
    the current node).

    We define a notion of ``temperature`` (initially, 1 - ``cooldown_pass``, default:
    {cooldown_pass}) and we give a weight of ``temperature`` to the local score and
    (1 - ``temperature``) to the global score. When we move to the next node, we multiply the
    temperature by 1 - ``cooldown_pass``. If we did not move the node, we multiply its temperature
    by ``cooldown_node`` (default: {cooldown_node}). We skip looking at nodes which are colder than
    the global temperature to accelerate the algorithm. If we don't move any node, we reduce the
    global temperature below that of any cold node; if there are no such nodes, we reduce it to zero
    to perform a final hill-climbing phase.

    This simulated-annealing-like behavior helps the algorithm escape local maxima, although of
    course no claim is made of achieving the global maximum of the goal function.
    """
    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)
    assert ut.is_layout(outgoing_edge_weights, "row_major")

    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")
    return _optimize_partitions(
        outgoing_edge_weights=outgoing_edge_weights,
        incoming_edge_weights=incoming_edge_weights,
        random_seed=random_seed,
        cooldown_pass=cooldown_pass,
        cooldown_node=cooldown_node,
        community_of_nodes=community_of_nodes,
        cold_communities_count=0,
        cold_temperature=cooldown_pass,
    )
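The docstring above narrates the temperature schedule rather than showing it, so here is a minimal
runnable sketch of that schedule alone. The name ``cooldown_schedule_sketch`` and the random
stand-in for the "move this node to a better community" decision are illustrative; the real
optimization happens inside the compiled ``_optimize_partitions`` helper.

import numpy as np

def cooldown_schedule_sketch(nodes_count: int, cooldown_pass: float = 0.02,
                             cooldown_node: float = 0.25, random_seed: int = 0) -> None:
    rng = np.random.default_rng(random_seed)
    temperature = 1.0 - cooldown_pass  # Initial global temperature.
    node_temperature = np.ones(nodes_count)
    while temperature > 1e-6:
        moved_any = False
        for node in rng.permutation(nodes_count):
            if node_temperature[node] < temperature:
                continue  # Skip nodes colder than the global temperature.
            moved = rng.random() < temperature  # Stand-in for the real move decision.
            if moved:
                moved_any = True
            else:
                node_temperature[node] *= cooldown_node  # Cool the unmoved node.
            temperature *= 1.0 - cooldown_pass  # Cool down when moving to the next node.
        if not moved_any:
            cold = node_temperature[node_temperature < temperature]
            # Drop below every cold node so all are revisited, or to zero to
            # finish with a plain hill-climbing phase.
            temperature = cold.min() * 0.999 if cold.size else 0.0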
Example #2
def _fold_ranks(
    *,
    cells_count: int,
    fold_factors: ut.CompressedMatrix,
    deviant_gene_indices: ut.NumpyVector,
) -> ut.NumpyMatrix:
    assert fold_factors.getformat() == "csc"

    deviant_genes_count = deviant_gene_indices.size

    ut.timed_parameters(cells=cells_count, deviant_genes=deviant_genes_count)

    deviant_genes_fold_ranks = np.full((cells_count, deviant_genes_count), cells_count, order="F")
    assert ut.is_layout(deviant_genes_fold_ranks, "column_major")

    for deviant_gene_index, gene_index in enumerate(deviant_gene_indices):
        gene_start_offset = fold_factors.indptr[gene_index]
        gene_stop_offset = fold_factors.indptr[gene_index + 1]

        gene_fold_factors = fold_factors.data[gene_start_offset:gene_stop_offset]
        gene_suspect_cell_indices = fold_factors.indices[gene_start_offset:gene_stop_offset]

        gene_fold_ranks = stats.rankdata(gene_fold_factors, method="min")
        gene_fold_ranks *= -1
        gene_fold_ranks += gene_fold_ranks.size + 1

        deviant_genes_fold_ranks[gene_suspect_cell_indices, deviant_gene_index] = gene_fold_ranks

    return deviant_genes_fold_ranks
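The three lines that massage ``gene_fold_ranks`` convert the ascending "min" ranks produced by
``scipy.stats.rankdata`` into descending ranks, so the largest fold factor gets rank 1. A
standalone check of that arithmetic:

import numpy as np
from scipy import stats

fold_factors = np.array([0.5, 2.0, 2.0, 7.0])
gene_fold_ranks = stats.rankdata(fold_factors, method="min")  # [1, 2, 2, 4]
gene_fold_ranks *= -1
gene_fold_ranks += gene_fold_ranks.size + 1
print(gene_fold_ranks)  # [4 3 3 1]: the largest fold factor ranks first.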
Example #3
def score_partitions(
    *,
    edge_weights: ut.CompressedMatrix,
    partition_of_nodes: ut.NumpyVector,
    with_orphans: bool = True,
) -> float:
    """
    Compute the "stability" the "stability" goal function which is defined to be the ratio between
    (1) the probability that, selecting a random node and either a random outgoing edge or a random
    incoming edge (biased by their weights), that the node connected to by that edge is in the same
    community (metacell) and (2) the probability that a random edge would lead to this same
    community (the fraction of its number of nodes out of the total).

    If ``with_orphans`` is True (the default), outlier nodes are included in the computation. In
    general we add 1e-6 to the product of the incoming and outgoing weights so we can safely log it
    for efficient computation; thus orphans are given a very small (non-zero) weight so the overall
    score is not zeroed even when including them.
    """
    assert str(partition_of_nodes.dtype) == "int32"
    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)
    assert ut.is_layout(outgoing_edge_weights, "row_major")

    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    with ut.unfrozen(partition_of_nodes):
        with ut.timed_step(".score"):
            score = xt.score_partitions(
                outgoing_edge_weights.data,
                outgoing_edge_weights.indices,
                outgoing_edge_weights.indptr,
                incoming_edge_weights.data,
                incoming_edge_weights.indices,
                incoming_edge_weights.indptr,
                partition_of_nodes,
                with_orphans,
            )

    ut.log_calc("score", score)
    return score
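For intuition, here is a loose dense-NumPy sketch of the goal function described in the docstring,
assuming a partition with no orphan (negative) entries. The actual score is computed by the
compiled ``xt.score_partitions`` extension and is not guaranteed to match this sketch exactly.

import numpy as np

def stability_sketch(weights: np.ndarray, partition: np.ndarray) -> float:
    size = weights.shape[0]
    same_community = partition[:, None] == partition[None, :]
    outgoing = weights / np.maximum(weights.sum(axis=1, keepdims=True), 1e-12)
    incoming = weights / np.maximum(weights.sum(axis=0, keepdims=True), 1e-12)
    # The 1e-6 term keeps the product positive so taking its log is safe, and
    # gives orphan-like nodes a very small (non-zero) weight.
    stay = (outgoing * incoming * same_community).sum(axis=1) + 1e-6
    # Probability of landing in the same community by pure chance.
    chance = np.bincount(partition)[partition] / size
    return float(np.mean(np.log2(stay / chance)))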
Example #4
def _rank_outgoing(similarity: ut.NumpyMatrix) -> ut.NumpyMatrix:
    size = similarity.shape[0]
    assert similarity.shape == (size, size)
    similarity = np.copy(similarity)

    min_similarity = ut.min_matrix(similarity)

    np.fill_diagonal(similarity, min_similarity - 1)

    assert ut.is_layout(similarity, "row_major")
    outgoing_ranks = ut.rank_matrix_by_layout(similarity, ascending=False)
    assert np.all(np.diagonal(outgoing_ranks) == size)
    return outgoing_ranks
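A standalone illustration of the diagonal trick above: forcing the diagonal below the minimum
similarity guarantees that each node ranks itself last (rank == size), which is what the final
assertion verifies. Here ``rankdata(-row, ...)`` stands in for
``ut.rank_matrix_by_layout(..., ascending=False)``.

import numpy as np
from scipy import stats

similarity = np.array([[1.0, 0.9, 0.2],
                       [0.9, 1.0, 0.5],
                       [0.2, 0.5, 1.0]])
np.fill_diagonal(similarity, similarity.min() - 1)
outgoing_ranks = np.vstack([stats.rankdata(-row, method="ordinal") for row in similarity])
print(np.diagonal(outgoing_ranks))  # [3 3 3]: every node ranks itself last.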
Example #5
def choose_seeds(
    *,
    edge_weights: ut.CompressedMatrix,
    seed_of_cells: Optional[ut.NumpyVector] = None,
    max_seeds_count: int,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    random_seed: int,
) -> ut.NumpyVector:
    """
    Choose initial assignment of cells to seeds based on the ``edge_weights``.

    Returns a vector assigning each node (cell) to a seed (initial community).

    If ``seed_of_cells`` is specified, it is expected to contain a vector of partial seeds. Only
    cells which have a negative seed will be assigned a new seed. New seeds will be created so that
    the total number of seeds will not exceed ``max_seeds_count``. The ``seed_of_cells`` will be
    modified in-place and returned.

    Otherwise, a new vector is created, initialized with ``-1`` (that is, no seed) for all nodes,
    filled as above, and returned.

    **Computation Parameters**

    1. We compute for each candidate node the number of nodes it is connected to (by an outgoing
       edge).

    2. We pick as a seed a random node whose number of connected nodes ("seed size") quantile is at
       least ``min_seed_size_quantile`` and at most ``max_seed_size_quantile``. This ensures we pick
       seeds that are neither too small nor too large, giving good coverage of the population with
       a small number of seeds.

    3. We assign each of the connected nodes to their seed, and discount them from the number of
       connected nodes of the remaining unassigned nodes.

    4. We repeat this until we reach the target number of seeds.
    """
    size = edge_weights.shape[0]

    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)
    assert ut.is_layout(outgoing_edge_weights, "row_major")

    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    if seed_of_cells is None:
        seed_of_cells = np.full(size, -1, dtype="int32")
    else:
        assert seed_of_cells.dtype == "int32"

    assert outgoing_edge_weights.shape == incoming_edge_weights.shape == (len(seed_of_cells), len(seed_of_cells))

    return _choose_seeds(
        outgoing_edge_weights=outgoing_edge_weights,
        incoming_edge_weights=incoming_edge_weights,
        seed_of_cells=seed_of_cells,
        max_seeds_count=max_seeds_count,
        min_seed_size_quantile=min_seed_size_quantile,
        max_seed_size_quantile=max_seed_size_quantile,
        random_seed=random_seed,
    )
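A pure-Python sketch of the four steps above, operating on a dense boolean connectivity matrix
instead of the compressed edge weights (the real work is done by the ``_choose_seeds`` helper, and
the quantile defaults here are illustrative only):

import numpy as np

def choose_seeds_sketch(connected: np.ndarray, max_seeds_count: int,
                        min_quantile: float = 0.9, max_quantile: float = 0.99,
                        random_seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(random_seed)
    seed_of_nodes = np.full(connected.shape[0], -1, dtype="int32")
    for seed in range(max_seeds_count):
        unassigned = seed_of_nodes < 0
        if not unassigned.any():
            break  # Every node already belongs to a seed.
        # Step 1: count the remaining (unassigned) nodes each node connects to.
        sizes = (connected & unassigned[None, :]).sum(axis=1) * unassigned
        if sizes.max() == 0:
            break  # No unassigned node connects to another unassigned node.
        # Step 2: pick a random node whose "seed size" is inside the quantile range.
        low, high = np.quantile(sizes[unassigned], [min_quantile, max_quantile])
        candidates = np.where(unassigned & (sizes >= low) & (sizes <= high))[0]
        chosen = rng.choice(candidates)
        # Steps 3-4: assign the node and its unassigned neighbors to the new seed;
        # the next iteration recomputes the counts, which discounts these nodes.
        members = unassigned & (connected[chosen] | (np.arange(sizes.size) == chosen))
        seed_of_nodes[members] = seed
    return seed_of_nodes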
Example #6
def compute_candidate_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "obs_outgoing_weights",
    *,
    target_metacell_size: float,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.candidates_cell_sizes,
    cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    cooldown_phase: float = pr.cooldown_phase,
    min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    min_metacell_cells: Optional[int] = pr.candidates_min_metacell_cells,
    max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    random_seed: int = 0,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Assign observations (cells) to (raw, candidate) metacells based on ``what`` data (a weighted
    directed graph).

    These candidate metacells typically go through additional vetting (e.g. deviant detection and
    dissolving too-small metacells) to obtain the final metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-observation-per-observation matrix where each row is the outgoing weights from
    each observation to the rest, or just the name of a per-observation-per-observation annotation
    containing such a matrix. Typically this matrix will be sparse for efficient processing.

    **Returns**

    Observation (Cell) Annotations
        ``candidate``
            The integer index of the (raw, candidate) metacell each cell belongs to. The metacells
            are in no particular order.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. We are trying to build metacells of ``target_metacell_size``, using the ``cell_sizes``
       (default: {cell_sizes}) to assign a size for each node (cell). This can be a string name of a
       per-observation annotation or a vector of values.

    2. We start with an assignment of cells to ``cell_seeds`` (default: {cell_seeds}). If no
       seeds are provided, we use :py:func:`choose_seeds` using ``min_seed_size_quantile`` (default:
       {min_seed_size_quantile}) and ``max_seed_size_quantile`` (default: {max_seed_size_quantile})
       to compute them, picking a number of seeds such that the average metacell size would match
       the target.

    3. We optimize the seeds using :py:func:`optimize_partitions` to obtain initial communities by
       maximizing the "stability" of the solution (probability of starting at a random node and
       moving either forward or backward in the graph and staying within the same metacell, divided
       by the probability of staying in the metacell if the edges connected random nodes). We pass
       it the ``cooldown_pass`` (default: {cooldown_pass}) and ``cooldown_node`` (default: {cooldown_node}).

    4. If ``min_split_size_factor`` (default: {min_split_size_factor}) is specified, randomly split
       into two each community whose size is at least
       ``target_metacell_size * min_split_size_factor`` and re-optimize the solution (resulting in
       one additional metacell). Every time we re-optimize, we multiply 1 - ``cooldown_pass`` by
       1 - ``cooldown_phase`` (default: {cooldown_phase}).

    5. If ``max_split_min_cut_strength`` (default: {max_split_min_cut_strength}) is specified, and
       the minimal cut of a candidate is lower, split it into two. If one of the partitions is
       smaller than ``min_cut_seed_cells``, then mark the cells in it as outliers, or if
       ``must_complete_cover`` is ``True``, skip the cut altogether.

    6. If ``max_merge_size_factor`` (default: {max_merge_size_factor}) or ``min_metacell_cells``
       (default: {min_metacell_cells}) are specified, make outliers of the cells of any community
       whose size is at most ``target_metacell_size * max_merge_size_factor`` or which contains
       fewer cells, and re-optimize, which will assign these cells to other metacells (resulting in
       one less metacell). We again apply the ``cooldown_phase`` every time we re-optimize.

    7. Repeat the above steps until all candidate metacells are in the acceptable size range.
    """
    edge_weights = ut.get_oo_proper(adata, what, layout="row_major")
    assert edge_weights.shape[0] == edge_weights.shape[1]
    assert 0.0 < cooldown_pass < 1.0
    assert 0.0 <= cooldown_node <= 1.0
    assert 0.0 < cooldown_phase <= 1.0

    size = edge_weights.shape[0]

    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)

    assert ut.is_layout(outgoing_edge_weights, "row_major")
    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    assert outgoing_edge_weights.data.dtype == "float32"
    assert outgoing_edge_weights.indices.dtype == "int32"
    assert outgoing_edge_weights.indptr.dtype == "int32"
    assert incoming_edge_weights.data.dtype == "float32"
    assert incoming_edge_weights.indices.dtype == "int32"
    assert incoming_edge_weights.indptr.dtype == "int32"

    node_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)
    if node_sizes is None:
        node_sizes = np.full(size, 1.0, dtype="float32")
    else:
        node_sizes = node_sizes.astype("float32")
    ut.log_calc("node_sizes", node_sizes, formatter=ut.sizes_description)

    assert target_metacell_size > 0
    max_metacell_size = None
    min_metacell_size = None

    if min_split_size_factor is not None:
        assert min_split_size_factor > 0
        max_metacell_size = ceil(target_metacell_size * min_split_size_factor) - 1
    ut.log_calc("max_metacell_size", max_metacell_size)

    if max_merge_size_factor is not None:
        assert max_merge_size_factor > 0
        min_metacell_size = floor(target_metacell_size * max_merge_size_factor) + 1
    ut.log_calc("min_metacell_size", min_metacell_size)

    target_metacell_cells = max(
        1.0 if min_metacell_cells is None else float(min_metacell_cells),
        float(target_metacell_size / np.mean(node_sizes)),
    )
    ut.log_calc("target_metacell_cells", target_metacell_cells)

    if min_split_size_factor is not None and max_merge_size_factor is not None:
        assert max_merge_size_factor < min_split_size_factor
        assert min_metacell_size is not None
        assert max_metacell_size is not None
        assert min_metacell_size <= max_metacell_size

    community_of_nodes = ut.maybe_o_numpy(adata, cell_seeds, formatter=ut.groups_description)

    if community_of_nodes is not None:
        assert community_of_nodes.dtype == "int32"
    else:
        target_seeds_count = ceil(size / target_metacell_cells)
        ut.log_calc("target_seeds_count", target_seeds_count)

        community_of_nodes = np.full(size, -1, dtype="int32")
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=target_seeds_count,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    ut.set_o_data(adata, "seed", community_of_nodes, formatter=ut.groups_description)
    community_of_nodes = community_of_nodes.copy()

    np.random.seed(random_seed)

    cold_temperature = 1 - cooldown_pass

    old_score = 1e9
    old_communities = community_of_nodes
    old_small_nodes_count = len(community_of_nodes)
    atomic_candidates: Set[Tuple[int, ...]] = set()
    kept_communities_count = 0

    while True:
        cold_temperature, score = _optimize_split_communities(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            target_metacell_size=target_metacell_size,
            max_metacell_size=max_metacell_size,
            max_split_min_cut_strength=max_split_min_cut_strength,
            min_cut_seed_cells=min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
            cooldown_pass=cooldown_pass,
            cooldown_node=cooldown_node,
            cooldown_phase=cooldown_phase,
            kept_communities_count=kept_communities_count,
            cold_temperature=cold_temperature,
            atomic_candidates=atomic_candidates,
        )

        small_communities, small_nodes_count = _find_small_communities(
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            min_metacell_size=min_metacell_size,
            min_metacell_cells=min_metacell_cells,
        )

        small_communities_count = len(small_communities)
        if small_communities_count < 2:
            break

        if (old_small_nodes_count, old_score) <= (small_nodes_count, score):
            ut.logger().debug("is not better, revert")
            community_of_nodes = old_communities
            score = old_score
            ut.log_calc("communities", community_of_nodes, formatter=ut.groups_description)
            ut.log_calc("score", score)
            break

        old_score = score
        old_communities = community_of_nodes.copy()
        old_small_nodes_count = small_nodes_count

        kept_communities_count = _cancel_communities(
            community_of_nodes=community_of_nodes, cancelled_communities=small_communities
        )

        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=kept_communities_count + small_communities_count - 1,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    if must_complete_cover:
        assert np.min(community_of_nodes) == 0
    else:
        community_of_nodes[community_of_nodes < 0] = -1

    if inplace:
        ut.set_o_data(adata, "candidate", community_of_nodes, formatter=ut.groups_description)
        return None

    ut.log_return("candidate", community_of_nodes, formatter=ut.groups_description)
    return ut.to_pandas_series(community_of_nodes, index=adata.obs_names)
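A worked example of the split/merge thresholds computed above (the factor values here are
illustrative, not the library defaults): candidates larger than ``max_metacell_size`` get split,
and candidates smaller than ``min_metacell_size`` get dissolved into other metacells.

from math import ceil, floor

target_metacell_size = 100.0  # Illustrative value.
max_metacell_size = ceil(target_metacell_size * 2.0) - 1   # 199: the largest size that escapes splitting.
min_metacell_size = floor(target_metacell_size * 0.5) + 1  # 51: the smallest size that escapes merging.
assert min_metacell_size <= max_metacell_size  # Mirrors the assertion in the code.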
Example #7
def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape
    candidates_count = np.max(candidate_of_cells) + 1

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]

        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        if candidate_cells_count < 2:
            compressed = sparse.csr_matrix(
                ([], [], [0] * (candidate_cells_count + 1)), shape=(candidate_cells_count, genes_count)
            )
            list_of_fold_factors.append(compressed)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format
            continue

        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        fractions_of_candidate_genes = ut.to_numpy_vector(totals_of_candidate_genes / np.sum(totals_of_candidate_genes))

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            extension_name = "fold_factor_compressed_%s_t_%s_t_%s_t" % (  # pylint: disable=consider-using-f-string
                compressed.data.dtype,
                compressed.indices.dtype,
                compressed.indptr.dtype,
            )
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        compressed = sparse.csr_matrix(
            ([], [], [0] * (remaining_cells_count + 1)), shape=(remaining_cells_count, genes_count)
        )
        assert compressed.has_sorted_indices
        assert compressed.has_canonical_format
        list_of_fold_factors.append(compressed)

    return list_of_fold_factors, list_of_cell_index_of_rows
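Conceptually, the compiled fold-factor extensions compare each cell's observed gene UMIs with the
expectation given the cell's total and the gene's overall fraction in the candidate, and zero out
weak entries. A hedged dense-NumPy sketch of that idea (the extension's exact formula may differ):

import numpy as np

def fold_factors_sketch(dense: np.ndarray, totals_of_cells: np.ndarray,
                        fractions_of_genes: np.ndarray,
                        min_gene_fold_factor: float = 3.0) -> np.ndarray:
    expected = totals_of_cells[:, None] * fractions_of_genes[None, :]
    fold = np.log2((dense + 1.0) / (expected + 1.0))
    fold[np.abs(fold) < min_gene_fold_factor] = 0.0  # Drop weak fold factors.
    return fold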
Example #8
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Find genes which are "noisy" (have high variance compared to their mean) and also "lonely"
    (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If an ``excluded_genes_mask`` was specified, it is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Remove all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: {min_gene_normalized_variance}).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
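A small hedged sketch of the "noisy" and "lonely" criteria above, assuming every gene has some
variance: "noisy" is approximated here as variance over mean (which may differ from
``ut.normalized_variance_per``), and "lonely" means the maximal correlation with every other gene
is low.

import numpy as np

def noisy_lonely_sketch(umis: np.ndarray, min_gene_total: int = 500,
                        min_normalized_variance: float = 2.5,
                        max_gene_similarity: float = 0.15) -> np.ndarray:
    totals = umis.sum(axis=0)
    normalized_variance = umis.var(axis=0) / np.maximum(umis.mean(axis=0), 1e-12)
    noisy = (totals >= min_gene_total) & (normalized_variance >= min_normalized_variance)
    similarity = np.corrcoef(umis, rowvar=False)  # Gene-gene correlations.
    np.fill_diagonal(similarity, -1.0)  # A gene must not count as similar to itself.
    lonely = similarity.max(axis=1) <= max_gene_similarity
    return noisy & lonely  # Boolean mask over the genes.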
Example #9
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    cell_indices = np.where(group_of_cells == group_index)[0]
    cells_count = len(cell_indices)
    if cells_count < 2:
        return

    if compatible_size is None:
        ut.log_calc("  cells", cells_count)
    else:
        assert 0 < compatible_size <= cells_count
        if compatible_size < cells_count:
            np.random.seed(random_seed)
            if ut.logging_calc():
                ut.log_calc("  cells: " + ut.ratio_description(
                    len(cell_indices), "cell", compatible_size, "compatible"))
            cell_indices = np.random.choice(cell_indices,
                                            size=compatible_size,
                                            replace=False)
            assert len(cell_indices) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    group_data = cells_data[cell_indices, :]

    total_per_cell = ut.sum_per(group_data, per="row")
    samples = int(
        round(
            min(
                max(downsample_min_samples,
                    np.quantile(total_per_cell, downsample_min_cell_quantile)),
                np.quantile(total_per_cell, downsample_max_cell_quantile),
            )))
    if ut.logging_calc():
        ut.log_calc(f"  samples: {samples}")
    downsampled_data = ut.downsample_matrix(group_data,
                                            per="row",
                                            samples=samples,
                                            random_seed=random_seed)

    downsampled_data = ut.to_layout(downsampled_data, layout="column_major")
    total_per_gene = ut.sum_per(downsampled_data, per="column")
    too_small_genes = total_per_gene < min_gene_total
    if ut.logging_calc():
        included_genes_count = len(too_small_genes) - np.sum(too_small_genes)
        ut.log_calc(f"  included genes: {included_genes_count}")

    variance_per_gene = ut.variance_per(downsampled_data, per="column")
    normalized_variance_per_gene = ut.normalized_variance_per(downsampled_data,
                                                              per="column")

    variance_per_gene[too_small_genes] = np.nan
    normalized_variance_per_gene[too_small_genes] = np.nan

    variance_per_gene_per_group[group_index, :] = variance_per_gene
    normalized_variance_per_gene_per_group[
        group_index, :] = normalized_variance_per_gene
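The ``samples`` computation above clamps the downsampling target between two quantiles of the
per-cell totals. A standalone check with made-up numbers:

import numpy as np

total_per_cell = np.array([400, 800, 1000, 1200, 5000])
downsample_min_samples = 750
low = np.quantile(total_per_cell, 0.05)   # 480.0
high = np.quantile(total_per_cell, 0.5)   # 1000.0
samples = int(round(min(max(downsample_min_samples, low), high)))
print(samples)  # 750: at least min_samples, but never above the high quantile.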