예제 #1
0
파일: rare.py 프로젝트: tanaylab/metacells
def _pick_candidates(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    max_gene_cell_fraction: float,
    min_gene_maximum: int,
    min_genes_of_modules: int,
    allowed_genes_mask: ut.NumpyVector,
) -> Optional[Tuple[AnnData, ut.NumpyVector]]:
    data = ut.get_vo_proper(adata_of_all_genes_of_all_cells,
                            what,
                            layout="column_major")
    nnz_cells_of_genes = ut.nnz_per(data, per="column")

    nnz_cell_fraction_of_genes = nnz_cells_of_genes / adata_of_all_genes_of_all_cells.n_obs
    nnz_cell_fraction_mask_of_genes = nnz_cell_fraction_of_genes <= max_gene_cell_fraction

    max_umis_of_genes = ut.max_per(data, per="column")
    max_umis_mask_of_genes = max_umis_of_genes >= min_gene_maximum

    candidates_mask_of_genes = max_umis_mask_of_genes & nnz_cell_fraction_mask_of_genes & allowed_genes_mask
    ut.log_calc("candidate_genes", candidates_mask_of_genes)

    candidate_genes_indices = np.where(candidates_mask_of_genes)[0]
    candidate_genes_count = candidate_genes_indices.size
    if candidate_genes_count < min_genes_of_modules:
        return None

    candidate_data = ut.slice(adata_of_all_genes_of_all_cells,
                              name=".candidate_genes",
                              vars=candidate_genes_indices,
                              top_level=False)
    return candidate_data, candidate_genes_indices
예제 #2
0
def _filter_genes(
    *,
    cells_count: int,
    genes_count: int,
    fold_factors: ut.CompressedMatrix,
    min_gene_fold_factor: float,
    max_gene_fraction: Optional[float] = None,
) -> ut.NumpyVector:
    ut.timed_parameters(cells=cells_count, genes=genes_count, fold_factors=fold_factors.nnz)
    max_fold_factors_of_genes = ut.max_per(fold_factors, per="column")
    assert max_fold_factors_of_genes.size == genes_count

    mask_of_deviant_genes = max_fold_factors_of_genes >= min_gene_fold_factor
    deviant_gene_fraction = np.sum(mask_of_deviant_genes) / genes_count

    if max_gene_fraction is not None and deviant_gene_fraction > max_gene_fraction:
        if ut.logging_calc():
            ut.log_calc("candidate_deviant_genes", mask_of_deviant_genes)

        quantile_gene_fold_factor = np.quantile(max_fold_factors_of_genes, 1 - max_gene_fraction)
        assert quantile_gene_fold_factor is not None
        ut.log_calc("quantile_gene_fold_factor", quantile_gene_fold_factor)

        if quantile_gene_fold_factor > min_gene_fold_factor:
            min_gene_fold_factor = quantile_gene_fold_factor
            mask_of_deviant_genes = max_fold_factors_of_genes >= min_gene_fold_factor

            fold_factors.data[fold_factors.data < min_gene_fold_factor] = 0
            ut.eliminate_zeros(fold_factors)

    if ut.logging_calc():
        ut.log_calc("deviant_genes", mask_of_deviant_genes)

    deviant_gene_indices = np.where(mask_of_deviant_genes)[0]
    return deviant_gene_indices
예제 #3
0
def _compute_metacell_inner_folds(
    *,
    metacell_index: int,
    cells_data: ut.Matrix,
    metacells_data: ut.Matrix,
    group_of_cells: ut.NumpyVector,
    total_umis_per_cell: ut.NumpyVector,
    total_umis_per_metacell: ut.NumpyVector,
) -> ut.NumpyVector:
    grouped_cells_mask = group_of_cells == metacell_index
    assert np.sum(grouped_cells_mask) > 0
    grouped_cells_data = ut.to_numpy_matrix(cells_data[grouped_cells_mask, :])
    total_umis_per_grouped_cell = total_umis_per_cell[grouped_cells_mask]
    metacell_data = metacells_data[metacell_index, :]
    total_umis_of_metacell = total_umis_per_metacell[metacell_index]
    expected_scale_per_grouped_cell = total_umis_per_grouped_cell / total_umis_of_metacell
    expected_cells_data = expected_scale_per_grouped_cell[:, np.
                                                          newaxis] * metacell_data[
                                                              np.newaxis, :]
    assert expected_cells_data.shape == grouped_cells_data.shape
    fold_factors_per_grouped_cell = np.log2(
        (expected_cells_data + 1) / (grouped_cells_data + 1))
    fold_factors = ut.max_per(ut.to_layout(fold_factors_per_grouped_cell,
                                           "column_major"),
                              per="column")
    max_fold_factor = np.max(fold_factors)
    ut.log_calc("max_fold_factor", max_fold_factor)
    return fold_factors
예제 #4
0
def _test_per(rows_matrix: ut.Matrix) -> None:
    columns_matrix = ut.to_layout(rows_matrix, layout="column_major")

    assert np.allclose(ut.nnz_per(rows_matrix, per="row"), np.array([2, 3]))
    assert np.allclose(ut.nnz_per(columns_matrix, per="column"), np.array([1, 2, 2]))

    assert np.allclose(ut.sum_per(rows_matrix, per="row"), np.array([3, 12]))
    assert np.allclose(ut.sum_per(columns_matrix, per="column"), np.array([3, 5, 7]))

    assert np.allclose(ut.max_per(rows_matrix, per="row"), np.array([2, 5]))
    assert np.allclose(ut.max_per(columns_matrix, per="column"), np.array([3, 4, 5]))

    assert np.allclose(ut.min_per(rows_matrix, per="row"), np.array([0, 3]))
    assert np.allclose(ut.min_per(columns_matrix, per="column"), np.array([0, 1, 2]))

    assert np.allclose(ut.sum_squared_per(rows_matrix, per="row"), np.array([5, 50]))
    assert np.allclose(ut.sum_squared_per(columns_matrix, per="column"), np.array([9, 17, 29]))

    assert np.allclose(ut.fraction_per(rows_matrix, per="row"), np.array([3 / 15, 12 / 15]))
    assert np.allclose(ut.fraction_per(columns_matrix, per="column"), np.array([3 / 15, 5 / 15, 7 / 15]))

    assert np.allclose(ut.mean_per(rows_matrix, per="row"), np.array([3 / 3, 12 / 3]))
    assert np.allclose(ut.mean_per(columns_matrix, per="column"), np.array([3 / 2, 5 / 2, 7 / 2]))

    assert np.allclose(
        ut.variance_per(rows_matrix, per="row"), np.array([5 / 3 - (3 / 3) ** 2, 50 / 3 - (12 / 3) ** 2])
    )

    assert np.allclose(
        ut.variance_per(columns_matrix, per="column"),
        np.array([9 / 2 - (3 / 2) ** 2, 17 / 2 - (5 / 2) ** 2, 29 / 2 - (7 / 2) ** 2]),
    )

    assert np.allclose(
        ut.normalized_variance_per(columns_matrix, per="column"),
        np.array(
            [(9 / 2 - (3 / 2) ** 2) / (3 / 2), (17 / 2 - (5 / 2) ** 2) / (5 / 2), (29 / 2 - (7 / 2) ** 2) / (7 / 2)]
        ),
    )

    dense = ut.to_numpy_matrix(ut.fraction_by(rows_matrix, by="row"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 3, 2 / 3], [3 / 12, 4 / 12, 5 / 12]]))

    dense = ut.to_numpy_matrix(ut.fraction_by(columns_matrix, by="column"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 5, 2 / 7], [3 / 3, 4 / 5, 5 / 7]]))
예제 #5
0
def significant_folds_matrix(
    folds: ut.NumpyMatrix,
    min_significant_value: float,
    min_entry_value: float,
    abs_folds: bool,
) -> ut.CompressedMatrix:
    """
    Convert a dense folds matrix to a sparse one containing just the significant values.
    """
    folds_by_column = ut.to_layout(folds, layout="column_major")
    if abs_folds:
        comparable_folds_by_column = np.abs(folds_by_column)
    else:
        comparable_folds_by_column = folds_by_column

    max_fold_per_gene = ut.max_per(comparable_folds_by_column, per="column")
    significant_genes_mask = max_fold_per_gene >= min_significant_value
    ut.log_calc("significant_genes_mask", significant_genes_mask)

    folds_by_column[:, ~significant_genes_mask] = 0
    folds_by_column[comparable_folds_by_column < min_entry_value] = 0

    folds_by_row = ut.to_layout(folds_by_column, layout="row_major")
    return sparse.csr_matrix(folds_by_row)
예제 #6
0
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.
    noisy_lonely_downsample_max_cell_quantile,
    downsample_max_cell_quantile: float = pr.
    noisy_lonely_downsample_min_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.
    noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Return the indices of genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: ``min_gene_normalized_variance``).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])  #
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
예제 #7
0
파일: rare.py 프로젝트: tanaylab/metacells
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(
        adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(list(rare_gene_indices_of_module))

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names",
                sorted(adata_of_all_genes_of_all_cells.
                       var_names[rare_gene_indices_of_module]))

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )

            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True)

            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0

            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) +
                        " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) +
                        " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )

            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module,
                what,
                sum=True)

            data = ut.get_vo_proper(
                adata_of_all_genes_of_expressed_cells_of_module,
                what,
                layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data,
                                                               per="column")

            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes -
                total_expressed_cells_umis_of_all_genes)

            expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / sum(
                total_expressed_cells_umis_of_all_genes)

            background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / sum(
                total_background_cells_umis_of_all_genes)

            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (expressed_cells_fraction_of_all_genes >=
                   background_cells_fraction_of_all_genes *
                   (2**min_related_gene_fold_factor)))

            related_gene_indices = np.where(mask_of_related_genes)[0]
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module)
            total_base_genes_of_all_cells = ut.get_o_numpy(
                base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names",
                    sorted(adata_of_all_genes_of_all_cells.
                           var_names[related_gene_indices]))
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                if gene_index in rare_gene_indices_of_module:
                    continue

                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"belongs to another module")
                    continue

                if gene_index not in rare_gene_indices_of_module:
                    related_gene_of_all_cells_adata = ut.slice(
                        adata_of_all_genes_of_all_cells,
                        name=
                        f".{adata_of_all_genes_of_all_cells.var_names[gene_index]}",
                        vars=np.array([gene_index]),
                    )
                    assert related_gene_of_all_cells_adata.n_vars == 1
                    total_related_genes_of_all_cells = ut.get_o_numpy(
                        related_gene_of_all_cells_adata, what, sum=True)
                    total_related_genes_of_all_cells += total_base_genes_of_all_cells
                    mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total
                    count_of_strong_related_cells = np.sum(
                        mask_of_strong_related_cells)
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"strong cells: {count_of_strong_related_cells} "
                        f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                    )
                    if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                        continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(
                related_gene_indices_of_module)  #

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(
                related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(adata_of_all_genes_of_all_cells.
                       var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules
예제 #8
0
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: float = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within in each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contains "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless the
            value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor`` (default:
       {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(
            metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    results = list(
        ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row,
                                               "column_major")
    if inner_abs_folds:
        comparable_dense_inner_folds_by_column = np.abs(
            dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column
    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column,
                                   per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column <
                                min_entry_inner_fold_factor] = 0
    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column,
                                            layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
예제 #9
0
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too unreliable to be used on
    cells.

    Find genes which have a high maximal expression in at least one metacell, and a wide range of expression across the
    metacells. Such genes are good candidates for being used as marker genes and/or to compute distances between
    metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level of each gene.

    2. Select the genes whose fold factor (log2 of maximal over minimal value, using the ``normalization``
       (default: {normalization}) is at least ``min_gene_range_fold`` (default: {min_gene_range_fold}).

    3. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default: {min_gene_fraction}).
    """
    assert normalization >= 0

    data = ut.get_vo_proper(adata, what, layout="row_major")
    fractions_of_genes = ut.to_layout(ut.fraction_by(data, by="row"), layout="column_major")

    min_fraction_of_genes = ut.min_per(fractions_of_genes, per="column")
    max_fraction_of_genes = ut.max_per(fractions_of_genes, per="column")

    high_max_fraction_genes_mask = max_fraction_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", high_max_fraction_genes_mask)

    min_fraction_of_genes += normalization
    max_fraction_of_genes += normalization

    max_fraction_of_genes /= min_fraction_of_genes
    range_fold_of_genes = np.log2(max_fraction_of_genes, out=max_fraction_of_genes)

    high_range_genes_mask = range_fold_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", high_range_genes_mask)

    significant_genes_mask = high_max_fraction_genes_mask & high_range_genes_mask

    if inplace:
        ut.set_v_data(adata, "significant_gene", significant_genes_mask)
        return None

    ut.log_return("significant_genes", significant_genes_mask)
    return ut.to_pandas_series(significant_genes_mask, index=adata.var_names)