Example #1
def _replace_with_layout(adata: AnnData, layout: str) -> Dict[str, utt.Matrix]:
    replaced: Dict[str, utt.Matrix] = {}

    matrix: utt.Matrix = adata.X
    if not utt.is_layout(matrix, layout):
        replaced["__x__"] = matrix
        adata.X = get_vo_proper(adata, "__x__", layout=layout)

    for name in adata.layers:
        matrix = adata.layers[name]
        if not utt.is_layout(matrix, layout):
            replaced[name] = matrix
            adata.layers[name] = get_vo_proper(adata, name, layout=layout)

    return replaced
Example #2
def process_transpose(ad: AnnData,
                      min_cells: int = 10,
                      min_genes: int = 200,
                      max_genes: int = 2500,
                      max_pct_mito: int = 30):
    ad = ad.copy()
    ad.X = ad.raw.X
    ad = _generic_preprocess(ad, min_cells, min_genes, max_genes, max_pct_mito)
    sc.pp.log1p(ad)
    sc.pp.highly_variable_genes(ad, batch_key="sample")
    ad = ad.transpose()
    sc.pp.pca(ad, n_comps=50)
    sc.pp.neighbors(ad)
    sc.tl.umap(ad)
    return ad
Example #3
def test_x_is_none():
    # test setter and getter
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2],
                                                           o2=[3, 4]))
    adata.X = None
    assert adata.X is None

    # test setter and deleter
    adata.X = np.array([[4, 5, 6], [1, 2, 3]])
    assert adata.X is not None
    del adata.X
    assert adata.X is None

    # test initialiser
    shape = (3, 5)
    adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape)
    assert adata.X is None
    assert adata.shape == shape

    # test transpose
    adataT = adata.transpose()
    assert_equal(adataT.shape, (5, 3))
    assert_equal(adataT.obsp.keys(), adata.varp.keys())
    assert_equal(adataT.T, adata)
Example #4
def recipe_weinreb17(
    adata: AnnData,
    log: bool = True,
    mean_threshold: float = 0.01,
    cv_threshold: int = 2,
    n_pcs: int = 50,
    svd_solver='randomized',
    random_state: AnyRandom = 0,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Normalization and filtering as of [Weinreb17]_.

    Expects non-logarithmized data.
    If using logarithmized data, pass `log=False`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    log
        Logarithmize data?
    copy
        Return a copy if true.
    """
    from ._deprecated import normalize_per_cell_weinreb16_deprecated, zscore_deprecated
    from scipy.sparse import issparse

    if issparse(adata.X):
        raise ValueError('`recipe_weinreb17` does not support sparse matrices.')
    if copy:
        adata = adata.copy()
    if log:
        pp.log1p(adata)
    adata.X = normalize_per_cell_weinreb16_deprecated(
        adata.X, max_fraction=0.05, mult_with_mean=True
    )
    gene_subset = filter_genes_cv_deprecated(adata.X, mean_threshold, cv_threshold)
    adata._inplace_subset_var(gene_subset)  # this modifies the object itself
    X_pca = pp.pca(
        zscore_deprecated(adata.X),
        n_comps=n_pcs,
        svd_solver=svd_solver,
        random_state=random_state,
    )
    # update adata
    adata.obsm['X_pca'] = X_pca
    return adata if copy else None
Example #5
def get_example_data(*, sparse=False):
    # create test object
    adata = AnnData(np.multiply(binomial(1, 0.15, (100, 20)), negative_binomial(2, 0.25, (100, 20))))
    # adapt marker_genes for cluster (so as to have some form of reasonable input)
    adata.X[0:10, 0:5] = np.multiply(binomial(1, 0.9, (10, 5)), negative_binomial(1, 0.5, (10, 5)))

    # The following construction is inefficient, but makes sure that the same data is used in the sparse case
    if sparse:
        adata.X = sp.csr_matrix(adata.X)

    # Create cluster according to groups
    adata.obs['true_groups'] = pd.Categorical(np.concatenate((
        np.zeros((10,), dtype=int),
        np.ones((90,), dtype=int),
    )))

    return adata
Example #6
def import_10X_mtx(directory):
    start = time.time()
    X = load_mtx(os.path.join(directory, 'matrix.mtx'))
    genes = pd.read_csv(os.path.join(directory, 'genes.tsv'),
                        header=None,
                        sep='\t')
    if len(genes) == X.shape[0]:  # transpose if matrix is genes x cells
        a = AnnData(X.T)
    else:
        a = AnnData(X)
    var_names = genes[1]
    a.var_names = var_names
    a.var['gene_ids'] = genes[0].values
    a.obs_names = pd.read_csv(os.path.join(directory, 'barcodes.tsv'),
                              header=None)[0]
    a.uns['network'] = np.ones([a.X.shape[1], a.X.shape[1]])
    return a
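A minimal usage sketch for the loader above; the directory path is hypothetical, and `load_mtx` plus the module imports (pandas, numpy, os, time) are assumed to be available:

# hypothetical CellRanger-style output directory with matrix.mtx, genes.tsv, barcodes.tsv
a = import_10X_mtx("filtered_gene_bc_matrices/hg19")
print(a)   # AnnData with cells x genes, var['gene_ids'] and uns['network'] populated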
Example #7
def assert_adata(adata: AnnData, attempFix=True):
    """Asserts that an adata object is containing information needed for the besca pipeline to run and export information.
    This is particularly usefull when loading public data
    The parameter attempFix will try to fix the issue by itself.
    However, we advise the user to check by himself what is the leading problem.

    Parameters
    ----------
    adata: AnnData
    attempFix: `bool` if True will transform adata object to match requirements.
    Returns
    -------
    returns an AnnData object
    """
    if 'CELL' not in adata.obs.columns:
        if attempFix:
            adata.obs['CELL'] = adata.obs.index
            print('Creating column CELL in adata.obs using adata.obs.index.')
        else:
            raise Exception('Required CELL column in observations')

    if not all(adata.obs_names == adata.obs['CELL']):
        raise Exception('Required indexing of adata.obs by CELL column')
    if not issparse(adata.X):
        if attempFix:
            print(
                'Required count matrix to be sparse, X transformed to sparse')
            try:
                adata.X = sparse.csr_matrix(adata.X.copy())
            except Exception:
                raise Exception('X transformation to sparse failed.')
        else:
            raise Exception('adata.X needs to be sparse.')
    # checking adata.var concordance
    for x in ['SYMBOL', 'ENSEMBL']:
        adata = add_var_column(adata, x, attempFix)
        if not all(isinstance(el, str) for el in adata.var.get(x)):
            raise Exception(
                f'In {x}, non-string values will create an issue for export')
    return adata
Example #8
def clr(adata: AnnData, inplace: bool = True, axis: int = 0) -> Union[None, AnnData]:
    """
    Apply the centered log ratio (CLR) transformation
    to normalize counts in adata.X.

    Args:
        adata: AnnData object with protein expression counts.
        inplace: Whether to update adata.X inplace.
        axis: Axis across which CLR is performed.
    """

    if axis not in [0, 1]:
        raise ValueError("Invalid value for `axis` provided. Admissible options are `0` and `1`.")

    if not inplace:
        adata = adata.copy()

    if issparse(adata.X) and axis == 0 and not isinstance(adata.X, csc_matrix):
        warn("adata.X is sparse but not in CSC format. Converting to CSC.")
        x = csc_matrix(adata.X)
    elif issparse(adata.X) and axis == 1 and not isinstance(adata.X, csr_matrix):
        warn("adata.X is sparse but not in CSR format. Converting to CSR.")
        x = csr_matrix(adata.X)
    else:
        x = adata.X

    if issparse(x):
        x.data /= np.repeat(
            np.exp(np.log1p(x).sum(axis=axis).A / x.shape[axis]), x.getnnz(axis=axis)
        )
        np.log1p(x.data, out=x.data)
    else:
        np.log1p(
            x / np.exp(np.log1p(x).sum(axis=axis, keepdims=True) / x.shape[axis]),
            out=x,
        )

    adata.X = x

    return None if inplace else adata
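A minimal sketch of the CLR transform above on a small dense matrix, assuming `numpy`/`anndata` are installed and `clr` (together with its scipy imports) is in scope:

import numpy as np
from anndata import AnnData

adata = AnnData(np.array([[1.0, 4.0, 9.0],
                          [2.0, 2.0, 2.0]], dtype=np.float32))
adata_clr = clr(adata, inplace=False)   # per-gene CLR (axis=0); the original adata stays untouched
clr(adata, axis=1)                      # per-cell CLR; overwrites adata.X in place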
Example #9
def set_modality(
    adata: AnnData,
    new_value: Union[ndarray, spmatrix, DataFrame],
    modality: Optional[str] = None,
    inplace: bool = True,
) -> Optional[AnnData]:
    """Set modality of annotated data object to new value.

    Arguments
    ---------
    adata
        Annotated data object.
    new_value
        New value of modality.
    modality
        Modality to overwrite with new value. Defaults to `None`.
    inplace
        Boolean flag to indicate whether setting of modality should be inplace or
            not. Defaults to `True`.

    Returns
    -------
    Optional[AnnData]
        Copy of annotated data `adata` with updated modality if `inplace=False`,
        `None` otherwise.
    """

    if not inplace:
        adata = adata.copy()

    if (modality == "X") or (modality is None):
        adata.X = new_value
    elif modality in adata.layers.keys():
        adata.layers[modality] = new_value
    elif modality in adata.obsm.keys():
        adata.obsm[modality] = new_value

    if not inplace:
        return adata
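A quick, hedged sketch of `set_modality`, assuming only `numpy` and `anndata`:

import numpy as np
from anndata import AnnData

adata = AnnData(np.zeros((3, 2), dtype=np.float32))
adata.layers["spliced"] = np.zeros((3, 2), dtype=np.float32)

set_modality(adata, np.ones((3, 2), dtype=np.float32))          # overwrites adata.X in place
adata2 = set_modality(adata, np.full((3, 2), 2.0),
                      modality="spliced", inplace=False)        # returns an updated copy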
Example #10
def load_file(path):
    """
    Load single cell dataset from file
    """
    if os.path.exists(DATA_PATH + path + '.h5ad'):
        adata = sc.read_h5ad(DATA_PATH + path + '.h5ad')
    elif os.path.isdir(path):  # mtx format
        adata = read_mtx(path)
    elif os.path.isfile(path):
        if path.endswith(('.csv', '.csv.gz')):
            adata = sc.read_csv(path).T
        elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
            df = pd.read_csv(path, sep='\t', index_col=0).T
            adata = AnnData(df.values, dict(obs_names=df.index.values),
                            dict(var_names=df.columns.values))
        elif path.endswith('.h5ad'):
            adata = sc.read_h5ad(path)
        else:
            raise ValueError("Unsupported file format: {}".format(path))
    else:
        raise ValueError("File {} does not exist".format(path))

    if not issparse(adata.X):
        adata.X = scipy.sparse.csr_matrix(adata.X)
    adata.var_names_make_unique()
    return adata
Example #11
def filter_cells(
    adata: AnnData,
    min_counts: int = -1,
    max_counts: int = -1,
    max_mt_ratio: int = 20,
    # doublet_detection: bool = False,
    # scrublet_kwargs: dict = {
    #     "total_counts": None,
    #     "sim_doublet_ratio": 2.0,
    #     "n_neighbors": None,
    #     "expected_doublet_rate": 0.1,
    #     "stdev_doublet_rate": 0.02,
    #     "random_state": 0,
    # },
    verbose=True,
):
    """Filter problematic cells in an AnnData

    Args:
      adata(AnnData): The AnnData object to be pre-processed.
      min_counts(int): Minimum number of counts required for a cell to pass filtering.
      `-1` -> median(counts) - std(counts)
      max_counts(int): Maximum number of counts required for a cell to pass filtering.
      `-1` -> median(counts) + std(counts)
      max_mt_ratio(int): Maximum proportion of mitochondrial genes in a cell to pass
      filtering.
      verbose: (Default value = True)

    Returns:
      None. `adata` is filtered in place via ``adata._inplace_subset_obs``.
    """
    # doublet_detection(bool): Uses doublet detection instead of max counts to remove doublets
    # scrublet_kwargs(dict): Arguments passed to Scrublet for doublet detection

    # -- sparse -> array
    if not isinstance(adata.X, np.ndarray):
        adata.X = adata.X.toarray()

    # -- Mitochondrial content
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
    )

    # -- min/max suggestion
    counts = adata.X.sum(axis=1)
    md = np.median(counts)
    sd = np.std(counts)
    if min_counts == -1:
        min_counts = max(0, md - sd)
    if max_counts == -1:
        max_counts = md + sd
    # # -- Doublet detection
    # if doublet_detection:
    #     scrub = scr.Scrublet(
    #         adata.X,
    #         total_counts=scrublet_kwargs["total_counts"],
    #         sim_doublet_ratio=scrublet_kwargs["sim_doublet_ratio"],
    #         n_neighbors=scrublet_kwargs["n_neighbors"],
    #         expected_doublet_rate=scrublet_kwargs["expected_doublet_rate"],
    #         stdev_doublet_rate=scrublet_kwargs["stdev_doublet_rate"],
    #         random_state=scrublet_kwargs["random_state"],
    #     )
    #     (
    #         adata.obs["doublet_scores"],
    #         adata.obs["predicted_doublets"],
    #     ) = scrub.scrub_doublets()
    #     inds1 = np.where(
    #         (~adata.obs["predicted_doublets"].values)
    #         & (adata.obs["total_counts"] < max_counts)
    #         & (adata.obs["total_counts"] > min_counts)
    #     )
    #     del scrub
    # else:
    inds1 = np.where(
        (adata.obs["total_counts"] > min_counts) &
        (adata.obs["total_counts"] < max_counts))
    inds2 = np.where(adata.obs["pct_counts_mt"] < max_mt_ratio)
    if verbose:
        # if doublet_detection:
        #     print(np.sum(adata.obs["predicted_doublets"]), "doublets encountered")
        #     print(len(inds1[0]), "cells pass the doublet and counts filters.")
        # else:
        print(len(inds1[0]), "cells pass the count filter")
        print(len(inds2[0]), " cells pass the mt filter")
    ind_cells = np.intersect1d(inds1[0], inds2[0])
    if verbose:
        print("Cells selected", len(ind_cells))
    adata._inplace_subset_obs(ind_cells)
    gc.collect()
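A hedged sketch of running `filter_cells` on synthetic counts; the data and gene names are made up (the `MT-` prefix marks mitochondrial genes), and the module-level imports used by the function (`scanpy as sc`, `numpy as np`, `gc`) are assumed to be present:

import numpy as np
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(500, 100)).astype(np.float32))
adata.var_names = [f"MT-{i}" if i < 5 else f"GENE{i}" for i in range(100)]
filter_cells(adata)                     # keeps cells within median +/- std counts and < 20% MT
print(adata.n_obs, "cells retained")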
Example #12
def balanced_pca(adata: anndata.AnnData,
                 groups: str = "pre_clusters",
                 max_cell_prop=0.1,
                 n_comps=200,
                 scale=False):
    """
    Given a categorical variable (e.g., a pre-clustering label), perform balanced PCA by downsampling
    cells in the large categories so that the overall population is more balanced and the PCs are
    expected to capture more variance among the small categories.

    Parameters
    ----------
    adata
        adata after preprocessing and feature selection steps
    groups
        the name of the categorical variable in adata.obs
    max_cell_prop
        any single category with cells > `n_cell * max_cell_prop` will be downsampled to this number.
    n_comps
        Number of components in PCA
    scale
        whether to scale the input matrix before PCA

    Returns
    -------
    adata with PC information stored in obsm, varm and uns, like :func:`scanpy.tl.pca` does.
    """
    # downsample large clusters
    use_cells = []
    size_to_downsample = max(int(adata.shape[0] * max_cell_prop), 50)
    for cluster, sub_df in adata.obs.groupby(groups):
        if sub_df.shape[0] > size_to_downsample:
            use_cells += sub_df.sample(size_to_downsample,
                                       random_state=0).index.tolist()
        else:
            use_cells += sub_df.index.tolist()

    # get training adata
    if len(use_cells) == adata.shape[0]:
        downsample = False
        adata_train = adata
    else:
        downsample = True
        adata_train = adata[use_cells, :].copy()

    # in case cells are smaller than n_comps
    n_comps = min(min(adata_train.shape), n_comps)

    # scale (optional)
    if scale:
        scaler = StandardScaler()
        adata_train.X = scaler.fit_transform(adata_train.X)
    else:
        scaler = None

    # pca
    sc.tl.pca(
        adata_train,
        n_comps=n_comps,
        zero_center=True,
        svd_solver="arpack",
        random_state=0,
        return_info=False,
        use_highly_variable=None,
        dtype="float32",
        copy=False,
        chunked=False,
        chunk_size=None,
    )

    # transfer PCA result to full adata
    if downsample:
        if scale:
            adata.X = scaler.transform(
                adata.X)  # scale all cells with the same scaler
        adata.varm["PCs"] = adata_train.varm["PCs"]
        adata.obsm["X_pca"] = adata.X @ adata_train.varm["PCs"]
        adata.uns["pca"] = adata_train.uns["pca"]
    return adata
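A short, hedged example of calling `balanced_pca` on synthetic data (assumes `scanpy`, `scikit-learn` and `anndata` are installed and the function above is importable):

import numpy as np
from anndata import AnnData

adata = AnnData(np.random.rand(300, 40).astype(np.float32))
adata.obs["pre_clusters"] = np.random.choice(["a", "b", "c"], size=300)
adata = balanced_pca(adata, groups="pre_clusters", n_comps=20, scale=True)
print(adata.obsm["X_pca"].shape)        # (300, 20)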
Example #13
def combat(adata: AnnData, key: str = 'batch', inplace: bool = True):
    """
    ComBat function for batch effect correction [Johnson07]_ [Leek12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes. This uses the
    implementation of `ComBat <https://github.com/brentp/combat.py>`__ [Pedersen12]_.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    key: `str`, optional (default: `"batch"`)
        Key to a categorical annotation from adata.obs that will be used for batch effect removal
    inplace: bool, optional (default: `True`)
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of inplace, either returns an updated AnnData object
        or modifies the passed one.
    """

    # check the input
    if key not in adata.obs.keys():
        raise ValueError(
            'Could not find the key {!r} in adata.obs'.format(key))

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(
        data=X,
        index=adata.var_names,
        columns=adata.obs_names,
    )

    # construct a pandas series of the batch annotation
    batch = pd.Series(adata.obs[key])
    model = pd.DataFrame({'batch': batch})
    batch_items = model.groupby("batch").groups.items()
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    sys.stderr.write("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = stand_data(model, data)

    # fitting the parameters on the standardized data
    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = np.dot(
        np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T),
        s_data.T)
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # temp stores our estimates for the batch effect parameters.
        # temp[0] is the additive batch effect
        # temp[1] is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data[batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )

        gamma_star.append(gamma)
        delta_star.append(delta)

    sys.stdout.write("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from above
    # loop over all batches in the data
    for j, batch_idxs in enumerate(batch_info):

        # we basically subtract the additive batch effect, rescale by the ratio
        # of the multiplicative batch effect to the pooled variance, and add the
        # overall gene-wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] -
                         np.dot(batch_design.loc[batch_idxs], gamma_star).T)
        bayesdata[batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones(
        (1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
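The helpers used above (`stand_data`, `aprior`, `bprior`, `_it_sol`) are not shown here; since this snippet mirrors scanpy's ComBat implementation, a hedged usage sketch via the public `sc.pp.combat` wrapper looks like this:

import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.rand(200, 30).astype(np.float32))
adata.obs["batch"] = ["b1"] * 100 + ["b2"] * 100
sc.pp.combat(adata, key="batch")                              # corrects adata.X in place
corrected = sc.pp.combat(adata, key="batch", inplace=False)   # or return the corrected matrix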
Example #14
def test_set_x_is_none():
    # test setter and getter
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4]))
    adata.X = None
    assert adata.X is None
Example #15
def _replace_back(adata: AnnData, replaced: Dict[str, utt.Matrix]) -> None:
    for name, matrix in replaced.items():
        if name == "__x__":
            adata.X = matrix
        else:
            adata.layers[name] = matrix
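Examples #1 and #15 are meant to be used as a pair; a minimal sketch, assuming the metacells `utt` utilities and `get_vo_proper` used above are importable and `adata` is an existing AnnData:

replaced = _replace_with_layout(adata, "row_major")   # force a layout, remembering the originals
try:
    pass  # run code that requires row-major matrices here
finally:
    _replace_back(adata, replaced)                    # restore the original matrices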
Example #16
def normalize_total(
    adata: AnnData,
    target_sum: Optional[float] = None,
    exclude_highly_expressed: bool = False,
    max_fraction: float = 0.05,
    key_added: Optional[str] = None,
    layers: Union[Literal['all'], Iterable[str]] = None,
    layer_norm: Optional[str] = None,
    inplace: bool = True,
) -> Optional[Dict[str, np.ndarray]]:
    """\
    Normalize counts per cell.

    If choosing `target_sum=1e6`, this is CPM normalization.

    If `exclude_highly_expressed=True`, very highly expressed genes are excluded
    from the computation of the normalization factor (size factor) for each
    cell. This is meaningful as these can strongly influence the resulting
    normalized values for all other genes [Weinreb17]_.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Params
    ------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    target_sum
        If `None`, after normalization, each observation (cell) has a total
        count equal to the median of total counts for observations (cells)
        before normalization.
    exclude_highly_expressed
        Exclude (very) highly expressed genes for the computation of the
        normalization factor (size factor) for each cell. A gene is considered
        highly expressed, if it has more than `max_fraction` of the total counts
        in at least one cell. The not-excluded genes will sum up to
        `target_sum`.
    max_fraction
        If `exclude_highly_expressed=True`, a gene is considered highly expressed
        if it has more than `max_fraction` of a cell's total counts in at least
        one cell.
    key_added
        Name of the field in `adata.obs` where the normalization factor is
        stored.
    layers
        List of layers to normalize. Set to `'all'` to normalize all layers.
    layer_norm
        Specifies how to normalize layers:

        * If `None`, after normalization, for each layer in *layers* each cell
          has a total count equal to the median of the *counts_per_cell* before
          normalization of the layer.
        * If `'after'`, for each layer in *layers* each cell has
          a total count equal to `target_sum`.
        * If `'X'`, for each layer in *layers* each cell has a total count
          equal to the median of total counts for observations (cells) of
          `adata.X` before normalization.

    inplace
        Whether to update `adata` or return dictionary with normalized copies of
        `adata.X` and `adata.layers`.

    Returns
    -------
    Returns dictionary with normalized copies of `adata.X` and `adata.layers`
    or updates `adata` with normalized version of the original
    `adata.X` and `adata.layers`, depending on `inplace`.

    Example
    --------
    >>> from anndata import AnnData
    >>> import scanpy as sc
    >>> sc.settings.verbosity = 2
    >>> np.set_printoptions(precision=2)
    >>> adata = AnnData(np.array([
    ...    [3, 3, 3, 6, 6],
    ...    [1, 1, 1, 2, 2],
    ...    [1, 22, 1, 2, 2],
    ... ]))
    >>> adata.X
    array([[ 3.,  3.,  3.,  6.,  6.],
           [ 1.,  1.,  1.,  2.,  2.],
           [ 1., 22.,  1.,  2.,  2.]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(adata, target_sum=1, inplace=False)['X']
    >>> X_norm
    array([[0.14, 0.14, 0.14, 0.29, 0.29],
           [0.14, 0.14, 0.14, 0.29, 0.29],
           [0.04, 0.79, 0.04, 0.07, 0.07]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(
    ...     adata, target_sum=1, exclude_highly_expressed=True,
    ...     max_fraction=0.2, inplace=False
    ... )['X']
    The following highly-expressed genes are not considered during normalization factor computation:
    ['1', '3', '4']
    >>> X_norm
    array([[ 0.5,  0.5,  0.5,  1. ,  1. ],
           [ 0.5,  0.5,  0.5,  1. ,  1. ],
           [ 0.5, 11. ,  0.5,  1. ,  1. ]], dtype=float32)
    """
    if max_fraction < 0 or max_fraction > 1:
        raise ValueError('Choose max_fraction between 0 and 1.')

    if layers == 'all':
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        raise ValueError(
            f"`layers` needs to be a list of strings or 'all', not {layers!r}")

    view_to_actual(adata)

    gene_subset = None
    msg = 'normalizing counts per cell'
    if exclude_highly_expressed:
        counts_per_cell = adata.X.sum(1)  # original counts per cell
        counts_per_cell = np.ravel(counts_per_cell)

        # genes where at least one cell has more than max_fraction of that cell's counts
        gene_subset = (adata.X >
                       counts_per_cell[:, None] * max_fraction).sum(0)
        gene_subset = (np.ravel(gene_subset) == 0)

        msg += (
            ' The following highly-expressed genes are not considered during '
            f'normalization factor computation:\n{adata.var_names[~gene_subset].tolist()}'
        )
    start = logg.info(msg)

    # counts per cell for subset, if max_fraction!=1
    X = adata.X if gene_subset is None else adata[:, gene_subset].X
    counts_per_cell = X.sum(1)
    # get rid of adata view
    counts_per_cell = np.ravel(counts_per_cell).copy()

    cell_subset = counts_per_cell > 0
    if not np.all(cell_subset):
        logg.warning('Some cells have total count of genes equal to zero')

    if layer_norm == 'after':
        after = target_sum
    elif layer_norm == 'X':
        after = np.median(counts_per_cell[cell_subset])
    elif layer_norm is None:
        after = None
    else:
        raise ValueError('layer_norm should be "after", "X" or None')
    del cell_subset

    if inplace:
        if key_added is not None:
            adata.obs[key_added] = counts_per_cell
        adata.X = _normalize_data(adata.X, counts_per_cell, target_sum)
    else:
        # not recarray because need to support sparse
        dat = dict(
            X=_normalize_data(adata.X, counts_per_cell, target_sum, copy=True),
            norm_factor=counts_per_cell,
        )

    for layer_name in (layers or ()):
        layer = adata.layers[layer_name]
        counts = np.ravel(layer.sum(1))
        if inplace:
            adata.layers[layer_name] = _normalize_data(layer, counts, after)
        else:
            dat[layer_name] = _normalize_data(layer, counts, after, copy=True)

    logg.info(
        '    finished ({time_passed})',
        time=start,
    )
    if key_added is not None:
        logg.debug(
            f'and added {key_added!r}, counts per cell before normalization (adata.obs)'
        )

    return dat if not inplace else None
Example #17
def magic(
    adata: AnnData,
    name_list: Union[str, Sequence[str], None] = None,
    k: int = 10,
    a: int = 15,
    t: str = 'auto',
    n_pca: int = 100,
    knn_dist: str = 'euclidean',
    random_state: Optional[Union[int, RandomState]] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    copy: Optional[bool] = None,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_.

    MAGIC is an algorithm for denoising and transcript recovery of single cells
    applied to single-cell sequencing data. MAGIC builds a graph from the data
    and uses diffusion to smooth out noise and recover the data manifold.

    More information and bug reports
    `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit
    <https://krishnaswamylab.org/get-help>.

    Parameters
    ----------
    adata
        An anndata file with `.raw` attribute representing raw counts.
    name_list
        Denoised genes to return. The default `'all_genes'`/`None`
        may require a large amount of memory if the input data is sparse.
        Another possibility is `'pca_only'`.
    k
        number of nearest neighbors on which to build kernel
    a
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    t
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    n_pca
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph. If 'precomputed',
        `data` should be an n_samples x n_samples distance or
        affinity matrix
    random_state
        Random seed. Defaults to the global `numpy` random number generator
    n_jobs
        Number of threads to use in training. All cores are used by default.
    verbose
        If `True` or an integer `>= 2`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy
        If true, a copy of anndata is returned. If `None`, `copy` is True if
        `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False
        if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data
        will otherwise have different column names from the input data.
    kwargs
        Additional arguments to `magic.MAGIC`

    Returns
    -------
    If `copy` is True, AnnData object is returned.

    If `subset_genes` is not `all_genes`, PCA on MAGIC values of cells are
    stored in `adata.obsm['X_magic']` and `adata.X` is not modified.

    The raw counts are stored in `.raw` attribute of AnnData object.

    Examples
    --------
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.paul15()
    >>> sc.pp.normalize_per_cell(adata)
    >>> sc.pp.sqrt(adata)  # or sc.pp.log1p(adata)
    >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], k=5)
    >>> adata_magic.shape
    (2730, 3)
    >>> sce.pp.magic(adata, name_list='pca_only', k=5)
    >>> adata.obsm['X_magic'].shape
    (2730, 100)
    >>> sce.pp.magic(adata, name_list='all_genes', k=5)
    >>> adata.X.shape
    (2730, 3451)
    """

    try:
        from magic import MAGIC
    except ImportError:
        raise ImportError(
            'Please install magic package via `pip install --user '
            'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`'
        )

    start = logg.info('computing MAGIC')
    all_or_pca = isinstance(name_list, (str, type(None)))
    if all_or_pca and name_list not in {"all_genes", "pca_only", None}:
        raise ValueError("Invalid string value for `name_list`: "
                         "Only `'all_genes'` and `'pca_only'` are allowed.")
    if copy is None:
        copy = not all_or_pca
    elif not all_or_pca and not copy:
        raise ValueError(
            "Can only perform MAGIC in-place with `name_list=='all_genes'` or "
            f"`name_list=='pca_only'` (got {name_list}). Consider setting "
            "`copy=True`")
    adata = adata.copy() if copy else adata
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs

    X_magic = MAGIC(
        k=k,
        a=a,
        t=t,
        n_pca=n_pca,
        knn_dist=knn_dist,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata, genes=name_list)
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_magic', PCA on MAGIC coordinates (adata.obsm)"
              if name_list == "pca_only" else ''))
    # update AnnData instance
    if name_list == "pca_only":
        # special case - update adata.obsm with smoothed values
        adata.obsm["X_magic"] = X_magic.X
    elif copy:
        # just return X_magic
        X_magic.raw = adata
        adata = X_magic
    else:
        # replace data with smoothed data
        adata.raw = adata
        adata.X = X_magic.X

    if copy:
        return adata
Example #18
def generate_synthetic_dataset(adata: AnnData,
                               sim_type: str = "avg",
                               seed: int = 42):
    """Create cell-aggregate samples for ground-truth spatial decomposition task.

    Parameters
    ----------
    adata : AnnData
        Anndata object.
    sim_type : str
        Simulation type: either average `'avg'` or per cell `'cell'`.
    seed: int
        Seed for rng.

    Returns
    -------
    AnnData with:
        - `adata_spatial.obsm["proportions_true"]`: true proportion values.
        - `adata_spatial.X`: simulated counts (aggregate of sc dataset).
        - `adata_spatial.uns["sc_reference"]`: original sc adata for reference.

    The cell type labels are stored in adata_sc.obs["label"].
    """

    rng = np.random.default_rng(seed)

    adata.obs["label"] = adata.obs.label.astype("category")

    if isinstance(adata.X, csr_matrix):
        adata.X = adata.X.todense()

    n_genes = adata.shape[1]
    n_cells = adata.shape[0]
    n_types = len(set(adata.obs["label"].values))

    # TODO(make these arguments)
    bead_depth = 1000
    num_of_beads = n_cells * 2
    # generate proportion values
    props = rng.dirichlet(np.ones(n_types), num_of_beads)

    true_proportion = np.zeros((num_of_beads, n_types))
    bead_to_gene_matrix = np.zeros((num_of_beads, n_genes))

    # if sim_type avg
    # generate from avg profiles
    if sim_type == "avg":
        profile_mean = obs_means(adata, "label")
        sc.pp.normalize_total(profile_mean, target_sum=1, inplace=True)
        # run for each bead
        for bead_index in range(num_of_beads):
            allocation = rng.multinomial(bead_depth,
                                         props[bead_index, :],
                                         size=1)[0]
            true_proportion[bead_index, :] = allocation.copy()
            for j in range(n_types):
                profile_mean.X[j, :] /= (profile_mean.X[j, :].sum() + 1e-5
                                         )  # trick to make sum(arr) < 1.0
                gene_exp = rng.multinomial(allocation[j],
                                           profile_mean.X[j, :],
                                           size=1)[0]
                bead_to_gene_matrix[bead_index, :] += gene_exp

    elif sim_type == "cell":
        # generate from cells
        # assign beads to actual cells
        # cell_ids with this cluster
        cells_to_sample_from_celltype = []
        grouped = adata.obs.groupby("label")
        for idx in grouped.indices.values():
            cells_to_sample_from_celltype += [idx]

        # Actual cells assigned randomly
        cell_association = np.zeros((num_of_beads, n_types)).astype(int)
        for j in range(n_types):
            cell_association[:, j] = rng.integers(
                low=0,
                high=len(cells_to_sample_from_celltype[j]),
                size=num_of_beads)

        counts = np.array(adata.X)
        rowSums = counts.sum(axis=1, keepdims=True)
        X_norm_prof = np.divide(counts, rowSums, where=rowSums > 0)

        for bead_index in range(num_of_beads):
            allocation = rng.multinomial(bead_depth,
                                         props[bead_index, :],
                                         size=1)[0]
            true_proportion[bead_index, :] = allocation.copy()
            for j in range(n_types):
                cell_index = cells_to_sample_from_celltype[j][cell_association[
                    bead_index, j]]
                print(cell_index)
                gene_exp = rng.multinomial(allocation[j],
                                           X_norm_prof[cell_index, :],
                                           size=1)[0]
                bead_to_gene_matrix[bead_index, :] += gene_exp
    else:
        raise ValueError(f"{sim_type} is not a valid key for `sim_type`.")

    bead_barcodes = np.arange(num_of_beads)

    adata_spatial = AnnData(
        bead_to_gene_matrix,
        obs=dict(obs_names=bead_barcodes),
        var=dict(var_names=adata.var_names),
    )

    true_proportion = true_proportion / true_proportion.sum(
        1)[:, np.newaxis].astype("float64")

    # fake coordinates
    adata_spatial.obsm["spatial"] = rng.random((adata_spatial.shape[0], 2))
    adata_spatial.obsm["proportions_true"] = true_proportion

    adata_spatial.uns["sc_reference"] = adata.copy()

    return adata_spatial
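A hedged example of calling the generator above with `sim_type="cell"` (the `"avg"` branch additionally needs the `obs_means` helper, which is not shown); the labels and counts are synthetic, and the module-level imports (`numpy as np`, `scanpy as sc`, `csr_matrix`) are assumed:

import numpy as np
from anndata import AnnData

adata_sc = AnnData(np.random.poisson(1.0, size=(100, 50)).astype(np.float64))
adata_sc.obs["label"] = np.random.choice(["A", "B", "C"], size=100)
adata_spatial = generate_synthetic_dataset(adata_sc, sim_type="cell", seed=0)
print(adata_spatial.obsm["proportions_true"].shape)   # (200, 3): one row of proportions per bead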
Example #19
def prep_simple(
    adata: AnnData,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = False,
    log_transform: bool = True,
    division_factor: float = 1,
    score_cc: bool = True,
    verbose: bool = True,
):
    """Pre-processes AnnData without pooling. Should be done only once.

    Parameters
    ----------
    adata: AnnData
        The raw AnnData object to be pre-processed
    normalize_counts: bool
        Set it to False if library does not need normalization
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after highly variable filter. Used if
        `filter_var_genes` is True. Passed to sc.pp.highly_variable_genes.
    for_pooling: bool
        Set to True if the function is called by the `prep_pooling` function.
        Changes the return object parameters.
    log_transform: bool
        Set it to False if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor, divides the counts matrix by this value.
    score_cc: bool
        If True, cell cycle scores will be added.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    - `None`
    """

    assert division_factor != 0, "Null division factor. Terminating..."
    if not isinstance(adata.X, np.ndarray):
        adata.X = adata.X.toarray()
    # np.divide(adata.X, division_factor, out=adata.X)

    if "total_counts" not in adata.obs.keys():
        adata.obs["total_counts"] = adata.X.sum(1)

    # Normalization step
    if normalize_counts:
        sc.pp.normalize_total(adata, target_sum=np.median(adata.obs["total_counts"]))

    # Score cell cycle (multiple signatures)
    if score_cc:
        if verbose:
            print("Scoring cell cycle...")
        _score_cell_cycle(adata, g1s_markers, "G1S_Tirosh")
        _score_cell_cycle(adata, g2m_markers, "G2M_Tirosh")
        _score_cell_cycle(adata, G1S_genes_Freeman, "G1S_Freeman")
        _score_cell_cycle(adata, G2M_genes_Freeman, "G2M_Freeman")
        _score_cell_cycle(adata, g1s_markers_short, "G1S_short")
        _score_cell_cycle(adata, g2m_markers_short, "G2M_short")
        _score_cell_cycle(adata, histone_markers, "Histones")

        adata.obs['G1-S'] = adata.obs['G1S_Tirosh']
        adata.obs['G2-M'] = adata.obs['G2M_Tirosh']

    # Highly variable genes filtering
    if filter_var_genes:
        variances = np.var(adata.X, axis=0)
        inds = np.flip(np.argsort(variances))
        ind_genes = inds[0:n_top_genes]
        if 0 in variances[ind_genes]:
            ind_first_zero = np.argwhere(variances[ind_genes] == 0)[0][0]
            ind_genes = ind_genes[0:ind_first_zero]
        adata._inplace_subset_var(ind_genes)

    # Logarithmization
    if log_transform:
        sc.pp.log1p(adata, base=10)

    if not for_pooling:
        adata.uns["scycle"] = {
            "preprocess": {
                "method": "simple",
                "n_top_genes": n_top_genes,
                "normalize_counts": normalize_counts,
                "filter_var_genes": filter_var_genes,
                "division_factor": division_factor,
                "log_transform": log_transform,
            }
        }
    gc.collect()
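A hedged sketch of the simple preprocessing on synthetic counts; cell-cycle scoring is switched off because the marker lists and the `_score_cell_cycle` helper are defined elsewhere in the package, and the module-level imports (`scanpy as sc`, `numpy as np`, `gc`) are assumed:

import numpy as np
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(100, 2000)).astype(np.float32))
prep_simple(adata, n_top_genes=500, score_cc=False, verbose=False)   # modifies adata in place
print(adata.shape)                      # (100, 500) after the highly variable gene filter
print(adata.uns["scycle"]["preprocess"]["method"])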
Example #20
def embedding(
    data: Union[AnnData, MuData],
    basis: str,
    color: Optional[Union[str, Sequence[str]]] = None,
    use_raw: Optional[bool] = None,
    layer: Optional[str] = None,
    **kwargs,
):
    """
    Scatter plot for .obs

    Produce a scatter plot in the defined basis,
    which can also be a basis inside any modality,
    e.g. ``"rna:X_pca"``.

    See :func:`scanpy.pl.embedding` for details.

    Parameters
    ----------
    data : Union[AnnData, MuData]
        MuData or AnnData object
    basis : str
        Name of the `obsm` basis to use
    color : Optional[Union[str, typing.Sequence[str]]], optional (default: None)
        Keys for variables or annotations of observations (.obs columns).
        Can be from any modality.
    use_raw : Optional[bool], optional (default: None)
        Use `.raw` attribute of the modality where a feature (from `color`) is derived from.
        If `None`, defaults to `True` if `.raw` is present and a valid `layer` is not provided.
    layer : Optional[str], optional (default: None)
        Name of the layer in the modality where a feature (from `color`) is derived from.
        No layer is used by default. If a valid `layer` is provided, this takes precedence
        over `use_raw=True`.
    """
    if isinstance(data, AnnData):
        return sc.pl.embedding(data,
                               basis=basis,
                               color=color,
                               use_raw=use_raw,
                               layer=layer,
                               **kwargs)

    # `data` is MuData
    if basis not in data.obsm and "X_" + basis in data.obsm:
        basis = "X_" + basis

    if basis in data.obsm:
        adata = data
        basis_mod = basis
    else:
        # basis is not a joint embedding
        try:
            mod, basis_mod = basis.split(":")
        except ValueError:
            raise ValueError(
                f"Basis {basis} is not present in the MuData object (.obsm)")

        if mod not in data.mod:
            raise ValueError(
                f"Modality {mod} is not present in the MuData object with modalities {', '.join(data.mod)}"
            )

        adata = data.mod[mod]
        if basis_mod not in adata.obsm:
            if "X_" + basis_mod in adata.obsm:
                basis_mod = "X_" + basis_mod
            elif len(adata.obsm) > 0:
                raise ValueError(
                    f"Basis {basis_mod} is not present in the modality {mod} with embeddings {', '.join(adata.obsm)}"
                )
            else:
                raise ValueError(
                    f"Basis {basis_mod} is not present in the modality {mod} with no embeddings"
                )

    obs = data.obs.loc[adata.obs.index.values]

    if color is None:
        ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp)
        return sc.pl.embedding(ad, basis=basis_mod, **kwargs)

    # Some `color` has been provided
    if isinstance(color, str):
        keys = [color]
    elif isinstance(color, Iterable):
        keys = color
    else:
        raise TypeError("Expected color to be a string or an iterable.")

    # Fetch respective features
    if not all([key in obs for key in keys]):
        # {'rna': [True, False], 'prot': [False, True]}
        keys_in_mod = {
            m: [key in data.mod[m].var_names for key in keys]
            for m in data.mod
        }

        # .raw slots might have exclusive var_names
        if use_raw is None or use_raw:
            for i, k in enumerate(keys):
                for m in data.mod:
                    if not keys_in_mod[m][i] and data.mod[m].raw is not None:
                        keys_in_mod[m][i] = k in data.mod[m].raw.var_names

        for m in data.mod:
            if np.sum(keys_in_mod[m]) > 0:
                mod_keys = np.array(keys)[keys_in_mod[m]]

                if use_raw is None or use_raw:
                    if data.mod[m].raw is not None:
                        keysidx = data.mod[m].raw.var.index.get_indexer_for(
                            mod_keys)
                        fmod_adata = AnnData(
                            X=data.mod[m].raw.X[:, keysidx],
                            var=pd.DataFrame(index=mod_keys),
                            obs=data.mod[m].obs,
                        )
                    else:
                        if use_raw:
                            warnings.warn(
                                f"Attibute .raw is None for the modality {m}, using .X instead"
                            )
                        fmod_adata = data.mod[m][:, mod_keys]
                else:
                    fmod_adata = data.mod[m][:, mod_keys]

                if layer is not None:
                    if layer in data.mod[m].layers:
                        fmod_adata.X = data.mod[m][:, mod_keys].layers[layer]
                        if use_raw:
                            warnings.warn(
                                f"Layer='{layer}' superseded use_raw={use_raw}"
                            )
                    else:
                        warnings.warn(
                            f"Layer {layer} is not present for the modality {m}, using count matrix instead"
                        )
                x = fmod_adata.X.toarray() if issparse(
                    fmod_adata.X) else fmod_adata.X
                obs = obs.join(
                    pd.DataFrame(x,
                                 columns=mod_keys,
                                 index=fmod_adata.obs_names),
                    how="left",
                )

    ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp, uns=adata.uns)
    return sc.pl.embedding(ad, basis=basis_mod, color=color, **kwargs)
Example #21
def sqrt_cpm(adata: ad.AnnData) -> ad.AnnData:
    """Normalize data to sqrt counts per million."""
    _cpm(adata)
    adata.X = scprep.transform.sqrt(adata.X)
    return adata
Example #22
def _high_dim(adata: AnnData) -> np.ndarray:
    adata.X = adata.layers["counts"]
    adata = log_cpm_hvg(adata)
    high_dim = adata.X
    return high_dim.A if issparse(high_dim) else high_dim
Example #23
File: _magic.py  Project: jeffhsu3/scanpy
def magic(
    adata: AnnData,
    name_list: Union[Literal['all_genes', 'pca_only'], Sequence[str], None] = None,
    *,
    knn: int = 5,
    decay: Optional[float] = 1,
    knn_max: Optional[int] = None,
    t: Union[Literal['auto'], int] = 3,
    n_pca: Optional[int] = 100,
    solver: Literal['exact', 'approximate'] = 'exact',
    knn_dist: str = 'euclidean',
    random_state: Optional[Union[int, RandomState]] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    copy: Optional[bool] = None,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_.

    MAGIC is an algorithm for denoising and transcript recovery of single cells
    applied to single-cell sequencing data. MAGIC builds a graph from the data
    and uses diffusion to smooth out noise and recover the data manifold.

    The algorithm implemented here has changed primarily in two ways
    compared to the algorithm described in [vanDijk18]_. Firstly, we use
    the adaptive kernel described in Moon et al, 2019 [Moon17]_ for
    improved stability. Secondly, data diffusion is applied
    in the PCA space, rather than the data space, for speed and
    memory improvements.

    More information and bug reports
    `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit
    <https://krishnaswamylab.org/get-help>.

    Parameters
    ----------
    adata
        An anndata file with `.raw` attribute representing raw counts.
    name_list
        Denoised genes to return. The default `'all_genes'`/`None`
        may require a large amount of memory if the input data is sparse.
        Another possibility is `'pca_only'`.
    knn
        number of nearest neighbors on which to build kernel.
    decay
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used.
    knn_max
        maximum number of nearest neighbors with nonzero connection.
        If `None`, will be set to 3 * `knn`.
    t
        power to which the diffusion operator is powered.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data.
    n_pca
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time. If `None`, no PCA is performed.
    solver
        Which solver to use. "exact" uses the implementation described
        in van Dijk et al. (2018) [vanDijk18]_. "approximate" uses a faster
        implementation that performs imputation in the PCA space and then
        projects back to the gene space. Note, the "approximate" solver may
        return negative values.
    knn_dist
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph. If 'precomputed',
        `data` should be an n_samples x n_samples distance or
        affinity matrix.
    random_state
        Random seed. Defaults to the global `numpy` random number generator.
    n_jobs
        Number of threads to use in training. All cores are used by default.
    verbose
        If `True` or an integer `>= 2`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy
        If true, a copy of anndata is returned. If `None`, `copy` is True if
        `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False
        if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data
        will otherwise have different column names from the input data.
    kwargs
        Additional arguments to `magic.MAGIC`.

    Returns
    -------
    If `copy` is True, AnnData object is returned.

    If `subset_genes` is not `all_genes`, PCA on MAGIC values of cells are
    stored in `adata.obsm['X_magic']` and `adata.X` is not modified.

    The raw counts are stored in `.raw` attribute of AnnData object.

    Examples
    --------
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.paul15()
    >>> sc.pp.normalize_per_cell(adata)
    >>> sc.pp.sqrt(adata)  # or sc.pp.log1p(adata)
    >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], knn=5)
    >>> adata_magic.shape
    (2730, 3)
    >>> sce.pp.magic(adata, name_list='pca_only', knn=5)
    >>> adata.obsm['X_magic'].shape
    (2730, 100)
    >>> sce.pp.magic(adata, name_list='all_genes', knn=5)
    >>> adata.X.shape
    (2730, 3451)
    """

    try:
        from magic import MAGIC, __version__
    except ImportError:
        raise ImportError(
            'Please install magic package via `pip install --user '
            'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`'
        )
    else:
        if not version.parse(__version__) >= version.parse(MIN_VERSION):
            raise ImportError(
                'scanpy requires magic-impute >= '
                f'v{MIN_VERSION} (detected: v{__version__}). '
                'Please update magic package via `pip install --user '
                '--upgrade magic-impute`'
            )

    start = logg.info('computing MAGIC')
    all_or_pca = isinstance(name_list, (str, type(None)))
    if all_or_pca and name_list not in {"all_genes", "pca_only", None}:
        raise ValueError(
            "Invalid string value for `name_list`: "
            "Only `'all_genes'` and `'pca_only'` are allowed."
        )
    if copy is None:
        copy = not all_or_pca
    elif not all_or_pca and not copy:
        raise ValueError(
            "Can only perform MAGIC in-place with `name_list=='all_genes'` or "
            f"`name_list=='pca_only'` (got {name_list}). Consider setting "
            "`copy=True`"
        )
    adata = adata.copy() if copy else adata
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs

    X_magic = MAGIC(
        knn=knn,
        decay=decay,
        knn_max=knn_max,
        t=t,
        n_pca=n_pca,
        solver=solver,
        knn_dist=knn_dist,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata, genes=name_list)
    logg.info(
        '    finished',
        time=start,
        deep=(
            "added\n    'X_magic', PCA on MAGIC coordinates (adata.obsm)"
            if name_list == "pca_only"
            else ''
        ),
    )
    # update AnnData instance
    if name_list == "pca_only":
        # special case – update adata.obsm with smoothed values
        adata.obsm["X_magic"] = X_magic.X
    elif copy:
        # just return X_magic
        X_magic.raw = adata
        adata = X_magic
    else:
        # replace data with smoothed data
        adata.raw = adata
        adata.X = X_magic.X

    if copy:
        return adata
Example #24
def pseudo_spot(
    adata: AnnData,
    tile_path: Union[Path, str] = Path("/tmp/tiles"),
    use_data: str = "raw",
    crop_size: Union[int, str] = "auto",
    platform: _PLATFORM = "Visium",
    weights: _WEIGHTING_MATRIX = "weights_matrix_all",
    copy: _COPY = "pseudo_spot_adata",
) -> Optional[AnnData]:
    """\
    Use spatial location (S), tissue morphological feature (M) and gene expression (E) information to impute
    gaps between spots and increase the resolution of gene detection

    Parameters
    ----------
    adata
        Annotated data matrix.
    use_data
        Input data, can be `raw` counts, log transformed data or dimension reduced space(`X_pca` and `X_umap`)
    tile_path
        Path to save spot image tiles
    crop_size
        Size of tiles
        if `auto`, automatically detect crop size
    weights
        Weighting matrix for imputation.
        if `weights_matrix_all`, matrix combined all information from spatial location (S),
        tissue morphological feature (M) and gene expression (E)
        if `weights_matrix_pd_md`, matrix combined information from spatial location (S),
        tissue morphological feature (M)
    platform
        `Visium` or `Old_ST`
    copy
        Return Anndata
        if `pseudo_spot_adata`, the imputed AnnData
        if `combined_adata`, the original AnnData merged with the imputed AnnData.
    Returns
    -------
    Anndata
    """
    from sklearn.linear_model import LinearRegression
    import math

    if platform == "Visium":
        img_row = adata.obs["imagerow"]
        img_col = adata.obs["imagecol"]
        array_row = adata.obs["array_row"]
        array_col = adata.obs["array_col"]
        rate = 3
        obs_df_ = adata.obs[["array_row", "array_col"]].copy()
        obs_df_.loc[:, "array_row"] = obs_df_["array_row"].apply(lambda x: x - 2 / 3)
        obs_df = adata.obs[["array_row", "array_col"]].copy()
        obs_df.loc[:, "array_row"] = obs_df["array_row"].apply(lambda x: x + 2 / 3)
        obs_df = obs_df.append(obs_df_).reset_index()
        obs_df = obs_df.drop_duplicates(subset=["array_row", "array_col"], keep="last")

    elif platform == "Old_ST":
        img_row = adata.obs["imagerow"]
        img_col = adata.obs["imagecol"]
        array_row = adata.obs_names.map(lambda x: x.split("x")[1])
        array_col = adata.obs_names.map(lambda x: x.split("x")[0])
        rate = 1.5
        # build the eight half-step neighbours of every spot in array coordinates
        base = pd.DataFrame(
            {"array_row": array_row.to_list(), "array_col": array_col.to_list()},
            dtype=np.float64,
        )
        offsets = [
            (-1 / 2, 0),        # left
            (1 / 2, 0),         # right
            (0, -1 / 2),        # up
            (0, 1 / 2),         # down
            (-1 / 2, -1 / 2),   # left-up
            (1 / 2, -1 / 2),    # right-up
            (-1 / 2, 1 / 2),    # left-down
            (1 / 2, 1 / 2),     # right-down
        ]
        shifted = []
        for d_row, d_col in offsets:
            df = base.copy()
            df["array_row"] = df["array_row"] + d_row
            df["array_col"] = df["array_col"] + d_col
            shifted.append(df)

        obs_df = pd.concat(shifted).reset_index()
        obs_df = obs_df.drop_duplicates(subset=["array_row", "array_col"], keep="last")
    else:
        raise ValueError(
            f"""\
                {platform!r} is not supported.
                """
        )

    reg_row = LinearRegression().fit(array_row.values.reshape(-1, 1), img_row)

    reg_col = LinearRegression().fit(array_col.values.reshape(-1, 1), img_col)

    obs_df.loc[:, "imagerow"] = (
        obs_df.loc[:, "array_row"] * reg_row.coef_ + reg_row.intercept_
    )
    obs_df.loc[:, "imagecol"] = (
        obs_df.loc[:, "array_col"] * reg_col.coef_ + reg_col.intercept_
    )

    impute_coor = obs_df[["imagecol", "imagerow"]]
    coor = pd.concat([adata.obs[["imagecol", "imagerow"]], impute_coor])

    point_tree = scipy.spatial.cKDTree(coor)
    n_neighbour = []
    unit = math.sqrt(reg_row.coef_[0] ** 2 + reg_col.coef_[0] ** 2)
    for i in range(len(impute_coor)):
        current_neighbour = point_tree.query_ball_point(
            impute_coor.values[i], round(unit)
        )

        current_neighbour = [x for x in current_neighbour if x < len(adata)]
        n_neighbour.append(len(current_neighbour))

    obs_df["n_neighbour"] = n_neighbour
    obs_df = obs_df.loc[obs_df["n_neighbour"] > 1, :].reset_index()

    obs_df.index = obs_df.index.map(lambda x: "Pseudo_Spot_" + str(x))

    impute_df = pd.DataFrame(0, index=obs_df.index, columns=adata.var_names)

    pseudo_spot_adata = AnnData(impute_df, obs=obs_df)
    pseudo_spot_adata.uns["spatial"] = adata.uns["spatial"]

    if crop_size == "auto":
        crop_size = round(unit / 2)

    stlearn.pp.tiling(pseudo_spot_adata, tile_path, crop_size=crop_size)

    stlearn.pp.extract_feature(pseudo_spot_adata)

    if use_data == "raw":
        if isinstance(adata.X, csr_matrix):
            count_embed = adata.X.toarray()
        elif isinstance(adata.X, np.ndarray):
            count_embed = adata.X
        elif isinstance(adata.X, pd.DataFrame):
            count_embed = adata.X.values
        else:
            raise TypeError(f"{type(adata.X)} is not a valid type for adata.X")
    else:
        count_embed = adata.obsm[use_data]

    calculate_weight_matrix(
        adata, pseudo_spot_adata, pseudo_spots=True, platform=platform
    )

    impute_neighbour(pseudo_spot_adata, count_embed=count_embed, weights=weights)

    assert pseudo_spot_adata.shape == pseudo_spot_adata.obsm["imputed_data"].shape

    pseudo_spot_adata.X = pseudo_spot_adata.obsm["imputed_data"]

    pseudo_spot_adata = pseudo_spot_adata[np.sum(pseudo_spot_adata.X, axis=1) > 0]

    print("Done")

    if copy == "pseudo_spot_adata":
        return pseudo_spot_adata
    else:
        return _merge(adata, pseudo_spot_adata)
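
A hedged usage sketch for `pseudo_spot`, assuming a Visium dataset loaded with stlearn whose `.obs` already carries `imagerow`/`imagecol` and `array_row`/`array_col`; paths, the loader and all parameter values below are placeholders, not part of the function above.

import stlearn as st

adata = st.Read10X("./visium_sample")        # assumes stlearn's Visium reader
st.pp.tiling(adata, "/tmp/tiles")            # tile the H&E image so morphology features exist
st.pp.extract_feature(adata)

combined = pseudo_spot(
    adata,
    tile_path="/tmp/tiles",
    use_data="raw",
    platform="Visium",
    weights="weights_matrix_all",
    copy="combined_adata",                   # return the original and imputed spots merged
)
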
Example #25
def vlm_to_adata(vlm, trans_mats=None, cells_ixs=None, em_key=None):
    """ Conversion function from the velocyto world to the scanpy world

	Parameters
	--------
	vlm: VelocytoLoom Object
	trans_mats: None or dict
		A dict of all relevant transition matrices
	cell_ixs: list of int
		These are the indices of the subsampled cells

	Output
	adata: AnnData object
	"""

    # create the anndata object
    adata = AnnData(vlm.Sx_sz.T,
                    vlm.ca,
                    vlm.ra,
                    layers=dict(unspliced=vlm.U.T,
                                spliced=vlm.S.T,
                                velocity=vlm.velocity.T),
                    uns=dict(velocity_graph=vlm.corrcoef,
                             louvain_colors=list(np.unique(vlm.colorandum))))

    # add uns annotations
    if trans_mats is not None:
        for key, value in trans_mats.items():
            adata.uns[key] = value
    if cells_ixs is not None:
        adata.uns['cell_ixs'] = cells_ixs

    # rename clusters to louvain
    try:
        ix = np.where(adata.obs.columns == 'Clusters')[0][0]
        obs_names = list(adata.obs.columns)
        obs_names[ix] = 'louvain'
        adata.obs.columns = obs_names

        # make louvain a categorical field
        adata.obs['louvain'] = pd.Categorical(adata.obs['louvain'])
    except IndexError:
        print('Could not find a field \'Clusters\' in vlm.ca.')

    # save the pca embedding
    adata.obsm['X_pca'] = vlm.pcs[:, range(50)]

    # transfer the embedding
    if em_key is not None:
        adata.obsm['X_' + em_key] = vlm.ts
        adata.obsm['velocity_' + em_key] = vlm.delta_embedding

    # make things sparse
    adata.X = scp.sparse.csr_matrix(adata.X)
    adata.uns['velocity_graph'] = scp.sparse.csr_matrix(
        adata.uns['velocity_graph'])

    # make the layers sparse
    adata.layers['unspliced'] = scp.sparse.csr_matrix(
        adata.layers['unspliced'])
    adata.layers['spliced'] = scp.sparse.csr_matrix(adata.layers['spliced'])
    adata.layers['velocity'] = scp.sparse.csr_matrix(adata.layers['velocity'])

    return adata
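
A hedged usage sketch for `vlm_to_adata`: it assumes a `velocyto.VelocytoLoom` object on which the usual velocity pipeline has already been run, so that `Sx_sz`, `U`, `S`, `velocity`, `corrcoef`, `colorandum`, `pcs` and `ts` all exist; the loom path and embedding name are illustrative.

import scanpy as sc
import velocyto as vcy

vlm = vcy.VelocytoLoom("sample.loom")        # placeholder path
# ... run the standard velocyto steps here (normalization, PCA, kNN imputation,
#     gamma fit, velocity estimation, transition probabilities, embedding) ...

adata = vlm_to_adata(vlm, em_key="tsne")     # assumes vlm.ts holds a t-SNE embedding
sc.pl.embedding(adata, basis="tsne", color="louvain")
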
Example #26
def prep_simple(
    adata: AnnData,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = True,
    log_transform: bool = True,
    division_factor: float = 1,
    verbose: bool = True,
):
    """Pre-processes AnnData without pooling. Should be done only once.

    Parameters
    ----------
    adata: AnnData
        The raw AnnData object to be pre-processed
    normalize_counts: bool
        Set it to False if the library does not need normalization.
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after the highly-variable-gene filter. Used if
        `filter_var_genes` is True.
    for_pooling: bool
        Set to True if the function is called by the `prep_pooling` function.
        Changes the return object parameters.
    log_transform: bool
        Set it to False if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor; the counts matrix is divided by this value.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    None
    """

    assert division_factor != 0, "Null division factor. Terminating..."
    adata.X = adata.X / division_factor

    # Normalization step
    if normalize_counts:
        sc.pp.normalize_total(adata,
                              target_sum=np.median(adata.obs["total_counts"]))

    # Highly variable genes filtering
    if filter_var_genes:
        variances = np.var(adata.X, axis=0)
        inds = np.flip(np.argsort(variances))
        ind_genes = inds[0:n_top_genes]
        if 0 in variances[ind_genes]:
            ind_first_zero = np.argwhere(variances[ind_genes] == 0)[0][0]
            ind_genes = ind_genes[0:ind_first_zero]
        adata._inplace_subset_var(ind_genes)

    # Logarithmization
    if log_transform:
        sc.pp.log1p(adata, base=10)

    if not for_pooling:
        adata.uns["scycle"] = {
            "preprocess": {
                "method": "simple",
                "n_top_genes": n_top_genes,
                "normalize_counts": normalize_counts,
                "filter_var_genes": filter_var_genes,
                "division_factor": division_factor,
                "log_transform": log_transform,
                "n_top_genes": n_top_genes,
            }
        }
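
A small usage sketch for `prep_simple`, assuming a raw-counts AnnData with a dense `.X` (the variance filter above applies `np.var` directly) and QC metrics computed so that `obs['total_counts']` exists; the file name is a placeholder.

import scanpy as sc

adata = sc.read_h5ad("raw_counts.h5ad")            # placeholder input
sc.pp.calculate_qc_metrics(adata, inplace=True)    # provides obs['total_counts']
prep_simple(adata, n_top_genes=5000, for_pooling=False)
print(adata.uns["scycle"]["preprocess"])
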
Example #27
def combat(
    adata: AnnData,
    key: str = 'batch',
    covariates: Optional[Collection[str]] = None,
    inplace: bool = True,
) -> Union[AnnData, np.ndarray, None]:
    """\
    ComBat function for batch effect correction [Johnson07]_ [Leek12]_
    [Pedersen12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes.
    This uses the implementation `combat.py`_ [Pedersen12]_.

    .. _combat.py: https://github.com/brentp/combat.py

    Parameters
    ----------
    adata
        Annotated data matrix
    key
        Key to a categorical annotation from :attr:`~anndata.AnnData.obs`
        that will be used for batch effect removal.
    covariates
        Additional covariates besides the batch variable such as adjustment
        variables or biological condition. This parameter refers to the design
        matrix `X` in Equation 2.1 in [Johnson07]_ and to the `mod` argument in
        the original combat function in the sva R package.
        Note that not including covariates may introduce bias or lead to the
        removal of biological signal in unbalanced designs.
    inplace
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of `inplace`, either returns the corrected matrix
    or modifies `adata.X`.
    """

    # check the input
    if key not in adata.obs_keys():
        raise ValueError('Could not find the key {!r} in adata.obs'.format(key))

    if covariates is not None:
        cov_exist = np.isin(covariates, adata.obs_keys())
        if np.any(~cov_exist):
            missing_cov = np.array(covariates)[~cov_exist].tolist()
            raise ValueError(
                'Could not find the covariate(s) {!r} in adata.obs'.format(missing_cov)
            )

        if key in covariates:
            raise ValueError('Batch key and covariates cannot overlap')

        if len(covariates) != len(set(covariates)):
            raise ValueError('Covariates must be unique')

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(data=X, index=adata.var_names, columns=adata.obs_names,)

    sanitize_anndata(adata)

    # construct a pandas series of the batch annotation
    model = adata.obs[[key] + (covariates if covariates else [])]
    batch_info = model.groupby(key).indices.values()
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    logg.info("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = _standardize_data(model, data, key)

    # fitting the parameters on the standardized data
    logg.info("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = (
        la.inv(batch_design.T @ batch_design) @ batch_design.T @ s_data.T
    ).values
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data.iloc[:, batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(_aprior, delta_hat))
    b_prior = list(map(_bprior, delta_hat))

    logg.info("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # temp stores our estimates for the batch effect parameters.
        # temp[0] is the additive batch effect
        # temp[1] is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data.iloc[:, batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )

        gamma_star.append(gamma)
        delta_star.append(delta)

    logg.info("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from above
    # loop over all batches in the data
    for j, batch_idxs in enumerate(batch_info):
        # subtract the additive batch effect, rescale by the ratio of the
        # multiplicative batch effect to the pooled variance, and add back the
        # overall gene-wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(
            bayesdata.iloc[:, batch_idxs]
            - np.dot(batch_design.iloc[batch_idxs], gamma_star).T
        )
        bayesdata.iloc[:, batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
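
This appears to be the implementation behind scanpy's `sc.pp.combat`, so it can be exercised either directly or through the public wrapper. A brief sketch on toy data; the `batch` column is fabricated purely for illustration.

import numpy as np
import scanpy as sc

adata = sc.datasets.blobs()                                    # toy dataset with a dense .X
adata.obs["batch"] = np.where(np.arange(adata.n_obs) % 2 == 0, "a", "b")
corrected = combat(adata, key="batch", inplace=False)          # cells x genes array
# or, equivalently, in place via the public API: sc.pp.combat(adata, key="batch")
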
Example #28
def prep_pooling(
    adata: AnnData,
    dim_red_method_pooling: str = "pca",
    n_neighbors: int = 5,
    embed_n_comps: int = 20,
    filter_cells: bool = True,
    min_counts: int = 10000,
    max_counts: int = 40000,
    max_mt_ratio: int = 20,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = True,
    log_transform: bool = True,
    division_factor: float = 1,
    verbose: bool = True,
):
    """Pre-processes AnnData without pooling

    Parameters
    ----------
    adata: AnnData
        The AnnData object to be pre-processed. This should already have been
        processed to remove "bad cells" (high mitochondrial percentage,
        aberrant total counts).
    dim_red_method_pooling: str
        Method to use for dimensionality reduction to do the pooling procedure.
        Default: 'pca'. TO-DO: support 'ica' and others?
    n_neighbors: int
        Number of nearest neighbors to use for pooling.
    embed_n_comps: int
        Number of components to use for the embedding to do the pooling.
    filter_cells: bool
        Set it to False if bad quality cells were already filtered
    min_counts: int
        Minimum number of counts required for a cell to pass filtering.
    max_counts: int
        Maximum number of counts required for a cell to pass filtering.
    max_mt_ratio: int
        Maximum proportion of mitochondrial genes in a cell to pass
        filtering.
    normalize_counts: bool
        Set it to False if the library does not need normalization.
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after the highly-variable-gene filter. Used if
        `filter_var_genes` is True.
    for_pooling: bool
        Set to True if the function is called by the `prep_pooling` function.
        Changes the return object parameters.
    log_transform: bool
        Set it to False if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor; the counts matrix is divided by this value.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    None
    """

    if "scycle" in adata.uns:
        raise Exception("Data has already been pre-processed")

    if verbose:
        print("Preparing embedding...")

    assert division_factor != 0, "Null division factor. Terminating..."
    adata.X = adata.X / division_factor

    if filter_cells:
        quality_control(adata, min_counts, max_counts, max_mt_ratio, verbose)

    adata_simple = adata.copy()
    prep_simple(
        adata_simple,
        normalize_counts,
        filter_var_genes,
        n_top_genes,
        True,
        log_transform,
        1,
        False,
    )

    if verbose:
        print("Embedding for pooling...")
    X_embed = _embed_for_pooling(adata_simple,
                                 dim_red_method_pooling,
                                 n_comps=embed_n_comps)

    if verbose:
        print("Pooling", str(X_embed.shape[0]), "samples...")
    _smooth_adata_by_pooling(adata, X_embed, n_neighbours=n_neighbors)
    prep_simple(
        adata,
        normalize_counts,
        filter_var_genes,
        n_top_genes,
        False,
        log_transform,
        1,
        verbose,
    )

    adata.uns["scycle"] = {
        "preprocess": {
            "method": "pooling",
            "n_neighbors": n_neighbors,
            "min_counts": min_counts,
            "max_counts": max_counts,
            "max_mt_ratio": max_mt_ratio,
            "normalize_counts": normalize_counts,
            "filter_var_genes": filter_var_genes,
            "division_factor": division_factor,
            "log_transform": log_transform,
            "n_top_genes": n_top_genes,
            "embed_n_comps": embed_n_comps,
        }
    }
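
A hedged end-to-end sketch for `prep_pooling`, assuming a raw AnnData with the QC columns that `quality_control` relies on (total counts, mitochondrial percentage) already present; thresholds and the file name are illustrative.

import scanpy as sc

adata = sc.read_h5ad("raw_counts.h5ad")            # placeholder input
sc.pp.calculate_qc_metrics(adata, inplace=True)
prep_pooling(
    adata,
    n_neighbors=5,
    min_counts=5000,
    max_counts=50000,
    n_top_genes=5000,
)
print(adata.uns["scycle"]["preprocess"]["method"])  # 'pooling'
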
Example #29
def regress_out(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    n_jobs: Optional[int] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Regress out (mostly) unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]_. Note that this function tends to overcorrect
    in certain circumstances as described in :issue:`526`.

    Parameters
    ----------
    adata
        The annotated data matrix.
    keys
        Keys for observation annotations on which to regress.
    n_jobs
        Number of jobs for parallel computation.
        `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
    copy
        Determines whether a copy of `adata` is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    start = logg.info(f'regressing out {keys}')
    if issparse(adata.X):
        logg.info('    sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata

    sanitize_anndata(adata)

    # TODO: This should throw an implicit modification warning
    if adata.is_view:
        adata._init_as_actual(adata.copy())

    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(
            adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.debug('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns in chunks of size n_chunk
    # (the last chunk could be of smaller size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk, e.g. (adata.X[:, 0:100]), and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append(tuple((data_chunk, regres, variable_is_categorical)))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing
        pool = multiprocessing.Pool(n_jobs)
        res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
        pool.close()

    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', time=start)
    return adata if copy else None
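
Typical usage mirrors the scanpy workflow: compute QC covariates, normalize and log-transform, then regress the technical covariates out of `.X`. The column names below come from `sc.pp.calculate_qc_metrics`.

import scanpy as sc

adata = sc.datasets.pbmc3k()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"])
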
Example #30
File: hashsolo.py  Project: yynst2/solo
def hashsolo(
    cell_hashing_adata: anndata.AnnData,
    priors: list = [.01, .8, .19],
    pre_existing_clusters: str = None,
    clustering_data: anndata.AnnData = None,
    resolutions: list = [.1, .25, .5, .75, 1],
    number_of_noise_barcodes: int = None,
    inplace: bool = True,
):
    '''Demultiplex cell hashing dataset using HashSolo method

    Parameters
    ----------
    cell_hashing_adata : anndata.AnnData
        Anndata object filled only with hashing counts
    priors : list,
        a list of your prior for each hypothesis
        first element is your prior for the negative hypothesis
        second element is your prior for the singlet hypothesis
        third element is your prior for the doublet hypothesis
        We use [0.01, 0.8, 0.19] by default because we assume the barcodes
        in your cell hashing matrix are those cells which have passed QC
        in the transcriptome space, e.g. UMI counts, pct mito reads, etc.
    clustering_data : anndata.AnnData
        transcriptional data for clustering
    resolutions : list
        clustering resolutions for leiden
    pre_existing_clusters : str
        column in cell_hashing_adata.obs for how to break up demultiplexing
    inplace : bool
        To do operation in place

    Returns
    -------
    cell_hashing_adata : AnnData
        if inplace is False, returns AnnData with demultiplexing results
        in the .obs attribute; otherwise writes them in place
    '''
    if issparse(cell_hashing_adata.X):
        cell_hashing_adata.X = np.array(cell_hashing_adata.X.todense())

    if clustering_data is not None:
        print(
            'This may take a while; we are running clustering at {} different resolutions'
            .format(len(resolutions)))
        if not all(clustering_data.obs_names == cell_hashing_adata.obs_names):
            raise ValueError(
                'clustering_data and cell hashing cell_hashing_adata must have same index'
            )
        cell_hashing_adata.obs['best_leiden'] = _get_clusters(
            clustering_data, resolutions)

    data = cell_hashing_adata.X
    num_of_cells = cell_hashing_adata.shape[0]
    results = pd.DataFrame(np.zeros((num_of_cells, 6)),
                           columns=[
                               'most_likely_hypothesis',
                               'probs_hypotheses',
                               'cluster_feature',
                               'negative_hypothesis_probability',
                               'singlet_hypothesis_probability',
                               'doublet_hypothesis_probability',
                           ],
                           index=cell_hashing_adata.obs_names)
    if clustering_data is not None or pre_existing_clusters is not None:
        cluster_features = 'best_leiden' if pre_existing_clusters is None else pre_existing_clusters
        unique_cluster_features = np.unique(
            cell_hashing_adata.obs[cluster_features])
        for cluster_feature in unique_cluster_features:
            cluster_feature_bool_vector = cell_hashing_adata.obs[
                cluster_features] == cluster_feature
            posterior_dict = _calculate_bayes_rule(
                data[cluster_feature_bool_vector], priors,
                number_of_noise_barcodes)
            results.loc[cluster_feature_bool_vector,
                        'most_likely_hypothesis'] = posterior_dict[
                            'most_likely_hypothesis']
            results.loc[cluster_feature_bool_vector,
                        'cluster_feature'] = cluster_feature
            results.loc[cluster_feature_bool_vector,
                        'negative_hypothesis_probability'] = posterior_dict[
                            'probs_hypotheses'][:, 0]
            results.loc[cluster_feature_bool_vector,
                        'singlet_hypothesis_probability'] = posterior_dict[
                            'probs_hypotheses'][:, 1]
            results.loc[cluster_feature_bool_vector,
                        'doublet_hypothesis_probability'] = posterior_dict[
                            'probs_hypotheses'][:, 2]
    else:
        posterior_dict = _calculate_bayes_rule(data, priors,
                                               number_of_noise_barcodes)
        results.loc[:, 'most_likely_hypothesis'] = posterior_dict[
            'most_likely_hypothesis']
        results.loc[:, 'cluster_feature'] = 0
        results.loc[:, 'negative_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'][:, 0]
        results.loc[:, 'singlet_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'][:, 1]
        results.loc[:, 'doublet_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'][:, 2]

    cell_hashing_adata.obs['most_likely_hypothesis'] = results.loc[
        cell_hashing_adata.obs_names, 'most_likely_hypothesis']
    cell_hashing_adata.obs['cluster_feature'] = results.loc[
        cell_hashing_adata.obs_names, 'cluster_feature']
    cell_hashing_adata.obs['negative_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'negative_hypothesis_probability']
    cell_hashing_adata.obs['singlet_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'singlet_hypothesis_probability']
    cell_hashing_adata.obs['doublet_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'doublet_hypothesis_probability']

    cell_hashing_adata.obs['Classification'] = None
    cell_hashing_adata.obs.loc[
        cell_hashing_adata.obs['most_likely_hypothesis'] == 2,
        'Classification'] = 'Doublet'
    cell_hashing_adata.obs.loc[
        cell_hashing_adata.obs['most_likely_hypothesis'] == 0,
        'Classification'] = 'Negative'
    all_sings = cell_hashing_adata.obs['most_likely_hypothesis'] == 1
    singlet_sample_index = np.argmax(cell_hashing_adata.X[all_sings], axis=1)
    cell_hashing_adata.obs.loc[
        all_sings,
        'Classification'] = cell_hashing_adata.var_names[singlet_sample_index]

    return cell_hashing_adata if not inplace else None
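
A hedged usage sketch for `hashsolo`, assuming an AnnData whose `.X` contains only the hashing-oligo counts (cells x HTO barcodes); the file path is a placeholder.

import anndata

cell_hashing_adata = anndata.read_h5ad("hto_counts.h5ad")   # placeholder HTO count matrix
hashsolo(cell_hashing_adata, priors=[0.01, 0.8, 0.19])
print(cell_hashing_adata.obs["Classification"].value_counts())
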