Пример #1
0
def correct_batch(data: MultimodalData, features: str = None) -> None:
    """Batch correction on data using Location-Scale (L/S) Adjustment method. ([Li-and-Wong03]_, [Li20]_). If L/S adjustment method is used, users must call this function every time before they call the pca function.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    features: `str`, optional, default: ``None``
        Features to be included in batch correction computation. If ``None``, simply consider all features.

    Returns
    -------
    ``None``

    Update ``data.X`` by the corrected count matrix.

    Examples
    --------
    >>> pg.correct_batch(data, features = "highly_variable_features")
    """

    tot_seconds = 0.0

    # estimate adjustment parameters
    start = time.perf_counter()
    can_correct = estimate_adjustment_matrices(data)
    end = time.perf_counter()
    tot_seconds += end - start
    logger.info("Adjustment parameters are estimated.")

    # select dense matrix
    keyword = select_features(
        data, features=features, standardize=False,
        max_value=None)  # do not standardize or truncate max_value
    logger.info("Features are selected.")

    if can_correct:
        start = time.perf_counter()
        correct_batch_effects(data, keyword, features)
        end = time.perf_counter()
        tot_seconds += end - start
        logger.info(
            "Batch correction is finished. Time spent = {:.2f}s.".format(
                tot_seconds))
Пример #2
0
def correct_batch(data: AnnData, features: str = None) -> None:
    """Batch correction on data.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    features: `str`, optional, default: ``None``
        Features to be included in batch correction computation. If ``None``, simply consider all features.

    Returns
    -------
    ``None``

    Update ``data.X`` by the corrected count matrix.

    Examples
    --------
    >>> pg.correct_batch(adata, features = "highly_variable_features")
    """

    tot_seconds = 0.0

    # estimate adjustment parameters
    start = time.perf_counter()
    can_correct = estimate_adjustment_matrices(data)
    end = time.perf_counter()
    tot_seconds += end - start
    logger.info("Adjustment parameters are estimated.")

    # select dense matrix
    keyword = select_features(data, features)
    logger.info("Features are selected.")

    if can_correct:
        start = time.perf_counter()
        correct_batch_effects(data, keyword, features)
        end = time.perf_counter()
        tot_seconds += end - start
        logger.info(
            "Batch correction is finished. Time spent = {:.2f}s.".format(tot_seconds)
        )
Пример #3
0
def run_scanorama(
    data: MultimodalData,
    n_components: int = 50,
    features: str = "highly_variable_features",
    standardize: bool = True,
    max_value: float = 10,
    random_state: int = 0,
) -> str:
    """Batch correction using Scanorama.

    This is a wrapper of `Scanorama <https://github.com/brianhie/scanorama>`_ package. See [Hie19]_ for details on the algorithm.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional default: ``50``.
        Number of integrated embedding components to keep. This sets Scanorama's dimred parameter.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for Scanorama.

    standardize: ``bool``, optional, default: ``True``.
        Whether to scale the data to unit variance and zero mean.

    max_value: ``float``, optional, default: ``10``.
        The threshold to truncate data after scaling. If ``None``, do not truncate.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Scanorama algorithm. out_rep is always equal to "scanorama"

    Update ``data.obsm``:
        * ``data.obsm['X_scanorama']``: The embedding calculated by Scanorama algorithm.

    Examples
    --------
    >>> pg.run_scanorama(data, random_state = 25)
    """
    if not is_categorical_dtype(data.obs['Channel']):
        data.obs['Channel'] = pd.Categorical(data.obs['Channel'])
    if data.obs['Channel'].cat.categories.size  == 1:
        logger.warning("Warning: data only contains 1 channel. Cannot apply Scanorama!")
        return 'pca'

    try:
        from scanorama import integrate
    except ImportError as e:
        print(f"ERROR: {e}")
        print("ERROR: Need Scanorama! Try 'pip install scanorama'.")
        import sys
        sys.exit(-1)

    logger.info("Start integration using Scanorama.")

    rep = 'scanorama'
    keyword = select_features(data, features=features, standardize=standardize, max_value=max_value, use_cache=False)
    X = data.uns[keyword]

    datasets = []
    for channel in data.obs['Channel'].cat.categories:
        idx = (data.obs['Channel'] == channel).values
        assert idx.sum() > 0
        datasets.append(X[idx, :])
    genes_list = [[str(i) for i in range(X.shape[1])]] * data.obs['Channel'].cat.categories.size

    integrated, genes = integrate(datasets, genes_list, dimred = n_components, seed = random_state)
    data.obsm[f'X_{rep}'] = np.concatenate(integrated, axis = 0)

    return rep