Пример #1
0
def diffmap(
    data: AnnData,
    n_components: int = 100,
    rep: str = "pca",
    solver: str = "eigsh",
    random_state: int = 0,
    max_t: float = 5000,
) -> None:
    """Calculate Diffusion Map.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``100``
        Number of diffusion components to calculate.

    rep: ``str``, optional, default: ``"pca"``
        Embedding Representation of data used for calculating the Diffusion Map. By default, use PCA coordinates.

    solver: ``str``, optional, default: ``"eigsh"``
        Solver for eigen decomposition:
            * ``"eigsh"``: default setting. Use *scipy* `eigsh <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.eigsh.html>`_ as the solver to find eigenvalus and eigenvectors using the Implicitly Restarted Lanczos Method.
            * ``"randomized"``: Use *scikit-learn* `randomized_svd <https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html>`_ as the solver to calculate a truncated randomized SVD.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    max_t: ``float``, optional, default: ``5000``
        pegasus tries to determine the best t to sum up to between ``[1, max_t]``.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm["X_diffmap"]``: Diffusion Map matrix of the data.

    Update ``data.uns``:
        * ``data.uns["diffmap_evals"]``: Eigenvalues corresponding to Diffusion Map matrix.

    Examples
    --------
    >>> pg.diffmap(adata)
    """

    rep = update_rep(rep)
    Phi_pt, Lambda, Phi = calculate_diffusion_map(
        W_from_rep(data, rep),
        n_components=n_components,
        solver=solver,
        random_state=random_state,
        max_t=max_t,
    )

    data.obsm["X_diffmap"] = Phi_pt
    data.uns["diffmap_evals"] = Lambda
    data.obsm["X_phi"] = Phi
Пример #2
0
def get_neighbors(
    data: AnnData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `AnnData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to all available threads
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(adata)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=effective_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances
Пример #3
0
def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        After running the deep regressor to predict new coordinate, Number of ForceAtlas2 iterations.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        if file_name is None:
            import tempfile

            _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(X,
                                                     K=K,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state,
                                                     full_speed=full_speed)
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_fle,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
Пример #4
0
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximated UMAP embedding using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``30``
        Number of iterations for polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X_full,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
Пример #5
0
def net_tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_learning_frac: float = 0.33,
    polish_n_iter: int = 150,
    out_basis: str = "net_tsne",
) -> None:
    """Calculate Net-tSNE embedding of cells.

    Net-tSNE is an approximated tSNE embedding using Deep Learning model to improve the calculation speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells (``n_obs``) and columns for genes (``n_feature``).

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``0.33``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``150``
        Number of iterations for polishing tSNE run.

    out_basis: ``str``, optional, default: ``"net_tsne"``
        Key name for the approximated tSNE coordinates calculated.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net tSNE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_tsne(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]
    X_tsne = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_tsne
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_tsne
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_tsne,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    polish_learning_rate = polish_learning_frac * data.shape[0]
    data.obsm["X_" + out_basis] = calc_tsne(
        X_full,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        polish_learning_rate,
        random_state,
        init=Y_init,
        n_iter=polish_n_iter,
        n_iter_early_exag=0,
    )
Пример #6
0
def fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    out_basis: str = "fle",
) -> None:
    """Construct the Force-directed (FLE) graph.

    This implementation uses forceatlas2-python_ package, which is a Python wrapper of ForceAtlas2_.

    See [Jacomy14]_ for details on FLE.

    .. _forceatlas2-python: https://github.com/klarman-cell-observatory/forceatlas2-python
    .. _ForceAtlas2: https://github.com/klarman-cell-observatory/forceatlas2

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FLE coordinates of the data.

    Examples
    --------
    >>> pg.fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )
Пример #7
0
def umap(
    data: MultimodalData,
    rep: str = "pca",
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    out_basis: str = "umap",
) -> None:
    """Calculate UMAP embedding of cells.

    This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP.

    .. _umap-learn: https://github.com/lmcinnes/umap

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data.

    Examples
    --------
    >>> pg.umap(data)
    """
    start = time.time()

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    X = X_from_rep(data, rep)
    if not knn_is_cached(data, indices_key, distances_key, n_neighbors):
        if indices_key in data.uns and n_neighbors > data.uns[
                indices_key].shape[1] + 1:
            logger.warning(
                f"Reduce K for neighbors in UMAP from {n_neighbors} to {data.uns[indices_key].shape[1] + 1}"
            )
            n_neighbors = data.uns[indices_key].shape[1] + 1
        else:
            raise ValueError("Please run neighbors first!")

    knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)
    data.obsm["X_" + out_basis] = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    end = time.time()
    logger.info("UMAP is calculated. Time spent = {:.2f}s.".format(end -
                                                                   start))
Пример #8
0
def fitsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    out_basis: str = "fitsne",
) -> None:
    """Calculate FIt-SNE embedding of cells.

    This function uses fitsne_ package. See [Linderman19]_ for details on FIt-SNE.

    .. _fitsne: https://github.com/KlugerLab/FIt-SNE

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated FI-tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fitsne"``
        Key name for calculated FI-tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FI-tSNE coordinates of the data.

    Examples
    --------
    >>> pg.fitsne(data)
    """

    rep = update_rep(rep)
    n_jobs = effective_n_jobs(n_jobs)

    data.obsm["X_" + out_basis] = calc_fitsne(
        X_from_rep(data, rep),
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    densmap: bool = False,
    dens_lambda: float = 2.0,
    dens_frac: float = 0.3,
    dens_var_shift: float = 0.1,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximated UMAP embedding using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    densmap: ``bool``, optional, default: ``False``
        Whether the density-augmented objective of densMAP should be used for optimization, which will generate an embedding where
        local densities are encouraged to be correlated with those in the original space.

    dens_lambda: ``float``, optional, default: ``2.0``
        Controls the regularization weight of the density correlation term in densMAP. Only works when *densmap* is ``True``.
        Larger values prioritize density preservation over the UMAP objective, while values closer to 0 for the opposite direction.
        Notice that setting this parameter to ``0`` is equivalent to running the original UMAP algorithm.

    dens_frac: ``float``, optional, default: ``0.3``
        Controls the fraction of epochs (between 0 and 1) where the density-augmented objective is used in densMAP. Only works when
        *densmap* is ``True``.
        The first ``(1 - dens_frac)`` fraction of epochs optimize the original UMAP objective before introducing the density
        correlation term.

    dens_var_shift: ``float``, optional, default, ``0.1``
        A small constant added to the variance of local radii in the embedding when calculating the density correlation objective to
        prevent numerical instability from dividing by a small number. Only works when *densmap* is ``True``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``30``
        Number of iterations for polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    knn_indices, knn_dists = get_neighbors(data,
                                           K=select_K,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)

    selected = select_cells(
        knn_dists,
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 n_jobs,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices, knn_dists = get_neighbors(data,
                                           K=n_neighbors,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)
    knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    key = f"X_{out_basis}"
    data.obsm[key] = calc_umap(
        X_full,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
    data.register_attr(key, "basis")
Пример #10
0
def tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = "auto",
    initialization: str = "pca",
    random_state: int = 0,
    out_basis: str = "tsne",
) -> None:
    """Calculate t-SNE embedding of cells using the FIt-SNE package.

    This function uses fitsne_ package. See [Linderman19]_ for details on FIt-SNE algorithm.

    .. _fitsne: https://github.com/KlugerLab/FIt-SNE

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated FI-tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``auto``
        By default, the learning rate is determined automatically as max(data.shape[0] / early_exaggeration, 200). See [Belkina19]_ and [Kobak19]_ for details.
    
    initialization: ``str``, optional, default: ``pca``
        Initialization can be either ``pca`` or ``random`` or np.ndarray. By default, we use ``pca`` initialization according to [Kobak19]_.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fitsne"``
        Key name for calculated FI-tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FI-tSNE coordinates of the data.

    Examples
    --------
    >>> pg.tsne(data)
    """

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    X = X_from_rep(data, rep).astype(np.float64)

    if learning_rate == "auto":
        learning_rate = max(X.shape[0] / early_exaggeration, 200.0)

    if initialization == "random":
        initialization = None
    elif initialization == "pca":
        if rep == "pca":
            initialization = X[:, 0:n_components].copy()
        else:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=n_components, random_state=random_state)
            with threadpool_limits(limits=n_jobs):
                initialization = np.ascontiguousarray(pca.fit_transform(X))
        initialization = initialization / np.std(initialization[:, 0]) * 0.0001
    else:
        assert isinstance(
            initialization,
            np.ndarray) and initialization.ndim == 2 and initialization.shape[
                0] == X.shape[0] and initialization.shape[1] == n_components
        if initialization.dtype != np.float64:
            initialization = initialization.astype(np.float64)

    data.obsm["X_" + out_basis] = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
        initialization,
    )
Пример #11
0
def diffmap(
    data: MultimodalData,
    n_components: int = 100,
    rep: str = "pca",
    solver: str = "eigsh",
    max_t: float = 5000,
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Calculate Diffusion Map.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``100``
        Number of diffusion components to calculate.

    rep: ``str``, optional, default: ``"pca"``
        Embedding Representation of data used for calculating the Diffusion Map. By default, use PCA coordinates.

    solver: ``str``, optional, default: ``"eigsh"``
        Solver for eigen decomposition:
            * ``"eigsh"``: default setting. Use *scipy* `eigsh <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.eigsh.html>`_ as the solver to find eigenvalus and eigenvectors using the Implicitly Restarted Lanczos Method.
            * ``"randomized"``: Use *scikit-learn* `randomized_svd <https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html>`_ as the solver to calculate a truncated randomized SVD.

    max_t: ``float``, optional, default: ``5000``
        pegasus tries to determine the best t to sum up to between ``[1, max_t]``.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm["X_diffmap"]``: Diffusion Map matrix of the data.

    Update ``data.uns``:
        * ``data.uns["diffmap_evals"]``: Eigenvalues corresponding to Diffusion Map matrix.

    Examples
    --------
    >>> pg.diffmap(data)
    """

    rep = update_rep(rep)
    Phi_pt, Lambda, Phi = calculate_diffusion_map(
        W_from_rep(data, rep),
        n_components=n_components,
        solver=solver,
        max_t = max_t,
        n_jobs = n_jobs,
        random_state=random_state,
    )

    data.obsm["X_diffmap"] = np.ascontiguousarray(Phi_pt, dtype=np.float32)
    data.uns["diffmap_evals"] = Lambda.astype(np.float32)
    data.obsm["X_phi"] = np.ascontiguousarray(Phi, dtype=np.float32)
    # data.uns['W_norm'] = W_norm
    # data.obsm['X_dmnorm'] = U_df

    # remove previous FLE calculations
    data.uns.pop("diffmap_knn_indices", None)
    data.uns.pop("diffmap_knn_distances", None)
    data.uns.pop("W_diffmap", None)
Пример #12
0
def neighbors(
    data: AnnData,
    K: int = 100,
    rep: "str" = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> None:
    """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.

    The kNN calculation uses hnswlib introduced by [Malkov16]_.

    Parameters
    ----------

    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    K: ``int``, optional, default: ``100``
        Number of neighbors, including the data point itself.

    rep: ``str``, optional, default: ``"pca"``
        Embedding representation used to calculate kNN. If ``None``, use ``data.X``; otherwise, keyword ``'X_' + rep`` must exist in ``data.obsm``.
    
    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.
    
    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.
    
    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. 
        * Otherwise, use only one thread to make sure results are reproducible.

    Returns
    -------
    ``None``

    Update ``data.uns``:
        * ``data.uns[rep + "_knn_indices"]``: kNN index matrix. Row i is the index list of kNN of cell i (excluding itself), sorted from nearest to farthest.
        * ``data.uns[rep + "_knn_distances"]``: kNN distance matrix. Row i is the distance list of kNN of cell i (excluding itselt), sorted from smallest to largest.
        * ``data.uns["W_" + rep]``: kNN graph of the data in terms of affinity matrix.

    Examples
    --------
    >>> pg.neighbors(adata)
    """

    # calculate kNN
    start = time.time()
    rep = update_rep(rep)
    indices, distances = get_neighbors(
        data,
        K=K,
        rep=rep,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    end = time.time()
    logger.info(
        "Nearest neighbor search is finished in {:.2f}s.".format(end - start))

    # calculate affinity matrix
    start = time.time()
    W = calculate_affinity_matrix(indices[:, 0:K - 1], distances[:, 0:K - 1])
    data.uns["W_" + rep] = W
    end = time.time()
    logger.info(
        "Affinity matrix calculation is finished in {:.2f}s".format(end -
                                                                    start))
Пример #13
0
def tsne(
    data: AnnData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    out_basis: str = "tsne",
) -> None:
    """Calculate tSNE embedding using MulticoreTSNE_ package.

    .. _MulticoreTSNE: https://github.com/DmitryUlyanov/Multicore-TSNE

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"tsne"``
        Key name for calculated tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: tSNE coordinates of the data.

    Examples
    --------
    >>> pg.tsne(adata)
    """
    start = time.time()
    rep = update_rep(rep)
    n_jobs = effective_n_jobs(n_jobs)

    data.obsm["X_" + out_basis] = calc_tsne(
        X_from_rep(data, rep),
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )

    end = time.time()
    logger.info("t-SNE is calculated. Time spent = {:.2f}s.".format(end -
                                                                    start))
Пример #14
0
def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
    use_cache: `bool`, optional (default: True)
        If use_cache and found cached knn results, will not recompute.
    dist: `str`, optional (default: 'l2')
        Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if use_cache and knn_is_cached(data, indices_key, distances_key, K):
        indices = data.obsm[indices_key]
        distances = data.obsm[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep, n_comps),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
            dist=dist,
        )
        data.obsm[indices_key] = indices
        data.register_attr(indices_key, "knn")
        data.obsm[distances_key] = distances
        data.register_attr(distances_key, "knn")

    return indices, distances
Пример #15
0
def umap(
    data: MultimodalData,
    rep: str = "pca",
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_jobs: int = -1,
    full_speed: bool = False,
    random_state: int = 0,
    out_basis: str = "umap",
) -> None:
    """Calculate UMAP embedding of cells.

    This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP.

    .. _umap-learn: https://github.com/lmcinnes/umap

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use for computing kNN graphs. If ``-1``, use all physical CPU cores.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data.

    Examples
    --------
    >>> pg.umap(data)
    """
    rep = update_rep(rep)
    X = X_from_rep(data, rep)

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    knn_indices, knn_dists = get_neighbors(data,
                                           K=n_neighbors,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)
    knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
Пример #16
0
def neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: "str" = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> None:
    """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.

    The kNN calculation uses `hnswlib <https://github.com/nmslib/hnswlib>`_ introduced by [Malkov16]_.

    Parameters
    ----------

    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    K: ``int``, optional, default: ``100``
        Number of neighbors, including the data point itself.

    rep: ``str``, optional, default: ``"pca"``
        Embedding representation used to calculate kNN. If ``None``, use ``data.X``; otherwise, keyword ``'X_' + rep`` must exist in ``data.obsm``.

    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    use_cache: ``bool``, optional, default: ``True``
        * If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute.
        * Otherwise, compute kNN irrespective of caching status.

    dist: ``str``, optional (default: ``"l2"``)
        Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm[rep + "_knn_indices"]``: kNN index matrix. Row i is the index list of kNN of cell i (excluding itself), sorted from nearest to farthest.
        * ``data.obsm[rep + "_knn_distances"]``: kNN distance matrix. Row i is the distance list of kNN of cell i (excluding itselt), sorted from smallest to largest.

    Update ``data.obsp``:
        * ``data.obsp["W_" + rep]``: kNN graph of the data in terms of affinity matrix.

    Examples
    --------
    >>> pg.neighbors(data)
    """

    # calculate kNN
    rep = update_rep(rep)
    indices, distances = get_neighbors(
        data,
        K=K,
        rep=rep,
        n_comps=n_comps,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
        use_cache=use_cache,
        dist=dist,
    )

    # calculate affinity matrix
    W = calculate_affinity_matrix(indices[:, 0:K - 1], distances[:, 0:K - 1])
    data.obsp["W_" + rep] = W
    # pop out jump method values
    data.uns.pop(f"{rep}_jump_values", None)
    data.uns.pop(f"{rep}_optimal_k", None)
def umap(
    data: MultimodalData,
    rep: str = "pca",
    rep_ncomps: int = None,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    densmap: bool = False,
    dens_lambda: float = 2.0,
    dens_frac: float = 0.3,
    dens_var_shift: float = 0.1,
    n_jobs: int = -1,
    full_speed: bool = False,
    random_state: int = 0,
    out_basis: str = "umap",
) -> None:
    """Calculate UMAP embedding of cells.

    This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP.

    .. _umap-learn: https://github.com/lmcinnes/umap

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    rep_ncomps: `int`, optional (default: None)
        Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    densmap: ``bool``, optional, default: ``False``
        Whether the density-augmented objective of densMAP should be used for optimization, which will generate an embedding where
        local densities are encouraged to be correlated with those in the original space.

    dens_lambda: ``float``, optional, default: ``2.0``
        Controls the regularization weight of the density correlation term in densMAP. Only works when *densmap* is ``True``.
        Larger values prioritize density preservation over the UMAP objective, while values closer to 0 for the opposite direction.
        Notice that setting this parameter to ``0`` is equivalent to running the original UMAP algorithm.

    dens_frac: ``float``, optional, default: ``0.3``
        Controls the fraction of epochs (between 0 and 1) where the density-augmented objective is used in densMAP. Only works when
        *densmap* is ``True``.
        The first ``(1 - dens_frac)`` fraction of epochs optimize the original UMAP objective before introducing the density
        correlation term.

    dens_var_shift: ``float``, optional, default, ``0.1``
        A small constant added to the variance of local radii in the embedding when calculating the density correlation objective to
        prevent numerical instability from dividing by a small number. Only works when *densmap* is ``True``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use for computing kNN graphs. If ``-1``, use all physical CPU cores.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data.

    Examples
    --------
    >>> pg.umap(data)
    """
    rep = update_rep(rep)
    X = X_from_rep(data, rep, rep_ncomps)

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    knn_indices, knn_dists = get_neighbors(data,
                                           K=n_neighbors,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)
    knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    key = f"X_{out_basis}"
    data.obsm[key] = calc_umap(
        X,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
    data.register_attr(key, "basis")