Example #1
def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        Annotated data matrix with rows for cells and columns for genes.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None`, use data.X.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances
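
A minimal usage sketch of the caching behavior above; `data` stands for a MultimodalData object that already carries a PCA representation ("X_pca" in data.obsm), and the import path of get_neighbors is an assumption for this sketch:

import numpy as np
from pegasus.tools import get_neighbors  # import path assumed for this sketch

indices, distances = get_neighbors(data, K=100, rep="pca")

# Results are cached in data.uns under "<rep>_knn_indices" / "<rep>_knn_distances",
# so a repeated call with the same rep and K returns the stored arrays without
# recomputation (per the knn_is_cached check above).
assert np.array_equal(indices, data.uns["pca_knn_indices"])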
Example #2
def calculate_diffusion_map(
    W: csr_matrix, n_components: int, solver: str, max_t: int, n_jobs: int, random_state: int,
) -> Tuple[np.array, np.array, np.array]:
    assert issparse(W)

    nc, labels = connected_components(W, directed=True, connection="strong")
    logger.info("Calculating connected components is done.")

    assert nc == 1

    W_norm, diag, diag_half = calculate_normalized_affinity(W.astype(np.float64)) # use double precision to guarantee reproducibility
    logger.info("Calculating normalized affinity matrix is done.")

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits = n_jobs):
        if solver == "eigsh":
            np.random.seed(random_state)
            v0 = np.random.uniform(-1.0, 1.0, W_norm.shape[0])
            Lambda, U = eigsh(W_norm, k=n_components, v0=v0)
            Lambda = Lambda[::-1]
            U = U[:, ::-1]
        else:
            assert solver == "randomized"
            U, S, VT = randomized_svd(
                W_norm, n_components=n_components, random_state=random_state
            )
            signs = np.sign((U * VT.transpose()).sum(axis=0))  # get eigenvalue signs
            Lambda = signs * S  # get eigenvalues

    # remove the first eigen value and vector
    Lambda = Lambda[1:]
    U = U[:, 1:]
    Phi = U / diag_half[:, np.newaxis]

    if max_t == -1:
        Lambda_new = Lambda / (1.0 - Lambda)
    else:
        # Find the knee point
        x = np.array(range(1, max_t + 1), dtype = float)
        y = np.array([calc_von_neumann_entropy(Lambda, t) for t in x])
        t = x[find_knee_point(x, y)]
        logger.info("Detected knee point at t = {:.0f}.".format(t))

        # U_df = U * Lambda #symmetric diffusion component
        Lambda_new = Lambda * ((1.0 - Lambda ** t) / (1.0 - Lambda))
    Phi_pt = Phi * Lambda_new  # asym pseudo component

    return Phi_pt, Lambda, Phi  # , U_df, W_norm
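
The multiscale weighting above is a partial geometric sum of the eigenvalues: Lambda * (1 - Lambda**t) / (1 - Lambda) equals the sum of Lambda**s for s = 1..t, and the max_t == -1 branch, Lambda / (1 - Lambda), is its limit as t goes to infinity. A small self-contained check of that identity:

import numpy as np

Lambda = np.array([0.95, 0.8, 0.5])  # illustrative eigenvalues in (0, 1)
t = 5

closed_form = Lambda * ((1.0 - Lambda ** t) / (1.0 - Lambda))  # as in the code above
explicit_sum = sum(Lambda ** s for s in range(1, t + 1))       # sum_{s=1}^{t} Lambda^s
assert np.allclose(closed_form, explicit_sum)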
Example #3
def partition_cells_by_kmeans(
    X: np.ndarray,
    n_clusters: int,
    n_clusters2: int,
    n_init: int,
    n_jobs: int,
    random_state: int,
    min_avg_cells_per_final_cluster: Optional[int] = 10,
) -> List[int]:

    n_clusters = min(n_clusters,
                     max(X.shape[0] // min_avg_cells_per_final_cluster, 1))
    if n_clusters == 1:
        return np.zeros(X.shape[0], dtype=np.int32)

    n_jobs = eff_n_jobs(n_jobs)

    kmeans_params = {
        'n_clusters': n_clusters,
        'n_init': n_init,
        'random_state': random_state,
    }
    km = KMeans(**kmeans_params)

    with threadpool_limits(limits=n_jobs):
        km.fit(X)
        coarse = km.labels_.copy()

        km.set_params(n_init=1)
        labels = coarse.copy()
        base_sum = 0
        for i in range(n_clusters):
            idx = coarse == i
            nc = min(n_clusters2,
                     max(idx.sum() // min_avg_cells_per_final_cluster, 1))
            if nc == 1:
                labels[idx] = base_sum
            else:
                km.set_params(n_clusters=nc)
                km.fit(X[idx, :])
                labels[idx] = base_sum + km.labels_
            base_sum += nc

    return labels
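
A simplified, self-contained sketch of the same two-level partition on synthetic data, leaving out the thread limiting and the minimum-cells-per-cluster adjustment performed above:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))

coarse = KMeans(n_clusters=5, n_init=10, random_state=0).fit_predict(X)  # first level

labels = np.zeros(X.shape[0], dtype=np.int32)
base = 0
for i in range(5):
    idx = coarse == i
    sub = KMeans(n_clusters=3, n_init=1, random_state=0).fit_predict(X[idx])  # second level
    labels[idx] = base + sub  # offset so the final labels are globally unique
    base += 3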
def select_hvf_seurat_multi(
    X: Union[csr_matrix, np.ndarray],
    batches: List[str],
    cell2batch: List[str],
    n_top: int,
    n_jobs: int,
    min_disp: float,
    max_disp: float,
    min_mean: float,
    max_mean: float,
) -> List[int]:
    Xs = []
    for batch in batches:
        Xs.append(X[np.isin(cell2batch, batch)])

    n_jobs = eff_n_jobs(n_jobs)
    with parallel_backend("loky", inner_max_num_threads=1):
        res_arr = np.array(
            Parallel(n_jobs=n_jobs)(delayed(select_hvf_seurat_single)(
                Xs[i], n_top, min_disp, max_disp, min_mean, max_mean)
                                    for i in range(batches.size)))

    selected = res_arr >= 0
    shared = selected.sum(axis=0)
    cands = (shared > 0).nonzero()[0]
    import numpy.ma as ma

    median_rank = ma.median(ma.masked_array(res_arr, mask=~selected),
                            axis=0).data
    cands = sorted(cands, key=lambda x: median_rank[x])
    cands = sorted(cands, key=lambda x: shared[x], reverse=True)

    hvf_rank = np.full(X.shape[1], -1, dtype=int)
    hvf_rank[cands[:n_top]] = range(n_top)

    return hvf_rank
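
The two stable sorts above order candidate features first by how many batches selected them (descending) and break ties by the masked median of their per-batch ranks. A toy illustration of that ordering with made-up ranks:

import numpy as np
import numpy.ma as ma

# Per-batch ranks (rows = batches, columns = features); -1 means "not selected in this batch".
res_arr = np.array([[0,  2, -1,  1],
                    [1, -1,  0,  2]])
selected = res_arr >= 0
shared = selected.sum(axis=0)  # [2, 1, 1, 2]
median_rank = ma.median(ma.masked_array(res_arr, mask=~selected), axis=0).data

cands = list(range(res_arr.shape[1]))
cands = sorted(cands, key=lambda x: median_rank[x])
cands = sorted(cands, key=lambda x: shared[x], reverse=True)
# cands == [0, 3, 2, 1]: features picked by both batches come first, ties broken by median rank.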
def _run_scrublet(
    data: Union[MultimodalData, UnimodalData],
    name: Optional[str] = '',
    expected_doublet_rate: Optional[float] = None,
    sim_doublet_ratio: Optional[float] = 2.0,
    n_prin_comps: Optional[int] = 30,
    k: Optional[int] = None,
    n_jobs: Optional[int] = -1,
    random_state: Optional[int] = 0,
    plot_hist: Optional[bool] = True
) -> Union[None, Figure]:
    """Calculate doublet scores using Scrublet-like [Wolock18]_ strategy for the current data.X; determine a right threshold based on the KDE curve.
       This function should be called after highly_variable_gene selection.

    Parameters
    -----------
    data: ``Union[MultimodalData, UnimodalData]`` object.
        Annotated data matrix with rows for cells and columns for genes. Data must be filtered for low-quality cells and genes and log-transformed; 'raw.X' is assumed to store the raw count matrix.

    name: ``str``, optional, default: ``''``
        Name of the sample.

    expected_doublet_rate: ``float``, optional, default: ``None``
        The expected doublet rate for the experiment. By default, calculate the expected rate based on the number of cells, using the 10x multiplet rate table.

    sim_doublet_ratio: ``float``, optional, default: ``2.0``
        The ratio between synthetic doublets and observed cells.

    n_prin_comps: ``int``, optional, default: ``30``
        Number of principal components.

    k: ``int``, optional, default: ``None``
        Number of observed cell neighbors. If None, k = round(0.5 * sqrt(number of observed cells)). Total neighbors k_adj = round(k * (1.0 + sim_doublet_ratio)).

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random state for doublet simulation, PCA and approximate nearest neighbor search.

    plot_hist: ``bool``, optional, default: ``True``
        If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.

    Returns
    --------
    ``None`` or a ``matplotlib Figure`` object if ``plot_hist == True``.

    Update ``data.obs``:
        * ``data.obs['doublet_score']``: The calculated doublet scores on cells.
        * ``data.obs['pred_dbl']``: Predicted doublets as True.

    Update ``data.uns``:
        * ``data.uns['doublet_threshold']``: Inferred doublet threshold; any score > threshold is identified as a neotypic doublet.

    Examples
    --------
    >>> pg.run_scrublet(data)
    """
    from pegasus.tools import calculate_nearest_neighbors, simulate_doublets
    from sklearn.decomposition import PCA
    from scipy.stats import gaussian_kde

    if "highly_variable_features" not in data.var:
        raise ValueError("_run_scrublet must be run after highly_variable_features is called!")

    r = sim_doublet_ratio
    if expected_doublet_rate is None:
        expected_doublet_rate = _calc_expected_doublet_rate(data.shape[0])
    rho = expected_doublet_rate

    # subset the raw count matrix
    rawX = data.get_matrix("raw.X")
    obs_umis = rawX.sum(axis = 1, dtype = np.int32).A1
    rawX = rawX[:, data.var["highly_variable_features"].values]
    # Simulate synthetic doublets
    sim_rawX, pair_idx = simulate_doublets(rawX, r, random_state)
    sim_umis = obs_umis[pair_idx].sum(axis = 1, dtype = np.int32)

    # standardize and calculate PCA for rawX
    obsX = rawX.astype(np.float32).toarray()
    obsX /= obs_umis.reshape(-1, 1) # normalize each cell

    m1 = obsX.mean(axis = 0) # calculate mean and std
    psum = np.multiply(obsX, obsX).sum(axis=0)
    std = ((psum - obsX.shape[0] * (m1 ** 2)) / (obsX.shape[0] - 1.0)) ** 0.5
    std[std == 0] = 1

    obsX -= m1 # standardize
    obsX /= std

    pca = PCA(n_components=n_prin_comps, random_state=random_state)
    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits = n_jobs):
        obs_pca = pca.fit_transform(obsX.astype(np.float64)) # float64 for reproducibility
        obs_pca = np.ascontiguousarray(obs_pca, dtype=np.float32)

    # standardize and calculate PCA for sim_rawX
    simX = sim_rawX.astype(np.float32).toarray()
    simX /= sim_umis.reshape(-1, 1) # normalize each cell

    simX -= m1 # standardize
    simX /= std

    sim_pca = pca.transform(simX) # transform to PC coordinates
    sim_pca = np.ascontiguousarray(sim_pca, dtype=np.float32)

    # concatenate observed and simulated data
    pc_coords = np.vstack((obs_pca, sim_pca))
    is_doublet = np.repeat(np.array([0, 1], dtype = np.int32), [obsX.shape[0], simX.shape[0]])

    # Calculate k nearest neighbors
    if k is None:
        k = int(round(0.5 * np.sqrt(obsX.shape[0])))
    k_adj = int(round(k * (1.0 + r)))
    indices, _ = calculate_nearest_neighbors(pc_coords, K = k_adj + 1, n_jobs = n_jobs)

    # Calculate scrublet-like doublet score
    k_d = is_doublet[indices].sum(axis = 1)
    q = (k_d + 1.0) / (k_adj + 2.0) # Equation 5
    doublet_scores = (q * rho / r) / ((1.0 - rho) - q * (1.0 - rho - rho / r)) # Equation 4
    obs_scores = doublet_scores[0:obsX.shape[0]]
    sim_scores = doublet_scores[obsX.shape[0]:]

    # Determine a scrublet score threshold
    # log transformed
    sim_scores_log = np.log(sim_scores)

    # Estimate KDE
    min_score = sim_scores_log.min()
    max_score = sim_scores_log.max()
    min_gap = np.diff(np.unique(np.sort(sim_scores_log))).min()
    from math import ceil
    n_gap = max(int(ceil((max_score - min_score) / min_gap)), 200) # minimum is 200
    gap = (max_score - min_score) / n_gap

    n_ext = 5
    min_score -= gap * n_ext
    max_score += gap * n_ext
    x = np.linspace(min_score, max_score, n_gap + 1 + n_ext * 2) # generate x coordinates
    kde = gaussian_kde(sim_scores_log)
    y = kde(x)

    # Find local maxima
    maxima, maxima_by_x, filtered_maxima = _find_local_maxima(y)
    assert maxima.size > 0
    curv = _calc_vec_f(_curvature, x.size, y, gap) # calculate curvature

    if maxima.size >= 2:
        pos = _locate_cutoff_among_peaks(y, maxima)
    else:
        frac_right_thre = 0.41
        frac_left_thre = 0.39

        pos = -1
        for i in range(maxima_by_x.size):
            frac_right = (sim_scores_log > x[maxima_by_x[i]]).sum() / sim_scores.size
            if frac_right < frac_right_thre: # peak might represent a doublet peak, try to find a cutoff at the left side
                if i == 0:
                    peak_curv_value = _find_curv_minima_at_peak(curv, maxima_by_x[i])
                    end = _find_pos_curv(curv, maxima_by_x[i]-1, '-')
                    start = _find_pos_curv(curv, _find_curv_local_minima(curv, peak_curv_value, filtered_maxima, end-1, '-')+1, '+')
                    assert start <= end
                    pos = curv[start:end+1].argmax() + start
                else:
                    pos = y[maxima_by_x[i-1]+1:maxima_by_x[i]].argmin() + (maxima_by_x[i-1]+1)

                frac_left = (sim_scores_log < x[pos]).sum() / sim_scores.size    
                if frac_left < frac_left_thre:
                    pos = maxima_by_x[i]

                break

        if pos < 0:
            # peak represents embedded doublets, find a cutoff at the right side
            peak_curv_value = _find_curv_minima_at_peak(curv, maxima_by_x[-1])
            start = _find_pos_curv(curv, maxima_by_x[-1]+1, '+')
            end = _find_pos_curv(curv, _find_curv_local_minima(curv, peak_curv_value, filtered_maxima, start+1, '+')-1, '-')
            assert start <= end
            pos = curv[start:end+1].argmax() + start

    threshold = np.exp(x[pos])

    data.obs["doublet_score"] = obs_scores.astype(np.float32)
    data.obs["pred_dbl"] = obs_scores > threshold
    data.uns["doublet_threshold"] = float(threshold)

    logger.info(f"Sample {name}: doublet threshold = {threshold:.4f}; total cells = {data.shape[0]}; neotypic doublet rate = {data.obs['pred_dbl'].sum() / data.shape[0]:.2%}")
    fig = None
    if plot_hist:
        fig = _plot_hist(obs_scores, sim_scores, threshold, x, y, curv)
    return fig
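
For intuition, a tiny numeric sketch of Equations 4 and 5 above; all numbers are made up for illustration and are not outputs of this pipeline:

k_adj = 60   # total neighbors queried
k_d = 30     # of those, neighbors that are simulated doublets
rho = 0.06   # expected doublet rate
r = 2.0      # simulated-to-observed ratio

q = (k_d + 1.0) / (k_adj + 2.0)                                    # Equation 5
score = (q * rho / r) / ((1.0 - rho) - q * (1.0 - rho - rho / r))  # Equation 4
print(f"{score:.3f}")  # ~0.031 for these values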
Example #6
def calculate_nearest_neighbors(
    X: np.array,
    K: int = 100,
    n_jobs: int = -1,
    method: str = "hnsw",
    M: int = 20,
    efC: int = 200,
    efS: int = 200,
    random_state: int = 0,
    full_speed: bool = False,
    dist: str = 'l2',
):
    """Calculate nearest neighbors
    X is the sample by feature matrix
    Return K -1 neighbors, the first one is the point itself and thus omitted.
    TODO: Documentation
    """

    nsample = X.shape[0]

    if nsample <= 1000:
        method = "sklearn"

    if nsample < K:
        logger.warning(
            f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}."
        )
        K = nsample

    n_jobs = eff_n_jobs(n_jobs)

    if method == "hnsw":
        try:
            import hnswlib
        except ImportError:
            raise ImportError(
                "Need hnswlib! Try 'pip install hnswlib' or 'conda install -c conda-forge hnswlib'."
            )

        assert not issparse(X)
        # Build hnsw index
        knn_index = hnswlib.Index(space=dist, dim=X.shape[1])
        knn_index.init_index(max_elements=nsample,
                             ef_construction=efC,
                             M=M,
                             random_seed=random_state)
        knn_index.set_num_threads(n_jobs if full_speed else 1)
        knn_index.add_items(X)

        # KNN query
        knn_index.set_ef(efS)
        knn_index.set_num_threads(n_jobs)
        indices, distances = knn_index.knn_query(X, k=K)
        # eliminate the first neighbor, which is the node itself
        _reorg_knn(indices, distances)
        indices = indices[:, 1:]
        indices.dtype = np.int64
        distances = distances[:, 1:]
        distances = np.sqrt(distances, out=distances)
    else:
        assert method == "sklearn"
        knn = NearestNeighbors(
            n_neighbors=K - 1, n_jobs=n_jobs
        )  # eliminate the first neighbor, which is the node itself
        knn.fit(X)
        distances, indices = knn.kneighbors()

    return indices, distances
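
For reference, a self-contained sketch of the hnswlib build-and-query pattern used in the "hnsw" branch above, on random data and with the same default parameters; this is an illustration, not part of the pegasus API:

import numpy as np
import hnswlib

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 16)).astype(np.float32)
K = 10

index = hnswlib.Index(space="l2", dim=X.shape[1])
index.init_index(max_elements=X.shape[0], ef_construction=200, M=20, random_seed=0)
index.set_num_threads(1)  # single thread keeps index construction reproducible
index.add_items(X)
index.set_ef(200)

indices, distances = index.knn_query(X, k=K)
# The "l2" space returns squared distances; like the code above, drop the self
# neighbor in the first column and take the square root.
indices, distances = indices[:, 1:], np.sqrt(distances[:, 1:])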
Example #7
def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        Number of ForceAtlas2 iterations to run after the deep regressor predicts the new coordinates.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    knn_indices, knn_dists = get_neighbors(data,
                                           K=select_K,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)

    selected = select_cells(
        knn_dists,
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(X,
                                                     K=K,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state,
                                                     full_speed=full_speed)
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_fle,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 n_jobs,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
Example #8
def fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    out_basis: str = "fle",
) -> None:
    """Construct the Force-directed (FLE) graph.

    This implementation uses forceatlas2-python_ package, which is a Python wrapper of ForceAtlas2_.

    See [Jacomy14]_ for details on FLE.

    .. _forceatlas2-python: https://github.com/klarman-cell-observatory/forceatlas2-python
    .. _ForceAtlas2: https://github.com/klarman-cell-observatory/forceatlas2

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FLE coordinates of the data.

    Examples
    --------
    >>> pg.fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )
def tsvd(
    data: MultimodalData,
    n_components: int = 51,
    features: str = "robust",
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Truncated Singular Value Decomposition (TSVD) to the data.

    The calculation uses *scikit-learn* implementation.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``51``.
        Number of components to compute. Because we do not remove means, the first component might be related to the mean values and thus we ask for one more component than PCA as default (i.e. 51).

    features: ``str``, optional, default: ``"robust"``.
        Keyword in ``data.var`` to specify features used for TSVD.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:

        * ``data.obsm["X_tsvd"]``: TSVD coordinates of the data.

    Update ``data.uns``:

        * ``data.uns["TSVDs"]``: The TSVD components containing the loadings.

    Examples
    --------
    >>> pg.tsvd(data)
    """
    X = data.X
    if (features is not None) and (data.var[features].sum() < data.shape[1]):
        X = X[:, data.var[features].values]
    X = X.astype(np.float64)  # for reproducible purpose

    tsvd = TruncatedSVD(n_components=n_components, random_state=random_state)

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        X_tsvd = tsvd.fit_transform(X)

    data.obsm["X_tsvd"] = np.ascontiguousarray(X_tsvd, dtype=np.float32)
    data.uns["TSVDs"] = np.ascontiguousarray(
        tsvd.components_.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["tsvd"] = {}
    data.uns["tsvd"]["variance"] = tsvd.explained_variance_
    data.uns["tsvd"]["variance_ratio"] = tsvd.explained_variance_ratio_
    data.uns["tsvd_features"] = features  # record which feature to use
Example #10
def predict_scarches_scanvi(
    data: Union[MultimodalData, UnimodalData],
    dir_path: str,
    label: str,
    predictions: str = "predictions",
    matkey: str = "counts",
    n_jobs: int = -1,
    random_state: int = 0,
    max_epochs: Union[int, None] = None,
    batch: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    use_gpu: Union[str, int, bool, None] = None,
) -> None:
    """Run scArches training.

    This is a wrapper of `scvitools <https://github.com/scverse/scvi-tools>`_ package.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.
    dir_path: ``str``.
        Save the model to this directory.
    label: ``str``.
        The obs key representing labels.
    predictions: ``str``, optional, default: ``"predictions"``
        The obs key to store predicted labels.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count matrix.
    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use. ``-1`` refers to using all physical CPU cores.
    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator
    max_epochs: ``int | None``, optional, default: ``None``.
        Maximum number of training epochs. Defaults to np.min([round((20000 / n_cells) * 100), 100])
    batch: ``str``, optional, default: ``None``.
        If only one categorical covariate, the obs key representing batches that should be corrected for, default is ``None``.
    categorical_covariate_keys: ``List[str]``
        If multiple categorical covariates, a list of obs keys listing categorical covariates that should be corrected for, default is ``None``.
    continuous_covariate_keys: ``List[str]``
        A list of obs keys listing continuous covariates that should be corrected for, default is ``None``.
    use_gpu: ``str | int | bool | None``
        Use default GPU if available (if None or True), or index of GPU to use (if int), or name of GPU (if str, e.g., ``cuda:0``), or use CPU (if False).

    Returns
    -------
    Update ``data.obsm``:
        * ``data.obsm['X_scANVI']``: The embedding calculated by scANVI.

    Update ``data.obs``:
        * ``data.obs[predictions]``: The labels predicted by scANVI.

    Examples
    --------
    >>> pg.predict_scarches_scanvi(data, dir_path="scanvi_model/", label="celltype", matkey="counts", batch="tech")
    """
    try:
        import scvi
    except ImportError as e:
        import sys
        logger.error(f"{e}\nscvi-tools needed! Try 'pip install scvi-tools'.")
        sys.exit(-1)

    logger.info("Start prediction with scArches method.")

    obs_columns = [label]
    if batch is not None and batch:
        obs_columns.append(batch)
    if categorical_covariate_keys is not None and categorical_covariate_keys:
        obs_columns.extend(categorical_covariate_keys)
    if continuous_covariate_keys is not None and continuous_covariate_keys:
        obs_columns.extend(continuous_covariate_keys)

    features = scvi.model.SCANVI.prepare_query_anndata(
        None, dir_path, return_reference_var_names=True)
    adata = _gen_query_anndata(data, features, obs_columns,
                               matkey)  # gen AnnData

    scvi.settings.num_threads = eff_n_jobs(n_jobs)  # set n_jobs
    scvi.settings.seed = random_state  # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.

    if max_epochs is None:
        max_epochs = np.min([round((20000 / len(adata.obs)) * 100), 100])

    vae_q = scvi.model.SCANVI.load_query_data(adata, dir_path)
    vae_q.train(
        max_epochs=max_epochs,
        plan_kwargs=dict(weight_decay=0.0),
        check_val_every_n_epoch=10,
    )

    data.obsm["X_scANVI"] = vae_q.get_latent_representation()
    data.register_attr("X_scANVI", "embedding")
    data.obs[predictions] = vae_q.predict()
Example #11
def train_scarches_scanvi(
    data: Union[MultimodalData, UnimodalData],
    dir_path: str,
    label: str,
    unlabeled_category: str = "Unknown",
    features: str = "highly_variable_features",
    matkey: str = "counts",
    n_jobs: int = -1,
    random_state: int = 0,
    max_epochs: Union[int, None] = None,
    batch: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    semisupervised_max_epochs: Union[int, None] = None,
    n_samples_per_label: Optional[int] = None,
    use_gpu: Union[str, int, bool, None] = None,
    arches_params: dict = dict(
        use_layer_norm="both",
        use_batch_norm="none",
        encode_covariates=True,
        dropout_rate=0.2,
        n_layers=2,
    ),
) -> None:
    """Run scArches training.

    This is a wrapper of `scvitools <https://github.com/scverse/scvi-tools>`_ package.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.
    dir_path: ``str``.
        Save the model to this directory.
    label: ``str``.
        The obs key representing labels.
    unlabeled_category: ``str``, default: ``"Unknown"``
        Value used for unlabeled cells in ``label``.
    features: ``str``, optional, default: ``"highly_variable_features"``
        Keyword in ``data.var``, which refers to a boolean array. If ``None``, all features will be selected.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count matrix.
    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use. ``-1`` refers to using all physical CPU cores.
    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator
    max_epochs: ``int | None``, optional, default: ``None``.
        Maximum number of unsupervised training epochs. Defaults to np.min([round((20000 / n_cells) * 400), 400])
    batch: ``str``, optional, default: ``None``.
        If only one categorical covariate, the obs key representing batches that should be corrected for, default is ``None``.
    categorical_covariate_keys: ``List[str]``
        If multiple categorical covariates, a list of obs keys listing categorical covariates that should be corrected for, default is ``None``.
    continuous_covariate_keys: ``List[str]``
        A list of obs keys listing continuous covariates that should be corrected for, default is ``None``.
    semisupervised_max_epochs: ``int | None``, optional, default: ``None``.
        Maximum number of semisupervised training epochs. Defaults to np.min([round(np.sqrt(``max_epochs``)), 20])
    n_samples_per_label : ``int``, optional, default: ``None``.
        Number of subsamples for each label class to sample per epoch. By default, there is no label subsampling.
    use_gpu: ``str | int | bool | None``
        Use default GPU if available (if None or True), or index of GPU to use (if int), or name of GPU (if str, e.g., ``cuda:0``), or use CPU (if False).
    arches_params: ``dict``.
        Hyperparameters for VAE. See https://docs.scvi-tools.org/en/stable/api/reference/scvi.module.VAE.html#scvi.module.VAE for more details

    Returns
    -------
    Update ``data.obsm``:
        * ``data.obsm['X_scVI']``: The embedding calculated by scVI.
        * ``data.obsm['X_scANVI']``: The embedding calculated by scANVI.

    Examples
    --------
    >>> pg.train_scarches_scanvi(data, dir_path="scanvi_model/", label="celltype", matkey="counts", batch="tech", n_samples_per_label=100)
    """
    try:
        import scvi
    except ImportError as e:
        import sys
        logger.error(f"{e}\nscvi-tools needed! Try 'pip install scvi-tools'.")
        sys.exit(-1)

    logger.info("Start training with scArches method.")

    obs_columns = [label]
    if batch is not None and batch:
        obs_columns.append(batch)
    if categorical_covariate_keys is not None and categorical_covariate_keys:
        obs_columns.extend(categorical_covariate_keys)
    if continuous_covariate_keys is not None and continuous_covariate_keys:
        obs_columns.extend(continuous_covariate_keys)

    adata = _gen_anndata(data, features, obs_columns, matkey)  # gen AnnData

    scvi.settings.num_threads = eff_n_jobs(n_jobs)  # set n_jobs
    scvi.settings.seed = random_state  # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.

    # unsupervised
    if max_epochs is None:
        max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400])

    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key=batch,
        categorical_covariate_keys=categorical_covariate_keys,
        continuous_covariate_keys=continuous_covariate_keys,
    )  # register anndata
    vae_ref = scvi.model.SCVI(adata, **arches_params)  # obtain scvi model
    vae_ref.train(max_epochs=max_epochs, use_gpu=use_gpu)  # train model

    data.obsm["X_scVI"] = vae_ref.get_latent_representation()  # Get embedding
    data.register_attr("X_scVI",
                       "embedding")  # Register X_scVI as an embedding

    # semisupervised
    if semisupervised_max_epochs is None:
        semisupervised_max_epochs = np.min([round(np.sqrt(max_epochs)), 20])

    vae_ref_scan = scvi.model.SCANVI.from_scvi_model(
        vae_ref,
        unlabeled_category=unlabeled_category,
        labels_key=label,
    )
    vae_ref_scan.train(
        max_epochs=semisupervised_max_epochs,
        n_samples_per_label=n_samples_per_label,
        use_gpu=use_gpu,
    )
    vae_ref_scan.save(dir_path, overwrite=True)

    data.obsm["X_scANVI"] = vae_ref_scan.get_latent_representation()
    data.register_attr("X_scANVI", "embedding")
Example #12
def run_scvi(
    data: Union[MultimodalData, UnimodalData],
    features: str = "highly_variable_features",
    matkey: str = "counts",
    n_jobs: int = -1,
    random_state: int = 0,
    max_epochs: Union[int, None] = None,
    batch: Optional[str] = None,
    categorical_covariate_keys: Optional[List[str]] = None,
    continuous_covariate_keys: Optional[List[str]] = None,
    use_gpu: Union[str, int, bool, None] = None,
) -> str:
    """Run scVI embedding.

    This is a wrapper of `scvitools <https://github.com/scverse/scvi-tools>`_ package.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.
    features: ``str``, optional, default: ``"highly_variable_features"``
        Keyword in ``data.var``, which refers to a boolean array. If ``None``, all features will be selected.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count matrix.
    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use. ``-1`` refers to using all physical CPU cores.
    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator
    max_epochs: ``int | None``, optional, default: ``None``.
        Maximum number of training epochs. Defaults to np.min([round((20000 / n_cells) * 400), 400])
    batch: ``str``, optional, default: ``None``.
        If only one categorical covariate, the obs key representing batches that should be corrected for, default is ``None``.
    categorical_covariate_keys: ``List[str]``
        If multiple categorical covariates, a list of obs keys listing categorical covariates that should be corrected for, default is ``None``.
    continuous_covariate_keys: ``List[str]``
        A list of obs keys listing continuous covariates that should be corrected for, default is ``None``.
    use_gpu: ``str | int | bool | None``
        Use default GPU if available (if None or True), or index of GPU to use (if int), or name of GPU (if str, e.g., ``cuda:0``), or use CPU (if False).

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by scVI. out_rep is always equal to "scVI".

    Update ``data.obsm``:
        * ``data.obsm['X_scVI']``: The embedding calculated by scVI.

    Examples
    --------
    >>> pg.run_scvi(data, batch="Channel")
    >>> pg.run_scvi(data, categorical_covariate_keys=["cell_source", "donor"], continuous_covariate_keys=["percent_mito", "percent_ribo"])
    """
    try:
        import scvi
    except ImportError as e:
        import sys
        logger.error(f"{e}\nscvi-tools needed! Try 'pip install scvi-tools'.")
        sys.exit(-1)

    logger.info("Start embedding using scVI.")

    obs_columns = []
    if batch is not None and batch:
        obs_columns.append(batch)
    if categorical_covariate_keys is not None and categorical_covariate_keys:
        obs_columns.extend(categorical_covariate_keys)
    if continuous_covariate_keys is not None and continuous_covariate_keys:
        obs_columns.extend(continuous_covariate_keys)

    adata = _gen_anndata(data, features, obs_columns, matkey)  # gen AnnData

    scvi.settings.num_threads = eff_n_jobs(n_jobs)  # set n_jobs
    scvi.settings.seed = random_state  # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.

    if max_epochs is None:
        max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400])

    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key=batch,
        categorical_covariate_keys=categorical_covariate_keys,
        continuous_covariate_keys=continuous_covariate_keys,
    )  # register anndata

    model = scvi.model.SCVI(adata)  # obtain scvi model
    model.train(max_epochs=max_epochs, use_gpu=use_gpu)  # train model
    data.obsm["X_scVI"] = model.get_latent_representation()  # Get embedding
    data.register_attr("X_scVI",
                       "embedding")  # Register X_scVI as an embedding

    return "scVI"
Example #13
def nmf(
    data: MultimodalData,
    n_components: int = 20,
    init: str = "nndsvdar",
    solver: str = "cd",
    max_iter: int = 200,
    features: str = "highly_variable_features",
    scale: bool = True,
    max_value: float = 10.0,
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Principle Component Analysis (PCA) to the data.

    The calculation uses *scikit-learn* implementation.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``20``.
        Number of NMF components to compute.

    init: ``str``, optional, default: ``nndsvdar``.
        Method to initialize NMF for sklearn. Options are 'random', 'nndsvd', 'nndsvda' and 'nndsvdar'.

    solver: ``str``, optional, default: ``cd``.
        NMF solver. Options are 'cd' and 'mu'.

    max_iter: ``int``, optional, default: ``200``.
        Maximum number of iterations for NMF.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for NMF.

    scale: ``bool``, optional, default: ``True``.
        Whether to scale the data to have unit variance.

    max_value: ``float``, optional, default: ``10``.
        The threshold to truncate data after scaling. If ``None``, do not truncate.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:

        * ``data.obsm["X_nmf"]``: NMF coordinate matrix (W) of the data.

    Update ``data.uns``:

        * ``data.uns["H"]``: The feature factor matrix. 

        * ``data.uns["nmf_features"]``: Record the features used to perform NMF analysis.

    Examples
    --------
    >>> pg.nmf(data)
    """
    keyword = select_features(data,
                              features=features,
                              standardize=scale,
                              max_value=max_value)

    X = (data.uns[keyword] +
         data.uns['stdzn_mean'] / data.uns['stdzn_std']).astype(np.float64)
    X[X < 0] = 0.0

    nmf = NMF(n_components=n_components,
              init=init,
              solver=solver,
              max_iter=max_iter,
              random_state=random_state)

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        X_nmf = nmf.fit_transform(X)

    data.obsm["X_nmf"] = np.ascontiguousarray(X_nmf, dtype=np.float32)
    data.uns["H"] = np.ascontiguousarray(
        nmf.components_.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["nmf_features"] = features  # record which feature to use
Example #14
def calculate_nearest_neighbors(
    X: np.array,
    K: int = 100,
    n_jobs: int = -1,
    method: str = "hnsw",
    M: int = 20,
    efC: int = 200,
    efS: int = 200,
    random_state: int = 0,
    full_speed: bool = False,
):
    """Calculate nearest neighbors
    X is the sample by feature matrix
    Return K -1 neighbors, the first one is the point itself and thus omitted.
    TODO: Documentation
    """

    nsample = X.shape[0]

    if nsample <= 1000:
        method = "sklearn"

    if nsample < K:
        logger.warning(
            f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}."
        )
        K = nsample

    n_jobs = eff_n_jobs(n_jobs)

    if method == "hnsw":
        import hnswlib

        assert not issparse(X)
        # Build hnsw index
        knn_index = hnswlib.Index(space="l2", dim=X.shape[1])
        knn_index.init_index(max_elements=nsample,
                             ef_construction=efC,
                             M=M,
                             random_seed=random_state)
        knn_index.set_num_threads(n_jobs if full_speed else 1)
        knn_index.add_items(X)

        # KNN query
        knn_index.set_ef(efS)
        knn_index.set_num_threads(n_jobs)
        indices, distances = knn_index.knn_query(X, k=K)
        # eliminate the first neighbor, which is the node itself
        for i in range(nsample):
            if indices[i, 0] != i:
                indices[i, 1:] = indices[i, 0:-1]
                distances[i, 1:] = distances[i, 0:-1]
        indices = indices[:, 1:].astype(int)
        distances = np.sqrt(distances[:, 1:])
    else:
        assert method == "sklearn"
        knn = NearestNeighbors(
            n_neighbors=K - 1, n_jobs=n_jobs
        )  # eliminate the first neighbor, which is the node itself
        knn.fit(X)
        distances, indices = knn.kneighbors()

    return indices, distances
Example #15
def find_markers(
    data: AnnData,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> Dict[str, Dict[str, List[str]]]:
    """Find markers using gradient boosting method.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    label_attr: ``str``
        Cluster labels used for finding markers. Must exist in ``data.obs``.

    de_key: ``str``, optional, default: ``"de_res"``
        Keyword of DE analysis result stored in ``data.varm``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. -1 refers to using all physical CPU cores.

    min_gain: ``float``, optional, default: ``1.0``
        Only report genes with a feature importance score (in gain) of at least ``min_gain``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    remove_ribo: ``bool``, optional, default: ``False``
        If ``True``, remove ribosomal genes with either RPL or RPS as prefixes.

    Returns
    -------
    markers: ``Dict[str, Dict[str, List[str]]]``
        A Python dictionary containing marker information in the structure ``dict[cluster_id]['down'/'weak'/'strong']`` -> list of gene symbols, with matching ``'*_gain'`` keys storing the corresponding feature importance scores (formatted as strings).

    Examples
    --------
    >>> marker_dict = pg.find_markers(adata, label_attr = 'leiden_labels')
    """

    n_jobs = eff_n_jobs(n_jobs)

    if remove_ribo:
        data = data[:,
                    np.vectorize(lambda x: not x.startswith("RPL") and not x.
                                 startswith("RPS"))(data.var_names), ]

    X_train, X_test, y_train, y_test = train_test_split(
        data.X,
        data.obs[label_attr],
        test_size=0.1,
        random_state=random_state,
        stratify=data.obs[label_attr],
    )

    # start = time.time()
    # xgb = XGBClassifier(n_jobs = n_jobs, n_gpus = 0)
    # xgb.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric = 'merror')
    # # print(xgb.evals_result())
    # end = time.time()
    # print("XGBoost used {:.2f}s to train.".format(end - start))

    # from xgboost import XGBClassifier
    try:
        from lightgbm import LGBMClassifier
    except ImportError:
        raise ImportError("Need lightgbm! Try 'pip install lightgbm'.")
    start_lgb = time.time()
    lgb = LGBMClassifier(n_jobs=n_jobs,
                         metric="multi_error",
                         importance_type="gain")
    lgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=1,
    )
    end_lgb = time.time()
    logger.info("LightGBM used {:.2f}s to train.".format(end_lgb - start_lgb))

    ntot = (lgb.feature_importances_ >= min_gain).sum()
    ords = np.argsort(lgb.feature_importances_)[::-1][:ntot]

    log_exprs = [
        x for x in data.varm[de_key].dtype.names
        if x.startswith("mean_logExpr:")
    ]
    labels = [x.rpartition(":")[2] for x in log_exprs]

    titles = [("down", "down_gain"), ("weak", "weak_gain"),
              ("strong", "strong_gain")]
    markers = defaultdict(lambda: defaultdict(list))

    kmeans = KMeans(n_clusters=3, random_state=random_state)
    for gene_id in ords:
        gene_symbol = data.var_names[gene_id]
        mydat = [[x] for x in data.varm[de_key][log_exprs][gene_id]]
        with threadpool_limits(limits=1):
            kmeans.fit(mydat)
        kmeans_label_mode = pd.Series(kmeans.labels_).mode()[0]
        for i, kmeans_label in enumerate(
                np.argsort(kmeans.cluster_centers_[:, 0])):
            if kmeans_label != kmeans_label_mode:
                for pos in (kmeans.labels_ == kmeans_label).nonzero()[0]:
                    clust_label = labels[pos]
                    markers[clust_label][titles[i][0]].append(gene_symbol)
                    markers[clust_label][titles[i][1]].append("{:.2f}".format(
                        lgb.feature_importances_[gene_id]))

    return markers
def pca(
    data: Union[MultimodalData, UnimodalData],
    n_components: int = 50,
    features: str = "highly_variable_features",
    standardize: bool = True,
    max_value: float = 10.0,
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Principle Component Analysis (PCA) to the data.

    The calculation uses *scikit-learn* implementation.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``50``.
        Number of Principal Components to get.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for PCA. If ``None``, all features will be selected.

    standardize: ``bool``, optional, default: ``True``.
        Whether to scale the data to unit variance and zero mean.

    max_value: ``float``, optional, default: ``10``.
        The threshold to truncate data after scaling. If ``None``, do not truncate.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:

        * ``data.obsm["X_pca"]``: PCA matrix of the data.

    Update ``data.uns``:

        * ``data.uns["PCs"]``: The principal components containing the loadings.

        * ``data.uns["pca_variance"]``: Explained variance, i.e. the eigenvalues of the covariance matrix.

        * ``data.uns["pca_variance_ratio"]``: Ratio of explained variance.

        * ``data.uns["pca_features"]``: Record the features used to generate PCA components.

    Examples
    --------
    >>> pg.pca(data)
    """
    keyword = select_features(data,
                              features=features,
                              standardize=standardize,
                              max_value=max_value)
    X = data.uns[keyword].astype(
        np.float64
    )  # float64 to avoid precision issues and make results more reproducible across platforms
    pca = PCA(n_components=n_components, random_state=random_state
              )  # use auto solver, default is randomized for large datasets

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        X_pca = pca.fit_transform(X)

    data.obsm["X_pca"] = np.ascontiguousarray(X_pca, dtype=np.float32)
    data.uns["PCs"] = np.ascontiguousarray(
        pca.components_.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["pca"] = {}
    data.uns["pca"]["variance"] = pca.explained_variance_
    data.uns["pca"]["variance_ratio"] = pca.explained_variance_ratio_
    data.uns["pca_features"] = features  # record which feature to use
Example #17
def integrative_nmf(
    data: Union[MultimodalData, UnimodalData],
    batch: str = "Channel",
    n_components: int = 20,
    features: str = "highly_variable_features",
    space: str = "log",
    algo: str = "halsvar",
    mode: str = "online",
    tol: float = 1e-4,
    use_gpu: bool = False,
    lam: float = 5.0,
    fp_precision: str = "float",
    n_jobs: int = -1,
    random_state: int = 0,
    quantile_norm: bool = True,
) -> str:
    """Perform Integrative Nonnegative Matrix Factorization (iNMF) [Yang16]_ for data integration.

    The calculation uses `nmf-torch <https://github.com/lilab-bcb/nmf-torch>`_ .

    This function assumes that cells in each batch are adjacent to each other.
    In addition, it will scale each batch with L2 norm separately. The resulting Hs will also be scaled with L2 norm.
    If ``quantile_norm=True``, quantile normalization will be additionally performed.

    See [Welch19]_ and [Gao21]_ for preprocessing and normalization details.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    batch: ``str``, optional, default: ``"Channel"``.
        Which attribute in data.obs field represents batches, default is "Channel".

    n_components: ``int``, optional, default: ``20``.
        Number of iNMF components to compute.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for integrative_nmf.

    space: ``str``, optional, default: ``log``.
        Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs).

    algo: ``str``, optional, default: ``halsvar``
        Choose from ``mu`` (Multiplicative Update), ``halsvar`` (HALS variant that mimic bpp but faster) and ``bpp`` (alternative non-negative least squares with Block Principal Pivoting method).

    mode: ``str``, optional, default: ``online``
        Learning mode. Choose from ``batch`` and ``online``. Notice that ``online`` only works when ``beta=2.0``. For other beta loss, it switches back to ``batch`` method.

    tol: ``float``, optional, default: ``1e-4``
        The toleration used for convergence check.

    use_gpu: ``bool``, optional, default: ``False``
        If ``True``, use GPU if available. Otherwise, use CPU only.

    lam: ``float``, optional, default: ``5.0``
        The coefficient for regularization terms. If ``0``, then no regularization will be performed.

    fp_precision: ``str``, optional, default: ``float``
        The numeric precision on the results. Choose from ``float`` and ``double``.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    quantile_norm: ``bool``, optional, default: ``True``.
        Perform quantile normalization as described in Gao et al. Nature Biotech 2021. Cluster refinement K=20; min_cells=20; quantiles = 50.

    Returns
    -------
    out_rep: ``str``
        The representation keyword for the embedding calculated by the integrative NMF algorithm; ``'X_' + out_rep`` is the key stored in ``data.obsm``. ``out_rep`` is always equal to ``"inmf"``.


    Update ``data.obsm``:

        * ``data.obsm["X_inmf"]``: Scaled and possibly quantile normalized iNMF coordinates.

        * ``data.obsm["H"]``: The concatenation of coordinate factor matrices of shape ``(n_cells, n_components)``.

    Update ``data.uns``:

        * ``data.uns["W"]``: The feature factor matrix of shape ``(n_HVFs, n_components)``.

        * ``data.uns["V"]``: The batch specific feature factor matrices as one tensor of shape ``(n_batches, n_components, n_HVFs)``.

        * ``data.uns["inmf_err"]``: The iNMF loss.

        * ``data.uns["inmf_features"]``: Record the features used to perform iNMF analysis.

    Examples
    --------
    >>> pg.integrative_nmf(data)
    """
    if not check_batch_key(data, batch, "Cannot apply integrative_nmf!"):
        return "pca"

    Xs = _select_and_scale_features(data,
                                    features=features,
                                    space=space,
                                    batch=batch)

    try:
        from nmf import integrative_nmf
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.")
        sys.exit(-1)

    n_jobs = eff_n_jobs(n_jobs)

    Hs, W, Vs, err = integrative_nmf(
        Xs,
        n_components=n_components,
        algo=algo,
        mode=mode,
        tol=tol,
        n_jobs=n_jobs,
        random_state=random_state,
        use_gpu=use_gpu,
        lam=lam,
        fp_precision=fp_precision,
    )

    # Implementation of algo 3, quantile normalization
    Hs_new = numbaList()
    csums = numbaList()
    ids_by_clusts = numbaList()

    nbatch = len(Hs)
    rg = np.random.default_rng(random_state)
    seeds = rg.integers(4294967295, size=nbatch)
    ref_batch = max_size = -1
    for i in range(nbatch):
        H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0),
                                     dtype=np.float32)  # Scale H
        Hs_new.append(H_new)  # Append scaled H

        if not quantile_norm:
            continue

        clusters = np.argmax(H_new, axis=1)  # Assign cluster
        indices, _ = calculate_nearest_neighbors(
            H_new, K=20, n_jobs=n_jobs, random_state=seeds[i])  # KNN with K=20
        clusters, csum = _refine_cluster(clusters, indices,
                                         n_components)  # Refine cluster
        csums.append(csum)
        ids_by_clusts.append(np.argsort(clusters, kind='stable'))

        if H_new.shape[0] > max_size:  # Find ref batch
            max_size = H_new.shape[0]
            ref_batch = i

    if quantile_norm:
        _quantile_norm(Hs_new, csums, ids_by_clusts, nbatch, ref_batch,
                       n_components)  # quantile normalization

    data.uns["inmf_features"] = features  # record which feature to use
    data.uns["W"] = np.ascontiguousarray(
        W.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["V"] = np.array(Vs)
    data.uns["inmf_err"] = err

    data.obsm["H"] = np.concatenate(Hs)
    data.obsm["X_inmf"] = np.concatenate(Hs_new)

    return "inmf"
Example no. 18
0
def tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = "auto",
    initialization: str = "pca",
    random_state: int = 0,
    out_basis: str = "tsne",
) -> None:
    """Calculate t-SNE embedding of cells using the FIt-SNE package.

    This function uses fitsne_ package. See [Linderman19]_ for details on FIt-SNE algorithm.

    .. _fitsne: https://github.com/KlugerLab/FIt-SNE

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated FI-tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``auto``
        By default, the learning rate is determined automatically as max(data.shape[0] / early_exaggeration, 200). See [Belkina19]_ and [Kobak19]_ for details.
    
    initialization: ``str``, optional, default: ``pca``
        Initialization can be ``pca``, ``random``, or a ``np.ndarray`` of shape ``(n_cells, n_components)``. By default, we use ``pca`` initialization according to [Kobak19]_.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fitsne"``
        Key name for calculated FI-tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FI-tSNE coordinates of the data.

    Examples
    --------
    >>> pg.tsne(data)
    """

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    X = X_from_rep(data, rep).astype(np.float64)

    if learning_rate == "auto":
        learning_rate = max(X.shape[0] / early_exaggeration, 200.0)

    if initialization == "random":
        initialization = None
    elif initialization == "pca":
        if rep == "pca":
            initialization = X[:, 0:n_components].copy()
        else:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=n_components, random_state=random_state)
            with threadpool_limits(limits=n_jobs):
                initialization = np.ascontiguousarray(pca.fit_transform(X))
        initialization = initialization / np.std(initialization[:, 0]) * 0.0001
    else:
        assert isinstance(
            initialization,
            np.ndarray) and initialization.ndim == 2 and initialization.shape[
                0] == X.shape[0] and initialization.shape[1] == n_components
        if initialization.dtype != np.float64:
            initialization = initialization.astype(np.float64)

    data.obsm["X_" + out_basis] = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
        initialization,
    )
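
Two preprocessing rules inside ``tsne()`` are worth illustrating in isolation: the ``"auto"`` learning rate is ``max(n_obs / early_exaggeration, 200)``, and a PCA initialization is rescaled so that its first column has standard deviation 1e-4, following [Kobak19]_. A standalone numpy sketch of both rules, with no FIt-SNE call:

import numpy as np

def tsne_auto_learning_rate(n_obs: int, early_exaggeration: float = 12.0) -> float:
    # Same rule as in the function above: max(n_obs / early_exaggeration, 200).
    return max(n_obs / early_exaggeration, 200.0)

def scale_pca_init(init: np.ndarray) -> np.ndarray:
    # Rescale a PCA-based initialization so its first column has std 1e-4,
    # mirroring `initialization / np.std(initialization[:, 0]) * 0.0001` above.
    return init / np.std(init[:, 0]) * 1e-4

rng = np.random.default_rng(0)
init = rng.normal(size=(1000, 2))
print(tsne_auto_learning_rate(1000))        # 200.0, since 1000 / 12 < 200
print(np.std(scale_pca_init(init)[:, 0]))   # ~1e-4
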
Example no. 19
0
def nmf(
    data: Union[MultimodalData, UnimodalData],
    n_components: int = 20,
    features: str = "highly_variable_features",
    space: str = "log",
    init: str = "nndsvdar",
    algo: str = "halsvar",
    mode: str = "batch",
    tol: float = 1e-4,
    use_gpu: bool = False,
    alpha_W: float = 0.0,
    l1_ratio_W: float = 0.0,
    alpha_H: float = 0.0,
    l1_ratio_H: float = 0.0,
    fp_precision: str = "float",
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Nonnegative Matrix Factorization (NMF) to the data using Frobenius norm. Steps include select features and L2 normalization and NMF and L2 normalization of resulting coordinates.

    The calculation uses `nmf-torch <https://github.com/lilab-bcb/nmf-torch>`_ package.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``20``.
        Number of NMF components to compute.

    features: ``str``, optional, default: ``"highly_variable_features"``.
        Keyword in ``data.var`` to specify features used for nmf.

    space: ``str``, optional, default: ``log``.
        Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs).

    init: ``str``, optional, default: ``nndsvdar``.
        Method to initialize NMF. Options are 'random', 'nndsvd', 'nndsvda' and 'nndsvdar'.

    algo: ``str``, optional, default: ``halsvar``
        Choose from ``mu`` (Multiplicative Update), ``hals`` (Hierarchical Alternating Least Squares), ``halsvar`` (a HALS variant that mimics ``bpp`` and can sometimes achieve better convergence) and ``bpp`` (alternating non-negative least squares with the Block Principal Pivoting method).

    mode: ``str``, optional, default: ``batch``
        Learning mode. Choose from ``batch`` and ``online``. Note that ``online`` only works when ``beta=2.0``; for other beta losses, it falls back to the ``batch`` method.

    tol: ``float``, optional, default: ``1e-4``
        The tolerance used for the convergence check.

    use_gpu: ``bool``, optional, default: ``False``
        If ``True``, use GPU if available. Otherwise, use CPU only.

    alpha_W: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to W.
        If zero or negative, no regularization regarding W is considered.

    l1_ratio_W: ``float``, optional, default: ``0.0``
        The ratio of L1 penalty on W, which must be between 0 and 1. Thus the ratio of L2 penalty on W is (1 - l1_ratio_W).

    alpha_H: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to H.
        If zero or negative, no regularization regarding H is considered.

    l1_ratio_H: ``float``, optional, default: ``0.0``
        The ratio of L1 penalty on H, which must be between 0 and 1. Thus the ratio of L2 penalty on H is (1 - l1_ratio_H).

    fp_precision: ``str``, optional, default: ``float``
        The numeric precision on the results. Choose from ``float`` and ``double``.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``.
        Random seed to be set for reproducing result.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:

        * ``data.obsm["X_nmf"]``: Scaled NMF coordinates of shape ``(n_cells, n_components)``. Each column has a unit variance.

        * ``data.obsm["H"]``: The coordinate factor matrix of shape ``(n_cells, n_components)``.

    Update ``data.uns``:

        * ``data.uns["W"]``: The feature factor matrix of shape ``(n_HVFs, n_components)``.

        * ``data.uns["nmf_err"]``: The NMF loss.

        * ``data.uns["nmf_features"]``: Record the features used to perform NMF analysis.

    Examples
    --------
    >>> pg.nmf(data)
    """
    X = _select_and_scale_features(data, features=features, space=space)

    try:
        from nmf import run_nmf
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.")
        sys.exit(-1)

    H, W, err = run_nmf(
        X,
        n_components=n_components,
        init=init,
        algo=algo,
        mode=mode,
        tol=tol,
        n_jobs=eff_n_jobs(n_jobs),
        random_state=random_state,
        use_gpu=use_gpu,
        alpha_W=alpha_W,
        l1_ratio_W=l1_ratio_W,
        alpha_H=alpha_H,
        l1_ratio_H=l1_ratio_H,
        fp_precision=fp_precision,
    )

    data.uns["nmf_features"] = features  # record which feature to use
    data.uns["W"] = np.ascontiguousarray(
        W.T, dtype=np.float32
    )  # cannot be varm because numbers of features are not the same
    data.uns["nmf_err"] = err

    data.obsm["H"] = np.ascontiguousarray(H, dtype=np.float32)
    H = data.obsm["H"]
    data.obsm["X_nmf"] = H / np.linalg.norm(H, axis=0)
Example no. 20
0
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximate UMAP embedding that uses a deep learning model to improve speed.

    Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Fraction of cells to down-sample.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_rate: ``float``, optional, default: ``10.0``
        Learning rate used for the polishing UMAP run after the deep regressor has predicted coordinates for the unselected cells.

    polish_n_epochs: ``int``, optional, default: ``30``
        Number of epochs for the polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    knn_indices, knn_dists = get_neighbors(data,
                                           K=select_K,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)

    selected = select_cells(
        knn_dists,
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 n_jobs,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices, knn_dists = get_neighbors(data,
                                           K=n_neighbors,
                                           rep=rep,
                                           n_jobs=n_jobs,
                                           random_state=random_state,
                                           full_speed=full_speed)
    knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X_full,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
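
``net_train_and_predict`` is referenced but not shown here. The sketch below is a hypothetical illustration of the idea the docstring describes: fit an MLPRegressor on the down-sampled cells (representation coordinates to UMAP coordinates) and predict coordinates for the remaining cells. The hidden-layer sizes and iteration count are placeholders, not Pegasus' actual settings.

from sklearn.neural_network import MLPRegressor

def net_train_and_predict_sketch(X_train, Y_train, X_pred, alpha=0.1, random_state=0):
    """Hypothetical stand-in: regress embedding coordinates on the input representation."""
    reg = MLPRegressor(
        hidden_layer_sizes=(100, 70, 50, 25),  # placeholder architecture
        alpha=alpha,                           # L2 penalty, cf. the net_alpha parameter
        random_state=random_state,
        max_iter=500,
    )
    reg.fit(X_train, Y_train)                  # selected cells: rep coords -> UMAP coords
    return reg.predict(X_pred)                 # extrapolate to unselected cells
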
Example no. 21
0
def de_analysis(
    data: Union[MultimodalData, UnimodalData, AnnData],
    cluster: str,
    condition: Optional[str] = None,
    subset: Optional[List[str]] = None,
    de_key: Optional[str] = "de_res",
    n_jobs: Optional[int] = -1,
    t: Optional[bool] = False,
    fisher: Optional[bool] = False,
    temp_folder: Optional[str] = None,
    verbose: Optional[bool] = True,
) -> None:
    """Perform Differential Expression (DE) Analysis on data.

    The analysis considers one cluster at one time, comparing gene expression levels on cells
    within the cluster with all the others using a number of statistical tools, and determining
    up-regulated genes and down-regulated genes of the cluster.

    Mann-Whitney U test and AUROC are calculated by default. Welch's t-test and Fisher's exact test are optional.

    The scalability of calculating all the test statistics is improved based on ideas from `Presto <https://github.com/immunogenomics/presto>`_.

    Parameters
    ----------
    data: ``MultimodalData``, ``UnimodalData``, or ``anndata.AnnData``
        Data matrix with rows for cells and columns for genes.

    cluster: ``str``
        Cluster labels used in DE analysis. Must exist in ``data.obs``.

    condition: ``str``, optional, default: ``None``
        Sample attribute used as the condition in DE analysis. If ``None``, no condition is considered; otherwise, it must exist in ``data.obs``.
        If ``condition`` is used, the DE analysis will be performed separately on cells of each level of ``data.obs[condition]``, and the results are collected afterwards.

    subset: ``List[str]``, optional, default: ``None``
        Perform DE analysis on only a subset of cluster IDs. Cluster ID subset is specified as a list of strings, such as ``[clust_1,clust_3,clust_5]``, where all IDs must exist in ``data.obs[cluster]``.

    de_key: ``str``, optional, default: ``"de_res"``
        Key name of DE analysis results stored.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    t: ``bool``, optional, default: ``False``
        If ``True``, calculate Welch's t-test.

    fisher: ``bool``, optional, default: ``False``
        If ``True``, calculate Fisher's exact test.

    temp_folder: ``str``, optional, default: ``None``
        Joblib temporary folder for memmapping numpy arrays.

    verbose: ``bool``, optional, default: ``True``
        If ``True``, show detailed intermediate output.

    Returns
    -------
    ``None``

    Update ``data.varm``:
        ``data.varm[de_key]``: DE analysis result.

    Examples
    --------
    >>> pg.de_analysis(data, cluster='spectral_leiden_labels')
    >>> pg.de_analysis(data, cluster='louvain_labels', condition='anno')
    """
    if cluster not in data.obs:
        raise ValueError("Cannot find cluster label!")
    cluster_labels = data.obs[cluster].values
    if not is_categorical_dtype(cluster_labels):
        from natsort import natsorted
        cluster_labels = pd.Categorical(cluster_labels,
                                        natsorted(np.unique(cluster_labels)))

    cond_labels = None
    if condition is not None:
        if condition not in data.obs:
            raise ValueError("Cannot find condition!")
        cond_labels = data.obs[condition].values
        if not is_categorical_dtype(cond_labels):
            from natsort import natsorted
            cond_labels = pd.Categorical(cond_labels,
                                         natsorted(np.unique(cond_labels)))
        if cond_labels.categories.size < 2:
            raise ValueError("Number of conditions must be at least 2!")

    X = data.X if isinstance(data.X, csr_matrix) else csr_matrix(
        data.X)  # If dense matrix, force it to be a csr_matrix

    if subset is not None:
        # subset data for de analysis
        subset = np.array(subset)
        idx_s = np.isin(subset, cluster_labels.categories.values)
        if idx_s.sum() < subset.size:
            raise ValueError("These cluster labels do not exist: " +
                             ",".join(subset[~idx_s]) + "!")

        idx = np.isin(cluster_labels, subset)
        cluster_labels = pd.Categorical(cluster_labels[idx], categories=subset)
        if cond_labels is not None:
            cond_labels = cond_labels[idx]
        X = X[idx]

    if condition is not None:
        #Eliminate NaN rows from calculation
        idx_na = cond_labels.isna()
        if idx_na.sum() > 0:
            logger.warning(
                "Detected NaN values in condition. Cells with NaN values are excluded from DE analysis."
            )
            idx_not_na = ~idx_na
            X = X[idx_not_na]
            cluster_labels = cluster_labels[idx_not_na]
            cond_labels = cond_labels[idx_not_na]

    n_jobs = eff_n_jobs(n_jobs)
    gene_names = data.var_names.values

    if cond_labels is None:
        df = _de_test(X, cluster_labels, gene_names, n_jobs, t, fisher,
                      temp_folder, verbose)
    else:
        df = _de_test_cond(X, cluster_labels, cond_labels, gene_names, n_jobs,
                           t, fisher, temp_folder, verbose)

    data.varm[de_key] = df.to_records(index=False)

    logger.info("Differential expression analysis is finished.")
Example no. 22
0
def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
    use_cache: `bool`, optional (default: True)
        If ``use_cache`` is ``True`` and cached kNN results are found, do not recompute.
    dist: `str`, optional (default: 'l2')
        Distance metric to use. By default, use squared L2 distance. Other available options are inner product ``'ip'`` and cosine similarity ``'cosine'``.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if use_cache and knn_is_cached(data, indices_key, distances_key, K):
        indices = data.obsm[indices_key]
        distances = data.obsm[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep, n_comps),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
            dist=dist,
        )
        data.obsm[indices_key] = indices
        data.register_attr(indices_key, "knn")
        data.obsm[distances_key] = distances
        data.register_attr(distances_key, "knn")

    return indices, distances
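
A hedged usage note: the kNN arrays are cached in ``data.obsm`` under keys derived from ``rep``, so a repeated call with the same ``K`` and ``use_cache=True`` skips recomputation. The shape comment below is an inference from how later snippets prepend the query point to these indices.

indices, distances = get_neighbors(data, K=100, rep="pca", dist="l2")
indices2, distances2 = get_neighbors(data, K=100, rep="pca", dist="l2")  # served from cache

cached_indices = data.obsm["pca_knn_indices"]      # likely (n_cells, K - 1); the query point itself is excluded
cached_distances = data.obsm["pca_knn_distances"]
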
Example no. 23
0
def jump_method(
    data: MultimodalData,
    rep: str = "pca",
    K_max: int = 40,
    Y: float = None,
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """ Determine the optimal number of clusters using the Jump Method. [Sugar and James, 2003]_

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    K_max: ``int``, optional, default: 40
        The maximum number of clusters to try.

    Y: ``float``, optional, default: ``None``
        The transformation power used. If None, use min(data.shape[1] / 3.0, 3.0).

    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.uns``:
        * ``data.uns[rep + "_jump_values"]``: Jump values (differences between adjacent transformed distortion values).
        * ``data.uns[rep + "_optimal_k"]``: The optimal number of clusters determined by the jump method.

    Examples
    --------
    >>> pg.jump_method(data)
    """
    X = data.obsm[f"X_{rep}"]
    Y = min(data.shape[1] / 3.0, 3.0) if Y is None else Y
    logger.info(f"Jump method: Y = {Y:.3f}.")

    n_jobs = eff_n_jobs(n_jobs)
    jump_values = np.zeros(K_max, dtype=np.float64)
    v_old = v = 0.0
    for k in range(1, K_max + 1):
        with threadpool_limits(limits=n_jobs):
            kmeans = KMeans(n_clusters=k, random_state=random_state).fit(X)
        v = _calc_trans_distor(X, kmeans.labels_, Y)
        jump_values[k - 1] = v - v_old
        v_old = v
        logger.info(
            f"K = {k} is finished, jump_value = {jump_values[k - 1]:.6f}.")
    optimal_k = np.argmax(jump_values) + 1

    data.uns[f"{rep}_jump_values"] = jump_values
    data.uns[f"{rep}_optimal_k"] = optimal_k

    logger.info(f"Jump method finished. Optimal K = {optimal_k}.")
Example no. 24
0
def calc_kBET(
    data: MultimodalData,
    attr: str,
    rep: str = "pca",
    K: int = 25,
    alpha: float = 0.05,
    n_jobs: int = -1,
    random_state: int = 0,
    temp_folder: str = None,
    use_cache: bool = True,
) -> Tuple[float, float, float]:
    """Calculate the kBET metric of the data regarding a specific sample attribute and embedding.

    The kBET metric is defined in [Büttner18]_, which measures if cells from different samples mix well in their local neighborhood.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    attr: ``str``
        The sample attribute to consider. Must exist in ``data.obs``.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation to be used. The key ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    K: ``int``, optional, default: ``25``
        Number of nearest neighbors, using L2 metric.

    alpha: ``float``, optional, default: ``0.05``
        Acceptance rate threshold. A cell is accepted if its kBET p-value is greater than or equal to ``alpha``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads used. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    temp_folder: ``str``, optional, default: ``None``
        Temporary folder for joblib execution.

    use_cache: ``bool``, optional, default: ``True``
        If ``True``, use cached kNN results when available.

    Returns
    -------
    stat_mean: ``float``
        Mean kBET chi-square statistic over all cells.

    pvalue_mean: ``float``
        Mean kBET p-value over all cells.

    accept_rate: ``float``
        kBET Acceptance rate of the sample.

    Examples
    --------
    >>> pg.calc_kBET(data, attr = 'Channel')

    >>> pg.calc_kBET(data, attr = 'Channel', rep = 'umap')
    """
    assert attr in data.obs
    if data.obs[attr].dtype.name != "category":
        data.obs[attr] = pd.Categorical(data.obs[attr])

    ideal_dist = (data.obs[attr].value_counts(normalize=True,
                                              sort=False).values
                  )  # ideal no batch effect distribution
    nsample = data.shape[0]
    nbatch = ideal_dist.size

    attr_values = data.obs[attr].values.copy()
    attr_values.categories = range(nbatch)

    indices, distances = get_neighbors(
        data,
        K=K,
        rep=rep,
        n_jobs=n_jobs,
        random_state=random_state,
        use_cache=use_cache,
    )
    knn_indices = np.concatenate(
        (np.arange(nsample).reshape(-1, 1), indices[:, 0:K - 1]),
        axis=1)  # add query as 1-nn

    # partition into chunks
    n_jobs = min(eff_n_jobs(n_jobs), nsample)
    starts = np.zeros(n_jobs + 1, dtype=int)
    quotient = nsample // n_jobs
    remainder = nsample % n_jobs
    for i in range(n_jobs):
        starts[i + 1] = starts[i] + quotient + (1 if i < remainder else 0)

    from joblib import Parallel, delayed, parallel_backend
    with parallel_backend("loky", inner_max_num_threads=1):
        kBET_arr = np.concatenate(
            Parallel(n_jobs=n_jobs, temp_folder=temp_folder)(
                delayed(calc_kBET_for_one_chunk)(knn_indices[
                    starts[i]:starts[i + 1], :], attr_values, ideal_dist, K)
                for i in range(n_jobs)))

    res = kBET_arr.mean(axis=0)
    stat_mean = res[0]
    pvalue_mean = res[1]
    accept_rate = (kBET_arr[:, 1] >= alpha).sum() / nsample

    return (stat_mean, pvalue_mean, accept_rate)
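
``calc_kBET_for_one_chunk`` is not shown here. Below is a hypothetical sketch of the per-cell test it is expected to perform: compare the batch composition of each cell's K-neighborhood against the global batch frequencies with a chi-squared goodness-of-fit test, returning one (statistic, p-value) row per cell.

import numpy as np
from scipy.stats import chi2

def kbet_for_one_chunk_sketch(knn_indices, attr_values, ideal_dist, K):
    """Hypothetical per-cell kBET: one (chi-square statistic, p-value) row per cell."""
    codes = np.asarray(attr_values.codes)   # integer batch codes (categories were set to range(nbatch))
    nbatch = ideal_dist.size
    expected = K * ideal_dist
    results = np.zeros((knn_indices.shape[0], 2))
    for i, neighbors in enumerate(knn_indices):
        observed = np.bincount(codes[neighbors], minlength=nbatch)
        stat = ((observed - expected) ** 2 / expected).sum()
        results[i] = (stat, chi2.sf(stat, nbatch - 1))
    return results
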
Example no. 25
0
def _run_scrublet(
    data: Union[MultimodalData, UnimodalData],
    raw_mat_key: Optional[str] = 'counts',
    name: Optional[str] = '',
    expected_doublet_rate: Optional[float] = None,
    sim_doublet_ratio: Optional[float] = 2.0,
    n_prin_comps: Optional[int] = 30,
    k: Optional[int] = None,
    n_jobs: Optional[int] = -1,
    random_state: Optional[int] = 0,
    plot_hist: Optional[bool] = True,
    manual_correction: Optional[str] = None,
) -> Union[None, Figure]:
    """Calculate doublet scores using Scrublet-like [Wolock18]_ strategy for the current data.X; determine a right threshold based on the KDE curve.
       This function should be called after highly_variable_gene selection.

    Parameters
    -----------
    data: ``Union[MultimodalData, UnimodalData]`` object.
        Annotated data matrix with rows for cells and columns for genes. Data must be low quality cell and gene filtered and log-transformed.

    raw_mat_key: ``str``, optional, default: ``counts``
        Matrix key for the raw count matrix.

    name: ``str``, optional, default: ``''``
        Name of the sample.

    expected_doublet_rate: ``float``, optional, default: ``None``
        The expected doublet rate for the experiment. By default, calculate the expected rate based on the number of cells, using the 10x multiplet rate table.

    sim_doublet_ratio: ``float``, optional, default: ``2.0``
        The ratio between synthetic doublets and observed cells.

    n_prin_comps: ``int``, optional, default: ``30``
        Number of principal components.

    k: ``int``, optional, default: ``None``
        Number of observed cell neighbors. If None, k = round(0.5 * sqrt(number of observed cells)). Total neighbors k_adj = round(k * (1.0 + sim_doublet_ratio)).

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random state for doublet simulation, PCA and approximate nearest neighbor search.

    plot_hist: ``bool``, optional, default: ``True``
        If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.

    manual_correction: ``str``, optional, default: ``None``
        If present, use the human guidance provided in ``manual_correction`` to select the threshold. Currently only ``manual_correction='peak'`` is supported, which means cutting at the center of the peak.

    Returns
    --------
    ``None`` or a ``matplotlib.figure.Figure`` object if ``plot_hist`` is ``True``.

    Update ``data.obs``:
        * ``data.obs['doublet_score']``: The calculated doublet scores on cells.
        * ``data.obs['pred_dbl']``: Predicted doublets as True.

    Update ``data.uns``:
        * ``data.uns['doublet_threshold']``: Inferred doublet threshold; any score > threshold is identified as a neotypic doublet.

    Examples
    --------
    >>> pg.run_scrublet(data)
    """
    from pegasus.tools import calculate_nearest_neighbors, simulate_doublets
    from sklearn.decomposition import PCA
    from scipy.stats import gaussian_kde
    from sklearn.cluster import KMeans

    if "highly_variable_features" not in data.var:
        raise ValueError(
            "_run_scrublet must be run after highly_variable_features is called!"
        )

    r = sim_doublet_ratio
    if expected_doublet_rate is None:
        expected_doublet_rate = _calc_expected_doublet_rate(data.shape[0])
    rho = expected_doublet_rate

    # subset the raw count matrix
    rawX = data.get_matrix(raw_mat_key)
    obs_umis = rawX.sum(axis=1, dtype=np.int32).A1
    rawX = rawX[:, data.var["highly_variable_features"].values]
    # Simulate synthetic doublets
    sim_rawX, pair_idx = simulate_doublets(rawX, r, random_state)
    sim_umis = obs_umis[pair_idx].sum(axis=1, dtype=np.int32)

    # standardize and calculate PCA for rawX
    obsX = rawX.astype(np.float32).toarray()
    obsX /= obs_umis.reshape(-1, 1)  # normalize each cell

    m1 = obsX.mean(axis=0)  # calculate mean and std
    psum = np.multiply(obsX, obsX).sum(axis=0)
    std = ((psum - obsX.shape[0] * (m1**2)) / (obsX.shape[0] - 1.0))**0.5
    std[std == 0] = 1

    obsX -= m1  # standardize
    obsX /= std

    pca = PCA(n_components=n_prin_comps, random_state=random_state)
    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        obs_pca = pca.fit_transform(obsX.astype(
            np.float64))  # float64 for reproducibility
        obs_pca = np.ascontiguousarray(obs_pca, dtype=np.float32)
        kmeans = KMeans(n_clusters=5, random_state=random_state).fit(obs_pca)

    # calculate in simulated distribution, expected percentage of embedded doublets
    data.obs["dbl_kmeans_"] = pd.Categorical(kmeans.labels_)
    _, freqs = np.unique(kmeans.labels_, return_counts=True)
    freqs = np.array(freqs) / sum(freqs)
    d_emb = (((1.0 - rho) * freqs + rho * (freqs**2))**2).sum()
    d_neo = 1.0 - d_emb

    # standardize and calculate PCA for sim_rawX
    simX = sim_rawX.astype(np.float32).toarray()
    simX /= sim_umis.reshape(-1, 1)  # normalize each cell

    simX -= m1  # standardize
    simX /= std

    sim_pca = pca.transform(simX)  # transform to PC coordinates
    sim_pca = np.ascontiguousarray(sim_pca, dtype=np.float32)

    # concatenate observed and simulated data
    pc_coords = np.vstack((obs_pca, sim_pca))
    is_doublet = np.repeat(np.array([0, 1], dtype=np.int32),
                           [obsX.shape[0], simX.shape[0]])

    # Calculate k nearest neighbors
    if k is None:
        k = int(round(0.5 * np.sqrt(obsX.shape[0])))
    k_adj = int(round(k * (1.0 + r)))
    indices, _ = calculate_nearest_neighbors(pc_coords,
                                             K=k_adj + 1,
                                             n_jobs=n_jobs)

    # Calculate scrublet-like doublet score
    k_d = is_doublet[indices].sum(axis=1)
    q = (k_d + 1.0) / (k_adj + 2.0)  # Equation 5
    doublet_scores = (q * rho / r) / (
        (1.0 - rho) - q * (1.0 - rho - rho / r))  # Equation 4
    obs_scores = doublet_scores[0:obsX.shape[0]]
    sim_scores = doublet_scores[obsX.shape[0]:]

    # Determine a scrublet score threshold
    # log transformed
    sim_scores_log = np.log(sim_scores)

    # Estimate KDE
    min_score = sim_scores_log.min()
    max_score = sim_scores_log.max()
    min_gap = np.diff(np.unique(np.sort(sim_scores_log))).min()
    from math import ceil
    n_gap = max(int(ceil((max_score - min_score) / min_gap)),
                200)  # minimum is 200
    gap = (max_score - min_score) / n_gap

    n_ext = 5
    min_score -= gap * n_ext
    max_score += gap * n_ext
    x = np.linspace(min_score, max_score,
                    n_gap + 1 + n_ext * 2)  # generate x coordinates
    kde = gaussian_kde(sim_scores_log)
    y = kde(x)

    # Find local maxima
    maxima, maxima_by_x, filtered_maxima = _find_local_maxima(y)
    assert maxima.size > 0
    curv = _calc_vec_f(_curvature, x.size, y, gap)  # calculate curvature

    x_theory = np.percentile(sim_scores_log, d_emb * 100.0 + 1e-6)
    threshold_theory = np.exp(x_theory)

    case_num = -1
    pos = -1
    if maxima.size >= 2:
        pos = _locate_cutoff_among_peaks_with_guide(x, y, maxima,
                                                    sim_scores_log, d_neo)
        case_num = 0
        d_pneo = (sim_scores_log > x[pos]).sum() / sim_scores_log.size
        if d_pneo < 0.1:  # < 10%, consider it as not a peak
            idx_ = maxima_by_x >= pos
            filtered_maxima = np.concatenate(
                (filtered_maxima, maxima_by_x[idx_]))
            maxima_by_x = maxima_by_x[~idx_]
            pos = -1
    if pos < 0:
        frac_right = (sim_scores_log >
                      x[maxima_by_x[-1]]).sum() / sim_scores.size
        if frac_right < 0.41 or (frac_right < 0.5
                                 and x_theory + 0.05 < x[maxima_by_x[-1]]):
            logger.debug(f"frac_right={frac_right}.")
            if maxima_by_x.size > 1:
                posvec = np.vectorize(
                    lambda i: y[maxima_by_x[i] + 1:maxima_by_x[i + 1]].argmin(
                    ) + (maxima_by_x[i] + 1))(range(maxima_by_x.size - 1))
                pos = posvec[np.argmin(np.abs(x[posvec] - x_theory))]
                case_num = 1
            else:
                pos = _find_cutoff_left_side(maxima_by_x[0], x, curv, x_theory)
                case_num = 2
        else:
            pos = _find_cutoff_right_side(maxima_by_x[-1], curv,
                                          filtered_maxima)
            case_num = 3
    threshold = np.exp(x[pos])

    threshold_auto = None
    if manual_correction is not None:
        assert case_num == 2
        threshold_auto = threshold
        threshold = np.exp(x[maxima_by_x[-1]])

    data.obs["doublet_score"] = obs_scores.astype(np.float32)
    data.obs["pred_dbl"] = obs_scores > threshold
    data.uns["doublet_threshold"] = float(threshold)

    neo_dbl_rate = data.obs['pred_dbl'].sum() / data.shape[0]
    neo_sim_dbl_rate = (sim_scores > threshold).sum() / sim_scores.size
    logger.info(
        f"Sample {name}: doublet threshold = {threshold:.4f}; total cells = {data.shape[0]}; neotypic doublet rate in simulation = {neo_sim_dbl_rate:.2%}; neotypic doublet rate = {neo_dbl_rate:.2%}."
    )

    fig = None
    if plot_hist:
        fig = _plot_hist(obs_scores,
                         sim_scores,
                         threshold,
                         threshold_theory,
                         x,
                         y,
                         curv,
                         threshold_auto=threshold_auto)
    return fig
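
A hedged numeric illustration of the doublet-score formula used above (Equations 4 and 5 in the comments): given ``k_d`` doublet neighbors out of ``k_adj``, ``q`` is a smoothed neighborhood doublet fraction, and the score rescales it by the expected doublet rate ``rho`` and the simulation ratio ``r``. The numbers below are illustrative only.

def scrublet_like_score(k_d: int, k_adj: int, rho: float, r: float) -> float:
    q = (k_d + 1.0) / (k_adj + 2.0)                                   # Equation 5
    return (q * rho / r) / ((1.0 - rho) - q * (1.0 - rho - rho / r))  # Equation 4

# Worked example; rho and r are illustrative values, not derived from any dataset:
print(round(scrublet_like_score(k_d=30, k_adj=60, rho=0.06, r=2.0), 4))  # 0.0309
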