Python MultimodalData.register_attr 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: pegasusio

클래스/타입: MultimodalData

메소드/함수: register_attr

hotexamples.com에서의 예제들: 9

Python MultimodalData.register_attr - 9개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 pegasusio.MultimodalData.register_attr에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

MultimodalData(12)

register_attr(9)

add_data(6)

get_matrix(5)

list_data(5)

get_data(5)

current_data(5)

get_modality(4)

X(2)

get_genome(2)

add_matrix(2)

list_keys(2)

select_data(2)

current_key(1)

filter_data(1)

kick_start(1)

_zarrobj(1)

_inplace_subset_var(1)

scan_black_list(1)

예제 #1

파일 보기

def split_one_cluster(
    data: MultimodalData,
    clust_label: str,
    clust_id: str,
    n_clust: int,
    res_label: str,
    rep: str = "pca",
    random_state: int = 0,
) -> None:
    """
    Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_components' clusters and write the new clusting results to 'res_label'. Assume 'clust_label' named clusters as numbers (in str format).

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    clust_label: `str`
        Use existing clustering stored in data.obs['clust_label'].

    clust_id: `str`
        Cluster ID in data.obs['clust_label'].

    n_clust: `int`
        Split 'clust_id' into `n_clust' subclusters.

    res_label: `str`,
        Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use for the KMeans step in 'spectral_louvain' and 'spectral_leiden'. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[res_label]``: New cluster labels of cells as categorical data.

    Examples
    --------
    >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
    """
    idx = np.where(data.obs[clust_label] == clust_id)[0]
    tmpdat = data[idx].copy()
    from pegasus.tools import neighbors
    neighbors(tmpdat, rep=rep, use_cache=False)
    leiden(tmpdat,
           rep=rep,
           resolution=None,
           n_clust=n_clust,
           random_state=random_state)
    new_clust = data.obs[clust_label].values.astype(int)
    new_label = new_clust.max() + 1
    for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
        new_clust[idx[(
            tmpdat.obs['leiden_labels'] == label).values]] = new_label
        new_label += 1
    data.obs[res_label] = pd.Categorical(values=new_clust.astype(str),
                                         categories=np.array(
                                             range(1, new_label)).astype(str))
    data.register_attr(res_label, "cluster")
    del tmpdat

예제 #2

파일 보기

def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
    use_cache: `bool`, optional (default: True)
        If use_cache and found cached knn results, will not recompute.
    dist: `str`, optional (default: 'l2')
        Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if use_cache and knn_is_cached(data, indices_key, distances_key, K):
        indices = data.obsm[indices_key]
        distances = data.obsm[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep, n_comps),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
            dist=dist,
        )
        data.obsm[indices_key] = indices
        data.register_attr(indices_key, "knn")
        data.obsm[distances_key] = distances
        data.register_attr(distances_key, "knn")

    return indices, distances

예제 #3

파일 보기

def leiden(
    data: MultimodalData,
    rep: str = "pca",
    resolution: int = 1.3,
    n_clust: int = None,
    n_iter: int = -1,
    random_state: int = 0,
    class_label: str = "leiden_labels",
) -> None:
    """Cluster the data using Leiden algorithm. [Traag19]_

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm`` and nearest neighbors must be calculated so that affinity matrix ``'W_' + rep`` exists in ``data.uns``. By default, use PCA coordinates.

    resolution: ``int``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    n_clust: ``int``, optional, default: ``None``
        This option only takes effect if 'resolution = None'. Try to find an appropriate resolution by binary search such that the total number of clusters matches 'n_clust'. The range of resolution to search is (0.01, 2.0].

    n_iter: ``int``, optional, default: ``-1``
        Number of iterations that Leiden algorithm runs. If ``-1``, run the algorithm until reaching its optimal clustering.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels of cells as categorical data.

    Examples
    --------
    >>> pg.leiden(data)
    """

    try:
        import leidenalg
    except ImportError:
        import sys
        logger.error("Need leidenalg! Try 'pip install leidenalg'.")
        sys.exit(-1)

    rep_key = "W_" + rep
    if rep_key not in data.obsp:
        raise ValueError(
            "Cannot find affinity matrix. Please run neighbors first!")
    W = data.obsp[rep_key]

    G = construct_graph(W)
    if resolution is not None:
        membership = _run_community_detection("leiden", leidenalg, G,
                                              resolution, random_state, n_iter)
    else:
        assert isinstance(n_clust, int)
        resolution, membership = _find_optimal_resolution(
            "leiden", leidenalg, n_clust, 2.0, G, random_state, n_iter)

    data.uns["leiden_resolution"] = resolution
    labels = np.array([str(x + 1) for x in membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels,
                                           categories=categories)
    data.register_attr(class_label, "cluster")

    n_clusters = data.obs[class_label].cat.categories.size
    logger.info(f"Leiden clustering is done. Get {n_clusters} clusters.")

예제 #4

파일 보기

def spectral_leiden(
    data: MultimodalData,
    rep: str = "pca",
    resolution: float = 1.3,
    rep_kmeans: str = "diffmap",
    n_clusters: int = 30,
    n_clusters2: int = 50,
    n_init: int = 10,
    n_jobs: int = -1,
    random_state: int = 0,
    class_label: str = "spectral_leiden_labels",
) -> None:
    """Cluster the data using Spectral Leiden algorithm. [Li20]_

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    resolution: ``int``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    rep_kmeans: ``str``, optional, default: ``"diffmap"``
        The embedding representation on which the KMeans runs. Keyword must exist in ``data.obsm``. By default, use Diffusion Map coordinates. If diffmap is not calculated, use PCA coordinates instead.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering. Default is set to be the same as scikit-learn Kmeans function.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use for the KMeans step. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"spectral_leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels for cells as categorical data.

    Examples
    --------
    >>> pg.spectral_leiden(data)
    """

    try:
        import leidenalg
    except ImportError:
        import sys
        logger.error("Need leidenalg! Try 'pip install leidenalg'.")
        sys.exit(-1)

    if f"X_{rep_kmeans}" not in data.obsm.keys():
        logger.warning(
            f"{rep_kmeans} is not calculated, switch to pca instead.")
        rep_kmeans = "pca"
        if f"X_{rep_kmeans}" not in data.obsm.keys():
            raise ValueError(f"Please run {rep_kmeans} first!")
    if f"W_{rep}" not in data.obsp:
        raise ValueError(
            "Cannot find affinity matrix. Please run neighbors first!")

    labels = partition_cells_by_kmeans(
        data.obsm[f"X_{rep_kmeans}"],
        n_clusters,
        n_clusters2,
        n_init,
        n_jobs,
        random_state,
    )

    W = data.obsp[f"W_{rep}"]

    G = construct_graph(W)
    partition_type = leidenalg.RBConfigurationVertexPartition
    partition = partition_type(G,
                               resolution_parameter=resolution,
                               weights="weight",
                               initial_membership=labels)
    partition_agg = partition.aggregate_partition()

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    diff = optimiser.optimise_partition(partition_agg, -1)
    partition.from_coarse_partition(partition_agg)

    labels = np.array([str(x + 1) for x in partition.membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels,
                                           categories=categories)
    data.register_attr(class_label, "cluster")

    n_clusters = data.obs[class_label].cat.categories.size
    logger.info(
        f"Spectral Leiden clustering is done. Get {n_clusters} clusters.")

예제 #5

파일 보기