def calc_force_directed_layout(
    W,
    file_name,
    n_jobs,
    target_change_per_node,
    target_steps,
    is3d,
    memory,
    random_state,
    init=None,
):
    """Build a graph from ``W`` and run ForceAtlas2 on it.

    ``W`` is converted with ``construct_graph``; ``file_name`` is passed
    positionally and all remaining arguments are forwarded by keyword,
    unchanged, to ``fa2.forceatlas2``. Whatever that call returns is
    returned to the caller.

    TODO: Typing
    """
    layout_options = dict(
        graph=construct_graph(W),
        n_jobs=n_jobs,
        target_change_per_node=target_change_per_node,
        target_steps=target_steps,
        is3d=is3d,
        memory=memory,
        random_state=random_state,
        init=init,
    )
    return fa2.forceatlas2(file_name, **layout_options)
def calc_force_directed_layout(
    W,
    file_name,
    n_jobs,
    target_change_per_node,
    target_steps,
    is3d,
    memory,
    random_state,
    init=None,
):
    """Build a graph from ``W`` and run ForceAtlas2 on it.

    ``W`` is converted with ``construct_graph``; ``file_name`` is passed
    positionally and all remaining arguments are forwarded by keyword,
    unchanged, to ``forceatlas2.forceatlas2``, whose result is returned.

    ``forceatlas2`` is an optional dependency imported lazily here. If it
    cannot be imported, an error is logged and the process exits with
    status ``-1`` (``SystemExit``).

    TODO: Typing
    """
    # Resolve the optional dependency first so a missing package is reported
    # immediately, before any graph is built.
    try:
        import forceatlas2 as fa2
    except ModuleNotFoundError:
        import sys

        logger.error(
            "Need forceatlas2-python! Try 'pip install forceatlas2-python'.")
        sys.exit(-1)

    G = construct_graph(W)
    return fa2.forceatlas2(
        file_name,
        graph=G,
        n_jobs=n_jobs,
        target_change_per_node=target_change_per_node,
        target_steps=target_steps,
        is3d=is3d,
        memory=memory,
        random_state=random_state,
        init=init,
    )
def louvain(
    data: AnnData,
    rep: str = "pca",
    resolution: float = 1.3,
    random_state: int = 0,
    class_label: str = "louvain_labels",
) -> None:
    """Cluster the cells using Louvain algorithm.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The representation whose affinity matrix is clustered. Key ``'W_' + rep`` must exist in ``data.uns``.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters with smaller sizes.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"louvain_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels of cells as categorical data.

    Raises
    ------
    ValueError
        If ``'W_' + rep`` is not a key of ``data.uns``.

    Examples
    --------
    >>> pg.louvain(adata)
    """
    start = time.time()

    rep_key = "W_" + rep
    if rep_key not in data.uns:
        raise ValueError("Cannot find affinity matrix. Please run neighbors first!")
    W = data.uns[rep_key]

    G = construct_graph(W)
    partition_type = louvain_module.RBConfigurationVertexPartition
    partition = partition_type(G, resolution_parameter=resolution, weights="weight")
    optimiser = louvain_module.Optimiser()
    optimiser.set_rng_seed(random_state)
    # The return value of optimise_partition is not needed; the partition is
    # updated in place and read back through ``partition.membership``.
    optimiser.optimise_partition(partition)

    # Membership ids are shifted by one and stored as strings; categories are
    # ordered with natsorted so that "10" sorts after "2".
    labels = np.array([str(x + 1) for x in partition.membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels, categories=categories)

    end = time.time()
    logger.info("Louvain clustering is done. Time spent = {:.2f}s.".format(end - start))
def leiden(W, resolution, random_state=0):
    """Run Leiden clustering on the graph built from ``W``.

    The graph comes from ``construct_graph(W)`` and is partitioned with
    ``leidenalg.find_partition`` using ``RBConfigurationVertexPartition``,
    the ``"weight"`` edge attribute, ``resolution`` as the resolution
    parameter and ``random_state`` as the seed.

    Returns a ``pandas.Series`` of string labels, one per entry of the
    partition membership, each being the membership id plus one. The cluster
    count and elapsed time are logged at INFO level.
    """
    start = time.perf_counter()

    graph = construct_graph(W)
    quality_function = leidenalg.RBConfigurationVertexPartition
    found = leidenalg.find_partition(
        graph,
        quality_function,
        seed=random_state,
        weights="weight",
        resolution_parameter=resolution,
        n_iterations=-1,
    )
    labels = np.array([str(cluster_id + 1) for cluster_id in found.membership])

    end = time.perf_counter()

    n_clusters = len(set(labels))
    logger.info(
        f"Finished leiden clustering for res = {resolution}. Get {n_clusters} clusters. "
        f"Time spent = {end-start:.2f}s.")

    return pd.Series(labels)
def spectral_leiden(
    data: MultimodalData,
    rep: str = "pca",
    resolution: float = 1.3,
    rep_kmeans: str = "diffmap",
    n_clusters: int = 30,
    n_clusters2: int = 50,
    n_init: int = 10,
    n_jobs: int = -1,
    random_state: int = 0,
    class_label: str = "spectral_leiden_labels",
) -> None:
    """Cluster the data using Spectral Leiden algorithm. [Li20]_

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The representation whose affinity matrix is clustered. Key ``'W_' + rep`` must exist in ``data.obsp``.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    rep_kmeans: ``str``, optional, default: ``"diffmap"``
        The embedding representation on which the KMeans runs. Key ``'X_' + rep_kmeans`` must exist in ``data.obsm``. By default, use Diffusion Map coordinates. If it is not calculated, use PCA coordinates instead.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering. Default is set to be the same as scikit-learn Kmeans function.

    n_jobs : `int`, optional (default: -1)
        Number of threads to use for the KMeans step. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"spectral_leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels for cells as categorical data.

    Raises
    ------
    ValueError
        If neither ``'X_' + rep_kmeans`` nor the ``'X_pca'`` fallback is in ``data.obsm``,
        or if ``'W_' + rep`` is not in ``data.obsp``.
    SystemExit
        If ``leidenalg`` cannot be imported (exit status ``-1``).

    Examples
    --------
    >>> pg.spectral_leiden(data)
    """
    try:
        import leidenalg
    except ImportError:
        import sys

        logger.error("Need leidenalg! Try 'pip install leidenalg'.")
        sys.exit(-1)

    if f"X_{rep_kmeans}" not in data.obsm.keys():
        logger.warning(
            f"{rep_kmeans} is not calculated, switch to pca instead.")
        rep_kmeans = "pca"
    if f"X_{rep_kmeans}" not in data.obsm.keys():
        raise ValueError(f"Please run {rep_kmeans} first!")
    if f"W_{rep}" not in data.obsp:
        raise ValueError(
            "Cannot find affinity matrix. Please run neighbors first!")

    # KMeans labels seed the Leiden partition as its initial membership.
    labels = partition_cells_by_kmeans(
        data.obsm[f"X_{rep_kmeans}"],
        n_clusters,
        n_clusters2,
        n_init,
        n_jobs,
        random_state,
    )

    W = data.obsp[f"W_{rep}"]

    G = construct_graph(W)
    partition_type = leidenalg.RBConfigurationVertexPartition
    partition = partition_type(
        G,
        resolution_parameter=resolution,
        weights="weight",
        initial_membership=labels,
    )
    # Optimise the aggregated partition, then copy its result back onto the
    # original partition.
    partition_agg = partition.aggregate_partition()

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    # The return value is not needed; the result is read from the partition.
    optimiser.optimise_partition(partition_agg, -1)
    partition.from_coarse_partition(partition_agg)

    # Membership ids are shifted by one and stored as strings; categories are
    # ordered with natsorted so that "10" sorts after "2".
    labels = np.array([str(x + 1) for x in partition.membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels, categories=categories)
    data.register_attr(class_label, "cluster")

    # Separate name so the ``n_clusters`` parameter is not rebound.
    n_found = data.obs[class_label].cat.categories.size
    logger.info(
        f"Spectral Leiden clustering is done. Get {n_found} clusters.")
def leiden(
    data: MultimodalData,
    rep: str = "pca",
    resolution: float = 1.3,
    n_clust: int = None,
    n_iter: int = -1,
    random_state: int = 0,
    class_label: str = "leiden_labels",
) -> None:
    """Cluster the data using Leiden algorithm. [Traag19]_

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The representation whose affinity matrix is clustered. Nearest neighbors must be calculated so that affinity matrix ``'W_' + rep`` exists in ``data.obsp``.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters. Pass ``None`` to search for a resolution from ``n_clust`` instead.

    n_clust: ``int``, optional, default: ``None``
        This option only takes effect if 'resolution = None'. Try to find an appropriate resolution by binary search such that the total number of clusters matches 'n_clust'. The range of resolution to search is (0.01, 2.0].

    n_iter: ``int``, optional, default: ``-1``
        Number of iterations that Leiden algorithm runs. If ``-1``, run the algorithm until reaching its optimal clustering.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels of cells as categorical data.

    Update ``data.uns``:
        * ``data.uns["leiden_resolution"]``: The resolution that was used.

    Raises
    ------
    ValueError
        If ``'W_' + rep`` is not in ``data.obsp``, or if ``resolution`` is ``None``
        and ``n_clust`` is not an ``int``.
    SystemExit
        If ``leidenalg`` cannot be imported (exit status ``-1``).

    Examples
    --------
    >>> pg.leiden(data)
    """
    try:
        import leidenalg
    except ImportError:
        import sys

        logger.error("Need leidenalg! Try 'pip install leidenalg'.")
        sys.exit(-1)

    rep_key = "W_" + rep
    if rep_key not in data.obsp:
        raise ValueError(
            "Cannot find affinity matrix. Please run neighbors first!")
    W = data.obsp[rep_key]

    G = construct_graph(W)
    if resolution is not None:
        membership = _run_community_detection(
            "leiden", leidenalg, G, resolution, random_state, n_iter)
    else:
        # Explicit check instead of ``assert`` so the validation still runs
        # when Python is started with -O.
        if not isinstance(n_clust, int):
            raise ValueError(
                "'n_clust' must be an int when 'resolution' is None.")
        resolution, membership = _find_optimal_resolution(
            "leiden", leidenalg, n_clust, 2.0, G, random_state, n_iter)

    # NOTE(review): placed after the if/else so the resolution is recorded for
    # both branches; confirm against the original indentation.
    data.uns["leiden_resolution"] = resolution

    # Membership ids are shifted by one and stored as strings; categories are
    # ordered with natsorted so that "10" sorts after "2".
    labels = np.array([str(x + 1) for x in membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels, categories=categories)
    data.register_attr(class_label, "cluster")

    n_clusters = data.obs[class_label].cat.categories.size
    logger.info(f"Leiden clustering is done. Get {n_clusters} clusters.")
def leiden(
    data: AnnData,
    rep: str = "pca",
    resolution: float = 1.3,
    n_iter: int = -1,
    random_state: int = 0,
    class_label: str = "leiden_labels",
) -> None:
    """Cluster the data using Leiden algorithm.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The representation whose affinity matrix is clustered. Key ``'W_' + rep`` must exist in ``data.uns``.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    n_iter: ``int``, optional, default: ``-1``
        Number of iterations that Leiden algorithm runs. If ``-1``, run the algorithm until reaching its optimal clustering.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels of cells as categorical data.

    Raises
    ------
    ValueError
        If ``'W_' + rep`` is not a key of ``data.uns``.

    Examples
    --------
    >>> pg.leiden(adata)
    """
    start = time.time()

    rep_key = "W_" + rep
    if rep_key not in data.uns:
        raise ValueError("Cannot find affinity matrix. Please run neighbors first!")
    W = data.uns[rep_key]

    G = construct_graph(W)
    partition_type = leidenalg.RBConfigurationVertexPartition
    partition = leidenalg.find_partition(
        G,
        partition_type,
        seed=random_state,
        weights="weight",
        resolution_parameter=resolution,
        n_iterations=n_iter,
    )

    # Membership ids are shifted by one and stored as strings; categories are
    # ordered with natsorted so that "10" sorts after "2".
    labels = np.array([str(x + 1) for x in partition.membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels, categories=categories)

    end = time.time()
    logger.info("Leiden clustering is done. Time spent = {:.2f}s.".format(end - start))
def spectral_leiden(
    data: AnnData,
    rep: str = "pca",
    resolution: float = 1.3,
    rep_kmeans: str = "diffmap",
    n_clusters: int = 30,
    n_clusters2: int = 50,
    n_init: int = 10,
    n_jobs: int = -1,
    random_state: int = 0,
    class_label: str = "spectral_leiden_labels",
) -> None:
    """Cluster the data using Spectral Leiden algorithm.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        The representation whose affinity matrix is clustered. Key ``'W_' + rep`` must exist in ``data.uns``.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    rep_kmeans: ``str``, optional, default: ``"diffmap"``
        The embedding representation on which the KMeans runs. Key ``'X_' + rep_kmeans`` must exist in ``data.obsm``. By default, use Diffusion Map coordinates. If it is not calculated, use PCA coordinates instead.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering. Default is set to be the same as scikit-learn Kmeans function.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"spectral_leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels for cells as categorical data.

    Raises
    ------
    ValueError
        If neither ``'X_' + rep_kmeans`` nor the ``'X_pca'`` fallback is in ``data.obsm``,
        or if ``'W_' + rep`` is not in ``data.uns``.

    Examples
    --------
    >>> pg.spectral_leiden(adata)
    """
    start = time.time()

    # ``.keys()`` is kept on purpose: membership tests directly on ``obsm``
    # may not check keys on every AnnData version.
    if "X_" + rep_kmeans not in data.obsm.keys():
        logger.warning("{} is not calculated, switch to pca instead.".format(rep_kmeans))
        rep_kmeans = "pca"
    if "X_" + rep_kmeans not in data.obsm.keys():
        raise ValueError("Please run {} first!".format(rep_kmeans))
    if "W_" + rep not in data.uns:
        raise ValueError("Cannot find affinity matrix. Please run neighbors first!")

    # KMeans labels seed the Leiden partition as its initial membership.
    labels = partition_cells_by_kmeans(
        data, rep_kmeans, n_jobs, n_clusters, n_clusters2, n_init, random_state,
    )

    W = data.uns["W_" + rep]

    G = construct_graph(W)
    partition_type = leidenalg.RBConfigurationVertexPartition
    partition = partition_type(
        G, resolution_parameter=resolution, weights="weight", initial_membership=labels
    )
    # Optimise the aggregated partition, then copy its result back onto the
    # original partition.
    partition_agg = partition.aggregate_partition()

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    # The return value is not needed; the result is read from the partition.
    optimiser.optimise_partition(partition_agg, -1)
    partition.from_coarse_partition(partition_agg)

    # Membership ids are shifted by one and stored as strings; categories are
    # ordered with natsorted so that "10" sorts after "2".
    labels = np.array([str(x + 1) for x in partition.membership])
    categories = natsorted(np.unique(labels))
    data.obs[class_label] = pd.Categorical(values=labels, categories=categories)

    end = time.time()
    logger.info(
        "Spectral Leiden clustering is done. Time spent = {:.2f}s.".format(
            end - start
        )
    )