def centrality_scores(
    adata: AnnData,
    cluster_key: str,
    score: Optional[Union[str, Iterable[str]]] = None,
    connectivity_key: Optional[str] = None,
    copy: bool = False,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    show_progress_bar: bool = False,
) -> Optional[pd.DataFrame]:
    """
    Compute centrality scores per cluster or cell type.

    Inspired by usage in Gene Regulatory Networks (GRNs) in :cite:`celloracle`.

    Parameters
    ----------
    %(adata)s
    %(cluster_key)s
    score
        Centrality measures as described in :class:`networkx.algorithms.centrality` :cite:`networkx`.
        If `None`, use all the options below. Valid options are:

            - `{c.CLOSENESS.s!r}` - measure of how close the group is to other nodes.
            - `{c.CLUSTERING.s!r}` - measure of the degree to which nodes cluster together.
            - `{c.DEGREE.s!r}` - fraction of non-group members connected to group members.

    %(conn_key)s
    %(copy)s
    %(parallelize)s

    Returns
    -------
    If ``copy = True``, returns a :class:`pandas.DataFrame`.

    Otherwise, modifies the ``adata`` with the following key:

        - :attr:`anndata.AnnData.uns` ``['{{cluster_key}}_centrality_scores']`` - the centrality scores,
          as mentioned above.
    """
    connectivity_key = Key.obsp.spatial_conn(connectivity_key)
    _assert_categorical_obs(adata, cluster_key)
    _assert_connectivity_key(adata, connectivity_key)

    if isinstance(score, (str, Centrality)):
        centrality = [score]
    elif score is None:
        centrality = [c.s for c in Centrality]
    else:
        # ``score`` is an iterable of measures; without this branch, ``centrality`` would be undefined
        centrality = list(score)

    centralities = [Centrality(c) for c in centrality]

    graph = nx.from_scipy_sparse_matrix(adata.obsp[connectivity_key])

    cat = adata.obs[cluster_key].cat.categories.values
    clusters = adata.obs[cluster_key].values

    fun_dict = {}
    for c in centralities:
        if c == Centrality.CLOSENESS:
            fun_dict[c.s] = partial(nx.algorithms.centrality.group_closeness_centrality, graph)
        elif c == Centrality.DEGREE:
            fun_dict[c.s] = partial(nx.algorithms.centrality.group_degree_centrality, graph)
        elif c == Centrality.CLUSTERING:
            fun_dict[c.s] = partial(nx.algorithms.cluster.average_clustering, graph)
        else:
            raise NotImplementedError(f"Centrality `{c}` is not yet implemented.")

    n_jobs = _get_n_cores(n_jobs)
    start = logg.info(f"Calculating centralities `{centralities}` using `{n_jobs}` core(s)")

    res_list = []
    for k, v in fun_dict.items():
        df = parallelize(
            _centrality_scores_helper,
            collection=cat,
            extractor=pd.concat,
            n_jobs=n_jobs,
            backend=backend,
            show_progress_bar=show_progress_bar,
        )(clusters=clusters, fun=v, method=k)
        res_list.append(df)

    df = pd.concat(res_list, axis=1)

    if copy:
        return df
    _save_data(adata, attr="uns", key=Key.uns.centrality_scores(cluster_key), data=df, time=start)
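
# Minimal usage sketch for `centrality_scores` (hedged: it assumes the function is exposed as
# `squidpy.gr.centrality_scores` and that `squidpy.gr.spatial_neighbors` builds the spatial graph
# expected via ``connectivity_key``; the ``_example_*`` name below is ours, not part of squidpy).
def _example_centrality_scores() -> None:
    import numpy as np
    import pandas as pd
    import squidpy as sq
    from anndata import AnnData

    rng = np.random.default_rng(0)
    n_obs = 100
    adata = AnnData(
        X=rng.poisson(1.0, size=(n_obs, 10)).astype(np.float32),
        obs=pd.DataFrame(
            {"cluster": pd.Categorical(rng.choice(["A", "B", "C"], size=n_obs))},
            index=[f"cell_{i}" for i in range(n_obs)],
        ),
    )
    adata.obsm["spatial"] = rng.uniform(0, 100, size=(n_obs, 2))

    sq.gr.spatial_neighbors(adata)  # writes the spatial connectivity graph to `.obsp`
    sq.gr.centrality_scores(adata, cluster_key="cluster")
    print(adata.uns["cluster_centrality_scores"])  # per-category centrality measures
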
def _analysis(
    data: pd.DataFrame,
    interactions: np.ndarray,
    interaction_clusters: np.ndarray,
    threshold: float = 0.1,
    n_perms: int = 1000,
    seed: Optional[int] = None,
    n_jobs: int = 1,
    numba_parallel: Optional[bool] = None,
    **kwargs: Any,
) -> TempResult:
    """
    Run the analysis as described in :cite:`cellphonedb`.

    This function runs the mean, percent and shuffled analysis.

    Parameters
    ----------
    data
        Array of shape `(n_cells, n_genes)`.
    interactions
        Array of shape `(n_interactions, 2)`.
    interaction_clusters
        Array of shape `(n_interaction_clusters, 2)`.
    threshold
        Percentage threshold for removing lowly expressed genes in clusters.
    %(n_perms)s
    %(seed)s
    n_jobs
        Number of parallel jobs to launch.
    numba_parallel
        Whether to use :class:`numba.prange` or not. If `None`, it's determined automatically.
    kwargs
        Keyword arguments for :func:`squidpy._utils.parallelize`, such as ``n_jobs`` or ``backend``.

    Returns
    -------
    Tuple of the following format:

        - `'means'` - array of shape `(n_interactions, n_interaction_clusters)` containing the means.
        - `'pvalues'` - array of shape `(n_interactions, n_interaction_clusters)` containing the p-values.
    """

    def extractor(res: Sequence[TempResult]) -> TempResult:
        assert len(res) == n_jobs, f"Expected to find `{n_jobs}` results, found `{len(res)}`."

        meanss: List[np.ndarray] = [r.means for r in res if r.means is not None]
        assert len(meanss) == 1, f"Only `1` job should've calculated the means, but found `{len(meanss)}`."
        means = meanss[0]
        if TYPE_CHECKING:
            assert isinstance(means, np.ndarray)

        pvalues = np.sum([r.pvalues for r in res if r.pvalues is not None], axis=0) / float(n_perms)
        assert means.shape == pvalues.shape, f"Means and p-values differ in shape: `{means.shape}`, `{pvalues.shape}`."

        return TempResult(means=means, pvalues=pvalues)

    groups = data.groupby("clusters")
    clustering = np.array(data["clusters"].values, dtype=np.int32)

    mean = groups.mean().values.T  # (n_genes, n_clusters)
    mask = groups.apply(lambda c: ((c > 0).sum() / len(c)) >= threshold).values.T  # (n_genes, n_clusters)
    # (n_cells, n_genes)
    data = np.array(data[data.columns.difference(["clusters"])].values, dtype=np.float64, order="C")
    # all 3 should be C contiguous

    return parallelize(  # type: ignore[no-any-return]
        _analysis_helper,
        np.arange(n_perms, dtype=np.int32),
        n_jobs=n_jobs,
        unit="permutation",
        extractor=extractor,
        **kwargs,
    )(
        data,
        mean,
        mask,
        interactions,
        interaction_clusters=interaction_clusters,
        clustering=clustering,
        seed=seed,
        numba_parallel=numba_parallel,
    )
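
# Illustration (not squidpy code) of the permutation scheme that ``extractor`` aggregates above:
# each job counts how often the shuffled-label statistic is at least as large as the observed one,
# and the summed counts divided by ``n_perms`` give the p-values (``np.sum(...) / float(n_perms)``).
# A deliberately simplified, single-gene/single-cluster sketch of that idea:
def _example_permutation_pvalue(x: np.ndarray, labels: np.ndarray, n_perms: int = 1000, seed: int = 0) -> float:
    """Toy p-value: fraction of label shuffles whose cluster-0 mean is >= the observed cluster-0 mean."""
    rng = np.random.default_rng(seed)
    observed = x[labels == 0].mean()
    hits = 0
    for _ in range(n_perms):
        shuffled = rng.permutation(labels)
        hits += x[shuffled == 0].mean() >= observed  # one "count" per permutation
    return hits / n_perms  # same normalization as in `extractor` above
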
def nhood_enrichment(
    adata: AnnData,
    cluster_key: str,
    connectivity_key: Optional[str] = None,
    n_perms: int = 1000,
    numba_parallel: bool = False,
    seed: Optional[int] = None,
    copy: bool = False,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    show_progress_bar: bool = True,
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """
    Compute neighborhood enrichment by permutation test.

    Parameters
    ----------
    %(adata)s
    %(cluster_key)s
    %(conn_key)s
    %(n_perms)s
    %(numba_parallel)s
    %(seed)s
    %(copy)s
    %(parallelize)s

    Returns
    -------
    If ``copy = True``, returns a :class:`tuple` with the z-score and the enrichment count.

    Otherwise, modifies the ``adata`` with the following keys:

        - :attr:`anndata.AnnData.uns` ``['{cluster_key}_nhood_enrichment']['zscore']`` - the enrichment z-score.
        - :attr:`anndata.AnnData.uns` ``['{cluster_key}_nhood_enrichment']['count']`` - the enrichment count.
    """
    connectivity_key = Key.obsp.spatial_conn(connectivity_key)
    _assert_categorical_obs(adata, cluster_key)
    _assert_connectivity_key(adata, connectivity_key)
    _assert_positive(n_perms, name="n_perms")

    adj = adata.obsp[connectivity_key]
    original_clust = adata.obs[cluster_key]
    clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)}  # map categories
    int_clust = np.array([clust_map[c] for c in original_clust], dtype=ndt)

    indices, indptr = (adj.indices.astype(ndt), adj.indptr.astype(ndt))
    n_cls = len(clust_map)

    _test = _create_function(n_cls, parallel=numba_parallel)
    count = _test(indices, indptr, int_clust)

    n_jobs = _get_n_cores(n_jobs)
    start = logg.info(f"Calculating neighborhood enrichment using `{n_jobs}` core(s)")

    perms = parallelize(
        _nhood_enrichment_helper,
        collection=np.arange(n_perms),
        extractor=np.vstack,
        n_jobs=n_jobs,
        backend=backend,
        show_progress_bar=show_progress_bar,
    )(callback=_test, indices=indices, indptr=indptr, int_clust=int_clust, n_cls=n_cls, seed=seed)

    zscore = (count - perms.mean(axis=0)) / perms.std(axis=0)

    if copy:
        return zscore, count

    _save_data(
        adata,
        attr="uns",
        key=Key.uns.nhood_enrichment(cluster_key),
        data={"zscore": zscore, "count": count},
        time=start,
    )
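
# Minimal usage sketch for `nhood_enrichment` (hedged: assumes exposure as `squidpy.gr.nhood_enrichment`,
# that `squidpy.datasets.imc` is a bundled example loader with a categorical ``'cell type'`` annotation,
# and that `squidpy.gr.spatial_neighbors` builds the expected connectivity graph).
def _example_nhood_enrichment() -> None:
    import squidpy as sq

    adata = sq.datasets.imc()  # small imaging mass cytometry example (assumption)
    sq.gr.spatial_neighbors(adata)
    sq.gr.nhood_enrichment(adata, cluster_key="cell type", n_perms=100, seed=0)
    # z-scores and raw counts, stored under the key documented above
    print(adata.uns["cell type_nhood_enrichment"]["zscore"])
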
def moran(
    adata: AnnData,
    connectivity_key: str = Key.obsp.spatial_conn(),
    genes: Optional[Union[str, Sequence[str]]] = None,
    transformation: Literal["r", "B", "D", "U", "V"] = "r",
    n_perms: int = 1000,
    corr_method: Optional[str] = "fdr_bh",
    layer: Optional[str] = None,
    seed: Optional[int] = None,
    copy: bool = False,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    show_progress_bar: bool = True,
) -> Optional[pd.DataFrame]:
    """
    Calculate Moran’s I Global Autocorrelation Statistic.

    Parameters
    ----------
    %(adata)s
    %(conn_key)s
    genes
        List of gene names, as stored in :attr:`anndata.AnnData.var_names`, used to compute Moran's I
        statistics :cite:`pysal`.

        If `None`, it's computed for :attr:`anndata.AnnData.var` ``['highly_variable']``, if present.
        Otherwise, it's computed for all genes.
    transformation
        Transformation to be used, as reported in :class:`esda.Moran`. Default is `"r"`, row-standardized.
    %(n_perms)s
    %(corr_method)s
    layer
        Layer in :attr:`anndata.AnnData.layers` to use. If `None`, use :attr:`anndata.AnnData.X`.
    %(seed)s
    %(copy)s
    %(parallelize)s

    Returns
    -------
    If ``copy = True``, returns a :class:`pandas.DataFrame` with the following keys:

        - `'I'` - Moran's I statistic.
        - `'pval_sim'` - p-value based on permutations.
        - `'VI_sim'` - variance of `'I'` from permutations.
        - `'pval_sim_{{corr_method}}'` - the corrected p-values if ``corr_method != None``.

    Otherwise, modifies the ``adata`` with the following key:

        - :attr:`anndata.AnnData.uns` ``['moranI']`` - the above mentioned dataframe.
    """
    if esda is None or libpysal is None:
        raise ImportError("Please install `esda` and `libpysal` as `pip install esda libpysal`.")

    _assert_positive(n_perms, name="n_perms")
    _assert_connectivity_key(adata, connectivity_key)

    if genes is None:
        if "highly_variable" in adata.var.columns:
            genes = adata[:, adata.var.highly_variable.values].var_names.values
        else:
            genes = adata.var_names.values
    genes = _assert_non_empty_sequence(genes, name="genes")

    n_jobs = _get_n_cores(n_jobs)
    start = logg.info(f"Calculating for `{len(genes)}` genes using `{n_jobs}` core(s)")

    w = _set_weight_class(adata, key=connectivity_key)  # init weights
    df = parallelize(
        _moran_helper,
        collection=genes,
        extractor=pd.concat,
        use_ixs=True,
        n_jobs=n_jobs,
        backend=backend,
        show_progress_bar=show_progress_bar,
    )(adata=adata, weights=w, transformation=transformation, permutations=n_perms, layer=layer, seed=seed)

    if corr_method is not None:
        _, pvals_adj, _, _ = multipletests(df["pval_sim"].values, alpha=0.05, method=corr_method)
        df[f"pval_sim_{corr_method}"] = pvals_adj

    df.sort_values(by="I", ascending=False, inplace=True)

    if copy:
        logg.info("Finish", time=start)
        return df

    _save_data(adata, attr="uns", key="moranI", data=df, time=start)
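
# Minimal usage sketch for `moran` (hedged: assumes exposure as `squidpy.gr.moran`, that the optional
# `esda`/`libpysal` dependencies checked above are installed, and that `squidpy.datasets.visium_hne_adata`
# is a bundled example loader; any AnnData with a spatial graph works the same way).
def _example_moran() -> None:
    import squidpy as sq

    adata = sq.datasets.visium_hne_adata()
    sq.gr.spatial_neighbors(adata)
    sq.gr.moran(adata, genes=adata.var_names[:10].tolist(), n_perms=100, n_jobs=1)
    print(adata.uns["moranI"].head())  # sorted by Moran's I, highest first
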
    x, y = np.triu_indices_from(np.empty((n_splits, n_splits)))
    idx_splits = [(i, j) for i, j in zip(x, y)]

    n_jobs = _get_n_cores(n_jobs)
    start = logg.info(
        f"Calculating co-occurrence probabilities for `{len(interval)}` intervals "
        f"`{len(idx_splits)}` split combinations using `{n_jobs}` core(s)"
    )

    out_lst = parallelize(
        _co_occurrence_helper,
        collection=idx_splits,
        extractor=chain.from_iterable,
        n_jobs=n_jobs,
        backend=backend,
        show_progress_bar=show_progress_bar,
    )(
        spatial_splits=spatial_splits,
        labs_splits=labs_splits,
        labs_unique=labs_unique,
        interval=interval,
    )

    if len(idx_splits) == 1:
        out = list(out_lst)[0]
    else:
        out = sum(list(out_lst)) / len(idx_splits)

    if copy:
        logg.info("Finish", time=start)
        return out, interval
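
# The pair enumeration above uses the upper triangle (including the diagonal) so that each unordered
# pair of spatial splits is processed exactly once. A small standalone check of that enumeration
# (the ``_example_*`` name is ours, not part of squidpy):
def _example_split_pairs(n_splits: int = 3) -> None:
    import numpy as np

    x, y = np.triu_indices_from(np.empty((n_splits, n_splits)))
    idx_splits = [(i, j) for i, j in zip(x, y)]
    # for 3 splits: [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)] -> 6 combinations,
    # matching the "split combinations" count reported in the log message above
    print(idx_splits)
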
def spatial_autocorr(
    adata: AnnData,
    connectivity_key: str = Key.obsp.spatial_conn(),
    genes: Optional[Union[str, Sequence[str]]] = None,
    mode: Literal["moran", "geary"] = SpatialAutocorr.MORAN.s,  # type: ignore[assignment]
    transformation: bool = True,
    n_perms: Optional[int] = None,
    two_tailed: bool = False,
    corr_method: Optional[str] = "fdr_bh",
    layer: Optional[str] = None,
    seed: Optional[int] = None,
    use_raw: bool = False,
    copy: bool = False,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    show_progress_bar: bool = True,
) -> Optional[pd.DataFrame]:
    """
    Calculate Global Autocorrelation Statistic (Moran’s I or Geary's C).

    See :cite:`pysal` for reference.

    Parameters
    ----------
    %(adata)s
    %(conn_key)s
    genes
        List of gene names, as stored in :attr:`anndata.AnnData.var_names`, used to compute global
        spatial autocorrelation statistic.

        If `None`, it's computed for :attr:`anndata.AnnData.var` ``['highly_variable']``, if present.
        Otherwise, it's computed for all genes.
    mode
        Mode of score calculation:

            - `{sp.MORAN.s!r}` - `Moran's I autocorrelation <https://en.wikipedia.org/wiki/Moran%27s_I>`_.
            - `{sp.GEARY.s!r}` - `Geary's C autocorrelation <https://en.wikipedia.org/wiki/Geary%27s_C>`_.

    transformation
        If `True`, weights in :attr:`anndata.AnnData.obsp` ``['{key}']`` are row-normalized,
        advised for analytic p-value calculation.
    %(n_perms)s
        If `None`, only p-values under normality assumption are computed.
    two_tailed
        If `True`, p-values are two-tailed, otherwise they are one-tailed.
    %(corr_method)s
    layer
        Layer in :attr:`anndata.AnnData.layers` to use. If `None`, use :attr:`anndata.AnnData.X`.
    %(seed)s
    %(copy)s
    %(parallelize)s

    Returns
    -------
    If ``copy = True``, returns a :class:`pandas.DataFrame` with the following keys:

        - `'I' or 'C'` - Moran's I or Geary's C statistic.
        - `'pval_norm'` - p-value under normality assumption.
        - `'var_norm'` - variance of `'score'` under normality assumption.
        - `'{{p_val}}_{{corr_method}}'` - the corrected p-values if ``corr_method != None``.

    If ``n_perms != None``, additionally returns the following columns:

        - `'pval_z_sim'` - p-value based on standard normal approximation from permutations.
        - `'pval_sim'` - p-value based on permutations.
        - `'var_sim'` - variance of `'score'` from permutations.

    Otherwise, modifies the ``adata`` with the following key:

        - :attr:`anndata.AnnData.uns` ``['moranI']`` - the above mentioned dataframe, if ``mode = {sp.MORAN.s!r}``.
        - :attr:`anndata.AnnData.uns` ``['gearyC']`` - the above mentioned dataframe, if ``mode = {sp.GEARY.s!r}``.
""" _assert_connectivity_key(adata, connectivity_key) if genes is None: if "highly_variable" in adata.var.columns: genes = adata[:, adata.var.highly_variable.values].var_names.values else: genes = adata.var_names.values genes = _assert_non_empty_sequence(genes, name="genes") mode = SpatialAutocorr(mode) # type: ignore[assignment] if TYPE_CHECKING: assert isinstance(mode, SpatialAutocorr) params = { "mode": mode.s, "transformation": transformation, "two_tailed": two_tailed } if mode == SpatialAutocorr.MORAN: params["func"] = _morans_i params["stat"] = "I" params["expected"] = -1.0 / (adata.shape[0] - 1) # expected score params["ascending"] = False elif mode == SpatialAutocorr.GEARY: params["func"] = _gearys_c params["stat"] = "C" params["expected"] = 1.0 params["ascending"] = True else: raise NotImplementedError(f"Mode `{mode}` is not yet implemented.") n_jobs = _get_n_cores(n_jobs) vals = _get_obs_rep(adata[:, genes], use_raw=use_raw, layer=layer).T g = adata.obsp[connectivity_key].copy() # row-normalize if transformation: normalize(g, norm="l1", axis=1, copy=False) score = params["func"](g, vals) start = logg.info( f"Calculating {mode}'s statistic for `{n_perms}` permutations using `{n_jobs}` core(s)" ) if n_perms is not None: _assert_positive(n_perms, name="n_perms") perms = np.arange(n_perms) score_perms = parallelize( _score_helper, collection=perms, extractor=np.concatenate, use_ixs=True, n_jobs=n_jobs, backend=backend, show_progress_bar=show_progress_bar, )(mode=mode, g=g, vals=vals, seed=seed) else: score_perms = None with np.errstate(divide="ignore"): pval_results = _p_value_calc(score, score_perms, g, params) results = {params["stat"]: score} results.update(pval_results) df = pd.DataFrame(results, index=genes) if corr_method is not None: for pv in filter(lambda x: "pval" in x, df.columns): _, pvals_adj, _, _ = multipletests(df[pv].values, alpha=0.05, method=corr_method) df[f"{pv}_{corr_method}"] = pvals_adj df.sort_values(by=params["stat"], ascending=params["ascending"], inplace=True) if copy: logg.info("Finish", time=start) return df _save_data(adata, attr="uns", key=params["mode"] + params["stat"], data=df, time=start)
def segment(
    img: ImageContainer,
    layer: Optional[str] = None,
    method: Union[str, Callable[..., np.ndarray]] = "watershed",
    channel: int = 0,
    size: Optional[Union[int, Tuple[int, int]]] = None,
    layer_added: Optional[str] = None,
    copy: bool = False,
    show_progress_bar: bool = True,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    **kwargs: Any,
) -> Optional[ImageContainer]:
    """
    Segment an image.

    If ``size`` is defined, iterate over crops of that size and segment those. Recommended for large images.

    Parameters
    ----------
    %(img_container)s
    %(img_layer)s
    %(seg_blob.parameters)s
        - `{m.WATERSHED.s!r}` - :func:`skimage.segmentation.watershed`.

    %(custom_fn)s
    channel
        Channel index to use for segmentation.
    %(size)s
    %(layer_added)s
        If `None`, use ``'segmented_{{model}}'``.
    thresh
        Threshold for creation of masked image. The areas to segment should be contained in this mask.
        If `None`, it is determined by `Otsu's method <https://en.wikipedia.org/wiki/Otsu%27s_method>`_.
        Only used if ``method = {m.WATERSHED.s!r}``.
    geq
        Treat ``thresh`` as upper or lower bound for defining areas to segment. If ``geq = True``, mask is
        defined as ``mask = arr >= thresh``, meaning high values in ``arr`` denote areas to segment.
    invert
        Whether to segment an inverted array. Only used if ``method`` is one of :mod:`skimage` blob methods.
    %(copy_cont)s
    %(segment_kwargs)s
    %(parallelize)s
    kwargs
        Keyword arguments for ``method``.

    Returns
    -------
    If ``copy = True``, returns a new container with the segmented image in ``'{{layer_added}}'``.

    Otherwise, modifies the ``img`` with the following key:

        - :class:`squidpy.im.ImageContainer` ``['{{layer_added}}']`` - the segmented image.
    """
    layer = img._get_layer(layer)
    channel_dim = img[layer].dims[-1]

    kind = SegmentationBackend.CUSTOM if callable(method) else SegmentationBackend(method)
    layer_new = Key.img.segment(kind, layer_added=layer_added)

    if kind in (SegmentationBackend.LOG, SegmentationBackend.DOG, SegmentationBackend.DOH):
        segmentation_model: SegmentationModel = SegmentationBlob(model=kind)
    elif kind == SegmentationBackend.WATERSHED:
        segmentation_model = SegmentationWatershed()
    elif kind == SegmentationBackend.CUSTOM:
        if TYPE_CHECKING:
            assert callable(method)
        segmentation_model = SegmentationCustom(func=method)
    else:
        raise NotImplementedError(f"Model `{kind}` is not yet implemented.")

    n_jobs = _get_n_cores(n_jobs)
    crops: List[ImageContainer] = list(img.generate_equal_crops(size=size, as_array=False))
    start = logg.info(f"Segmenting `{len(crops)}` crops using `{segmentation_model}` and `{n_jobs}` core(s)")

    crops: List[ImageContainer] = parallelize(  # type: ignore[no-redef]
        _segment,
        collection=crops,
        unit="crop",
        extractor=lambda res: list(chain.from_iterable(res)),
        n_jobs=n_jobs,
        backend=backend,
        show_progress_bar=show_progress_bar and len(crops) > 1,
    )(model=segmentation_model, layer=layer, layer_new=layer_new, channel=channel, **kwargs)

    if isinstance(segmentation_model, SegmentationWatershed):
        # By convention, segments are numbered from 1..number of segments within each crop.
        # Next, we have to account for that before merging the crops so that segments are not confused.
        # TODO use overlapping crops to not create confusion at boundaries
        counter = 0
        for crop in crops:
            data = crop[layer_new].data
            data[data > 0] += counter
            counter += np.max(crop[layer_new].data)

    res: ImageContainer = ImageContainer.uncrop(crops, shape=img.shape)
    res._data = res.data.rename({channel_dim: f"{channel_dim}:{channel}"})

    logg.info("Finish", time=start)

    if copy:
        return res

    img.add_img(res, layer=layer_new, copy=False, channel_dim=res[layer_new].dims[-1])
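
# Minimal usage sketch for `segment` (hedged: assumes exposure as `squidpy.im.segment` and
# `squidpy.im.ImageContainer`; the synthetic image only demonstrates the call pattern, and the
# ``_example_*`` name is ours, not part of squidpy).
def _example_segment() -> None:
    import numpy as np
    import squidpy as sq

    rng = np.random.default_rng(0)
    img = sq.im.ImageContainer(rng.uniform(size=(256, 256, 3)).astype(np.float32), layer="image")

    # watershed segmentation of channel 0, processed in 128x128 crops and stitched back together
    sq.im.segment(img, layer="image", method="watershed", channel=0, size=128)
    print(img["segmented_watershed"].shape)  # default layer name per ``layer_added`` docs above
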
def calculate_image_features(
    adata: AnnData,
    img: ImageContainer,
    layer: Optional[str] = None,
    features: Union[str, Sequence[str]] = ImageFeature.SUMMARY.s,
    features_kwargs: Mapping[str, Mapping[str, Any]] = MappingProxyType({}),
    key_added: str = "img_features",
    copy: bool = False,
    n_jobs: Optional[int] = None,
    backend: str = "loky",
    show_progress_bar: bool = True,
    **kwargs: Any,
) -> Optional[pd.DataFrame]:
    """
    Calculate image features for all observations in ``adata``.

    Parameters
    ----------
    %(adata)s
    %(img_container)s
    %(img_layer)s
    features
        Features to be calculated. Valid options are:

            - `{f.TEXTURE.s!r}` - summary stats based on repeating patterns
              :meth:`squidpy.im.ImageContainer.features_texture`.
            - `{f.SUMMARY.s!r}` - summary stats of each image channel
              :meth:`squidpy.im.ImageContainer.features_summary`.
            - `{f.COLOR_HIST.s!r}` - counts in bins of image channel's histogram
              :meth:`squidpy.im.ImageContainer.features_histogram`.
            - `{f.SEGMENTATION.s!r}` - stats of a cell segmentation mask
              :meth:`squidpy.im.ImageContainer.features_segmentation`.
            - `{f.CUSTOM.s!r}` - extract features using a custom function
              :meth:`squidpy.im.ImageContainer.features_custom`.

    features_kwargs
        Keyword arguments for the different features that should be generated, such as
        ``{{ {f.TEXTURE.s!r}: {{ ... }}, ... }}``.
    key_added
        Key in :attr:`anndata.AnnData.obsm` where to store the calculated features.
    %(copy)s
    %(parallelize)s
    kwargs
        Keyword arguments for :meth:`squidpy.im.ImageContainer.generate_spot_crops`.

    Returns
    -------
    If ``copy = True``, returns a :class:`pandas.DataFrame` where columns correspond to the calculated features.

    Otherwise, modifies the ``adata`` object with the following key:

        - :attr:`anndata.AnnData.obsm` ``['{{key_added}}']`` - the above mentioned dataframe.

    Raises
    ------
    ValueError
        If a feature is not known.
    """
    layer = img._get_layer(layer)
    if isinstance(features, (str, ImageFeature)):
        features = [features]
    features = sorted({ImageFeature(f).s for f in features})

    n_jobs = _get_n_cores(n_jobs)
    start = logg.info(f"Calculating features `{list(features)}` using `{n_jobs}` core(s)")

    res = parallelize(
        _calculate_image_features_helper,
        collection=adata.obs_names,
        extractor=pd.concat,
        n_jobs=n_jobs,
        backend=backend,
        show_progress_bar=show_progress_bar,
    )(adata, img, layer=layer, features=features, features_kwargs=features_kwargs, **kwargs)

    if copy:
        logg.info("Finish", time=start)
        return res

    _save_data(adata, attr="obsm", key=key_added, data=res, time=start)
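
# Minimal usage sketch for `calculate_image_features` (hedged: assumes exposure as
# `squidpy.im.calculate_image_features` and that the paired `squidpy.datasets.visium_hne_adata` /
# `squidpy.datasets.visium_hne_image` example loaders are available; both download data on first use).
def _example_calculate_image_features() -> None:
    import squidpy as sq

    adata = sq.datasets.visium_hne_adata()
    img = sq.datasets.visium_hne_image()

    sq.im.calculate_image_features(adata, img, features=["summary", "histogram"], n_jobs=1)
    print(adata.obsm["img_features"].head())  # one row per observation, one column per feature
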