def score_genes_cell_cycle(adata, s_genes=None, g2m_genes=None, copy=False, **kwargs):
    """\
    Score cell cycle genes.

    Calculates scores and assigns a cell cycle phase (G1, S, G2M) using the list
    of cell cycle genes defined in Tirosh et al., 2016
    (https://doi.org/10.1126/science.aad0501).

    Parameters
    ----------
    adata
        The annotated data matrix.
    s_genes
        List of genes associated with S phase.
    g2m_genes
        List of genes associated with G2M phase.
    copy
        Copy `adata` or modify it inplace.
    **kwargs
        Are passed to :func:`~scanpy.tl.score_genes`. `ctrl_size` is not
        possible, as it's set as `min(len(s_genes), len(g2m_genes))`.

    Returns
    -------
    S_score: `adata.obs`, dtype `object`
        The score for S phase for each cell.
    G2M_score: `adata.obs`, dtype `object`
        The score for G2M phase for each cell.
    phase: `adata.obs`, dtype `object`
        The cell cycle phase (`S`, `G2M` or `G1`) for each cell.
    """
    logg.info("calculating cell cycle phase")

    from scanpy.tools._score_genes import score_genes

    adata = adata.copy() if copy else adata

    s_genes_, g2m_genes_ = get_phase_marker_genes(adata)
    if s_genes is None:
        s_genes = s_genes_
    if g2m_genes is None:
        g2m_genes = g2m_genes_

    ctrl_size = min(len(s_genes), len(g2m_genes))
    kwargs.update({"ctrl_size": ctrl_size})

    score_genes(adata, gene_list=s_genes, score_name="S_score", **kwargs)
    score_genes(adata, gene_list=g2m_genes, score_name="G2M_score", **kwargs)

    scores = adata.obs[["S_score", "G2M_score"]]

    phase = pd.Series("S", index=scores.index)  # default phase is S
    phase[scores.G2M_score > scores.S_score] = "G2M"  # G2M, if G2M is higher than S
    phase[np.all(scores < 0, axis=1)] = "G1"  # G1, if all scores are negative

    adata.obs["phase"] = phase
    logg.hint("    'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)")
    return adata if copy else None
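
# Usage sketch (illustrative): score cell cycle phases on a preprocessed dataset.
# Assumes `adata` holds log-normalized expression; `scv` is the usual scvelo alias.
#
#   import scvelo as scv
#
#   adata = scv.datasets.pancreas()
#   scv.tl.score_genes_cell_cycle(adata)
#   adata.obs[["S_score", "G2M_score", "phase"]].head()
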
def velocity_confidence_transition(data, vkey="velocity", scale=10, copy=False):
    """Computes confidences of velocity transitions.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `float` (default: 10)
        Scale parameter of gaussian kernel.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    velocity_confidence_transition: `.obs`
        Confidence of transition for each cell
    """
    adata = data.copy() if copy else data
    if vkey not in adata.layers.keys():
        raise ValueError("You need to run `tl.velocity` first.")

    X = np.array(adata.layers["Ms"])
    V = np.array(adata.layers[vkey])

    tmp_filter = np.invert(np.isnan(np.sum(V, axis=0)))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    if "spearmans_score" in adata.var.keys():
        tmp_filter &= adata.var["spearmans_score"].values > 0.1

    V = V[:, tmp_filter]
    X = X[:, tmp_filter]

    T = transition_matrix(adata, vkey=vkey, scale=scale)
    dX = T.dot(X) - X
    dX -= dX.mean(1)[:, None]
    V -= V.mean(1)[:, None]

    norms = l2_norm(dX, axis=1) * l2_norm(V, axis=1)
    norms += norms == 0  # avoid division by zero for zero-norm cells

    adata.obs[f"{vkey}_confidence_transition"] = prod_sum(dX, V, axis=1) / norms

    logg.hint(f"added '{vkey}_confidence_transition' (adata.obs)")
    return adata if copy else None
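
# Usage sketch (illustrative): transition confidences require velocities and a
# velocity graph; `adata` is assumed to carry moments ("Ms") from `pp.moments`.
#
#   scv.tl.velocity(adata)
#   scv.tl.velocity_graph(adata)
#   scv.tl.velocity_confidence_transition(adata)
#   adata.obs["velocity_confidence_transition"].describe()
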
def velocity_clusters(
    data,
    vkey="velocity",
    match_with="clusters",
    sort_by="velocity_pseudotime",
    resolution=None,
    min_likelihood=None,
    copy=False,
):
    """Computes velocity clusters via louvain on velocities.

    .. code:: python

        scv.tl.velocity_clusters(adata)
        scv.pl.scatter(adata, color='velocity_clusters')

    .. image:: https://user-images.githubusercontent.com/31883718/69625627-484dc480-1047-11ea-847f-6607a3430427.png
       :width: 600px

    Arguments
    ---------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    match_with: `str` (default: `'clusters'`)
        Match the names of the velocity clusters with the names of this key (.obs).
    sort_by: `str` or `None` (default: `'velocity_pseudotime'`)
        Sort velocity clusters by this key (.obs).
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity. If `None`, 0.7 is used.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    velocity_clusters : `.obs`
        Clusters obtained from applying louvain modularity on velocity expression.
    """  # noqa E501
    adata = data.copy() if copy else data

    logg.info("computing velocity clusters", r=True)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(adata.layers["unspliced"]) else n_counts
        min_counts = min(50, np.percentile(n_counts, 50))
        tmp_filter &= np.ravel(n_counts > min_counts)

    if "r2" in adata.var.keys():
        r2 = adata.var.velocity_r2
        min_r2 = np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = np.percentile(dispersions, 20)
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys() and min_likelihood is not None:
        tmp_filter &= adata.var["fit_likelihood"] > min_likelihood

    from anndata import AnnData

    vdata = AnnData(adata.layers[vkey][:, tmp_filter])
    vdata.obs = adata.obs.copy()
    vdata.var = adata.var[tmp_filter].copy()

    if "highly_variable" in vdata.var.keys():
        vdata.var["highly_variable"] = np.array(
            vdata.var["highly_variable"], dtype=bool
        )

    import scanpy as sc

    logg.switch_verbosity("off", module="scanpy")
    sc.pp.pca(vdata, n_comps=20, svd_solver="arpack")
    sc.pp.neighbors(vdata, n_pcs=20)
    sc.tl.louvain(vdata, resolution=0.7 if resolution is None else resolution)
    logg.switch_verbosity("on", module="scanpy")

    if sort_by == "velocity_pseudotime" and sort_by not in adata.obs.keys():
        velocity_pseudotime(adata, vkey=vkey)
    if sort_by in vdata.obs.keys():
        vc = vdata.obs["louvain"]
        vc_cats = vc.cat.categories
        mean_times = [np.mean(vdata.obs[sort_by][vc == cat]) for cat in vc_cats]
        vdata.obs["louvain"].cat.reorder_categories(
            vc_cats[np.argsort(mean_times)], inplace=True
        )

    if isinstance(match_with, str) and match_with in adata.obs.keys():
        from .utils import most_common_in_list

        vc = vdata.obs["louvain"]
        cats_nums = {cat: 0 for cat in adata.obs[match_with].cat.categories}
        for cat in vc.cat.categories:
            cells_in_cat = np.where(vc == cat)[0]
            new_cat = most_common_in_list(adata.obs[match_with][cells_in_cat])
            cats_nums[new_cat] += 1
            vc = vc.cat.rename_categories({cat: f"{new_cat} ({cats_nums[new_cat]})"})
        vdata.obs["louvain"] = vc
    else:
        vdata.obs["louvain"].cat.categories = np.arange(
            len(vdata.obs["louvain"].cat.categories)
        )

    adata.obs[f"{vkey}_clusters"] = vdata.obs["louvain"].copy()

    del vdata

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_clusters', "
        "clusters based on louvain modularity on velocity vector field (adata.obs)"
    )

    return adata if copy else None
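
# Parameter sketch (illustrative), complementing the docstring example: match the
# cluster names with an existing annotation and control the louvain resolution.
#
#   scv.tl.velocity_clusters(adata, match_with="clusters", resolution=0.9)
#   adata.obs["velocity_clusters"].value_counts()
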
def rank_velocity_genes(
    data,
    vkey="velocity",
    n_genes=100,
    groupby=None,
    match_with=None,
    resolution=None,
    min_counts=None,
    min_r2=None,
    min_corr=None,
    min_dispersion=None,
    min_likelihood=None,
    copy=False,
):
    """Rank genes for velocity characterizing groups.

    This applies a differential expression test (Welch t-test with overestimated
    variance to be conservative) on velocity expression, to find genes in a cluster
    that show dynamics that is transcriptionally regulated differently compared to
    all other clusters (e.g. induction in that cluster and homeostasis in remaining
    population). If no clusters are given, it priorly computes velocity clusters by
    applying louvain modularity on velocity expression.

    .. code:: python

        scv.tl.rank_velocity_genes(adata, groupby='clusters')
        scv.pl.scatter(
            adata, basis=adata.uns['rank_velocity_genes']['names']['Beta'][:3]
        )
        pd.DataFrame(adata.uns['rank_velocity_genes']['names']).head()

    .. image:: https://user-images.githubusercontent.com/31883718/69626017-11c47980-1048-11ea-89f4-df3769df5ad5.png
       :width: 600px

    .. image:: https://user-images.githubusercontent.com/31883718/69626572-30774000-1049-11ea-871f-e8a30c42f10e.png
       :width: 600px

    Arguments
    ---------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    n_genes : `int`, optional (default: 100)
        The number of genes that appear in the returned tables.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    match_with: `str` or `None` (default: `None`)
        adata.obs key to separatively rank velocities on.
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity.
    min_counts: `float` (default: None)
        Minimum count of genes for consideration.
    min_r2: `float` (default: None)
        Minimum r2 value of genes for consideration.
    min_corr: `float` (default: None)
        Minimum Spearmans correlation coefficient between spliced and unspliced.
    min_dispersion: `float` (default: None)
        Minimum dispersion norm value of genes for consideration.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    rank_velocity_genes : `.uns`
        Structured array to be indexed by group id storing the gene names.
        Ordered according to scores.
    velocity_score : `.var`
        Storing the score for each gene for each group. Ordered according to scores.
    """  # noqa E501
    adata = data.copy() if copy else data

    if groupby is None or groupby == "velocity_clusters":
        velocity_clusters(
            adata,
            vkey=vkey,
            match_with=match_with,
            resolution=resolution,
            min_likelihood=min_likelihood,
        )
        groupby = f"{vkey}_clusters"

    logg.info("ranking velocity genes", r=True)

    if "spearmans_score" not in adata.var.keys():
        corr = vcorrcoef(
            np.array(adata.layers["Ms"]).T,
            np.array(adata.layers["Mu"].T),
            mode="spearmans",
        )
        adata.var["spearmans_score"] = np.clip(corr, 0, None)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(adata.layers["unspliced"]) else n_counts
        min_counts = (
            min(50, np.percentile(n_counts, 50)) if min_counts is None else min_counts
        )
        tmp_filter &= np.ravel(n_counts > min_counts)

    if f"{vkey}_r2" in adata.var.keys():
        r2 = adata.var[f"{vkey}_r2"]
        min_r2 = 0.1 if min_r2 is None else min_r2  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "spearmans_score" in adata.var.keys():
        corr = adata.var["spearmans_score"]
        min_corr = 0.1 if min_corr is None else min_corr  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= corr > min_corr

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = 0 if min_dispersion is None else min_dispersion
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys():
        fit_likelihood = adata.var["fit_likelihood"]
        min_likelihood = 0.1 if min_likelihood is None else min_likelihood
        tmp_filter &= fit_likelihood > min_likelihood

    X = adata[:, tmp_filter].layers[vkey]
    groups, groups_masks = select_groups(adata, key=groupby)

    n_groups = groups_masks.shape[0]
    sizes = groups_masks.sum(1)

    mean, var = np.zeros((n_groups, X.shape[1])), np.zeros((n_groups, X.shape[1]))
    for i, mask in enumerate(groups_masks):
        mean[i], var[i] = get_mean_var(X[mask])

    # test each against the union of all other groups
    rankings_gene_names, rankings_gene_scores = [], []
    for i in range(n_groups):
        mask_rest = ~groups_masks[i]
        mean_rest, var_rest = get_mean_var(X[mask_rest])
        size_rest = sizes[i]  # else mask_rest.sum() if method == 't-test'

        scores = (mean[i] - mean_rest) / np.sqrt(
            var[i] / sizes[i] + var_rest / size_rest
        )
        scores = np.nan_to_num(scores)

        # equivalent to but much faster than np.argsort(scores)[-10:]
        if n_genes > X.shape[1]:
            n_genes = X.shape[1]
        idx = np.argpartition(scores, -n_genes)[-n_genes:]
        idx = idx[np.argsort(scores[idx])[::-1]]

        rankings_gene_names.append(adata[:, tmp_filter].var_names[idx].values)
        rankings_gene_scores.append(scores[idx])

    rankings_gene_names = np.array([list(n) for n in rankings_gene_names])
    rankings_gene_scores = np.array([list(n) for n in rankings_gene_scores])

    all_names = rankings_gene_names.T.flatten()
    all_scores = rankings_gene_scores.T.flatten()
    vscore = np.zeros(adata.n_vars, dtype=int)
    for i, name in enumerate(adata.var_names):
        if name in all_names:
            vscore[i] = all_scores[np.where(name == all_names)[0][0]]
    adata.var["velocity_score"] = vscore

    key = "rank_velocity_genes"
    if key not in adata.uns.keys():
        adata.uns[key] = {}

    adata.uns[key] = {
        "names": np.rec.fromarrays(
            [n for n in rankings_gene_names],
            dtype=[(f"{rn}", "U50") for rn in groups],
        ),
        "scores": np.rec.fromarrays(
            [n.round(2) for n in rankings_gene_scores],
            dtype=[(f"{rn}", "float32") for rn in groups],
        ),
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "t-test_overestim_var",
            "use_raw": True,
        },
    }

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{key}', sorted scores by group ids (adata.uns) \n"
        "    'spearmans_score', spearmans correlation scores (adata.var)"
    )

    return adata if copy else None
def velocity_graph(
    data,
    vkey="velocity",
    xkey="Ms",
    tkey=None,
    basis=None,
    n_neighbors=None,
    n_recurse_neighbors=None,
    random_neighbors_at_max=None,
    sqrt_transform=None,
    variance_stabilization=None,
    gene_subset=None,
    compute_uncertainties=None,
    approx=None,
    mode_neighbors="distances",
    copy=False,
    n_jobs=None,
    backend="loky",
):
    """Computes velocity graph based on cosine similarities.

    The cosine similarities are computed between velocities and potential cell state
    transitions, i.e. it measures how well a corresponding change in gene expression
    :math:`\\delta_{ij} = x_j - x_i` matches the predicted change according to the
    velocity vector :math:`\\nu_i`,

    .. math::
        \\pi_{ij} = \\cos\\angle(\\delta_{ij}, \\nu_i)
        = \\frac{\\delta_{ij}^T \\nu_i}{\\left\\lVert\\delta_{ij}\\right\\rVert
        \\left\\lVert \\nu_i \\right\\rVert}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    xkey: `str` (default: `'Ms'`)
        Layer key to extract count data from.
    tkey: `str` (default: `None`)
        Observation key to extract time data from.
    basis: `str` (default: `None`)
        Basis / Embedding to use.
    n_neighbors: `int` or `None` (default: None)
        Use fixed number of neighbors or do recursive neighbor search (if `None`).
    n_recurse_neighbors: `int` (default: `None`)
        Number of recursions for neighbors search. Defaults to 2 if mode_neighbors
        is 'distances', and 1 if mode_neighbors is 'connectivities'.
    random_neighbors_at_max: `int` or `None` (default: `None`)
        If number of iterative neighbors for an individual cell is higher than this
        threshold, a random selection of such are chosen as reference neighbors.
    sqrt_transform: `bool` (default: `False`)
        Whether to variance-transform the cell states changes and velocities before
        computing cosine similarities.
    variance_stabilization: `bool` (default: `None`)
        Alias for `sqrt_transform`; used if `sqrt_transform` is `None`.
    gene_subset: `list` of `str`, subset of adata.var_names or `None` (default: `None`)
        Subset of genes to compute velocity graph on exclusively.
    compute_uncertainties: `bool` (default: `None`)
        Whether to compute uncertainties along with cosine correlation.
    approx: `bool` or `None` (default: `None`)
        If True, first 30 pc's are used instead of the full count matrix.
    mode_neighbors: `str` (default: `'distances'`)
        Determines the type of KNN graph used. Options are 'distances' or
        'connectivities'. The latter yields a symmetric graph.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.
    n_jobs: `int` or `None` (default: `None`)
        Number of parallel jobs.
    backend: `str` (default: "loky")
        Backend used for multiprocessing. See :class:`joblib.Parallel` for valid
        options.

    Returns
    -------
    velocity_graph: `.uns`
        sparse matrix with correlations of cell state transitions with velocities
    """
    adata = data.copy() if copy else data
    verify_neighbors(adata)
    if vkey not in adata.layers.keys():
        velocity(adata, vkey=vkey)
    if sqrt_transform is None:
        sqrt_transform = variance_stabilization

    vgraph = VelocityGraph(
        adata,
        vkey=vkey,
        xkey=xkey,
        tkey=tkey,
        basis=basis,
        n_neighbors=n_neighbors,
        approx=approx,
        n_recurse_neighbors=n_recurse_neighbors,
        random_neighbors_at_max=random_neighbors_at_max,
        sqrt_transform=sqrt_transform,
        gene_subset=gene_subset,
        compute_uncertainties=compute_uncertainties,
        report=True,
        mode_neighbors=mode_neighbors,
    )

    if isinstance(basis, str):
        logg.warn(
            f"The velocity graph is computed on {basis} embedding coordinates.\n"
            f"        Consider computing the graph in an unbiased manner \n"
            f"        on full expression space by not specifying basis.\n"
        )

    n_jobs = get_n_jobs(n_jobs=n_jobs)
    logg.info(
        f"computing velocity graph (using {n_jobs}/{os.cpu_count()} cores)", r=True
    )
    vgraph.compute_cosines(n_jobs=n_jobs, backend=backend)

    adata.uns[f"{vkey}_graph"] = vgraph.graph
    adata.uns[f"{vkey}_graph_neg"] = vgraph.graph_neg
    if vgraph.uncertainties is not None:
        adata.uns[f"{vkey}_graph_uncertainties"] = vgraph.uncertainties

    adata.obs[f"{vkey}_self_transition"] = vgraph.self_prob

    if f"{vkey}_params" in adata.uns.keys():
        if "embeddings" in adata.uns[f"{vkey}_params"]:
            del adata.uns[f"{vkey}_params"]["embeddings"]
    else:
        adata.uns[f"{vkey}_params"] = {}
    adata.uns[f"{vkey}_params"]["mode_neighbors"] = mode_neighbors
    adata.uns[f"{vkey}_params"]["n_recurse_neighbors"] = vgraph.n_recurse_neighbors

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_graph', sparse matrix with cosine correlations (adata.uns)"
    )

    return adata if copy else None
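
# Usage sketch (illustrative): compute the graph on full expression space after
# velocity estimation; `n_jobs` parallelizes the cosine computation.
#
#   scv.tl.velocity(adata)
#   scv.tl.velocity_graph(adata, n_jobs=8)
#   adata.uns["velocity_graph"]  # sparse (n_obs x n_obs) cosine correlations
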
def velocity_confidence(data, vkey="velocity", copy=False):
    """Computes confidences of velocities.

    .. code:: python

        scv.tl.velocity_confidence(adata)
        scv.pl.scatter(adata, color='velocity_confidence', perc=[2, 98])

    .. image:: https://user-images.githubusercontent.com/31883718/69626334-b6df5200-1048-11ea-9171-495845c5bc7a.png
       :width: 600px

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    velocity_length: `.obs`
        Length of the velocity vectors for each individual cell
    velocity_confidence: `.obs`
        Confidence for each cell
    """  # noqa E501
    adata = data.copy() if copy else data
    if vkey not in adata.layers.keys():
        raise ValueError("You need to run `tl.velocity` first.")

    V = np.array(adata.layers[vkey])

    tmp_filter = np.invert(np.isnan(np.sum(V, axis=0)))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    if "spearmans_score" in adata.var.keys():
        tmp_filter &= adata.var["spearmans_score"].values > 0.1

    V = V[:, tmp_filter]
    V -= V.mean(1)[:, None]
    V_norm = l2_norm(V, axis=1)
    R = np.zeros(adata.n_obs)

    indices = get_indices(dist=get_neighs(adata, "distances"))[0]
    for i in range(adata.n_obs):
        Vi_neighs = V[indices[i]]
        Vi_neighs -= Vi_neighs.mean(1)[:, None]
        R[i] = np.mean(
            np.einsum("ij, j", Vi_neighs, V[i])
            / (l2_norm(Vi_neighs, axis=1) * V_norm[i])[None, :]
        )

    adata.obs[f"{vkey}_length"] = V_norm.round(2)
    adata.obs[f"{vkey}_confidence"] = np.clip(R, 0, None)

    logg.hint(f"added '{vkey}_length' (adata.obs)")
    logg.hint(f"added '{vkey}_confidence' (adata.obs)")

    if f"{vkey}_confidence_transition" not in adata.obs.keys():
        velocity_confidence_transition(adata, vkey)

    return adata if copy else None
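
# Usage sketch (illustrative), complementing the docstring example: both length and
# confidence are written to `.obs` and can be summarized per cluster (the "clusters"
# key is assumed to exist).
#
#   scv.tl.velocity_confidence(adata)
#   keys = ["velocity_length", "velocity_confidence"]
#   adata.obs.groupby("clusters")[keys].mean()
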
def velocity_embedding(
    data,
    basis=None,
    vkey="velocity",
    scale=10,
    self_transitions=True,
    use_negative_cosines=True,
    direct_pca_projection=None,
    retain_scale=False,
    autoscale=True,
    all_comps=True,
    T=None,
    copy=False,
):
    """Projects the single cell velocities into any embedding.

    Given normalized difference of the embedding positions
    :math:`\\tilde \\delta_{ij} = \\frac{x_j-x_i}{\\left\\lVert x_j-x_i \\right\\rVert}`,
    the projections are obtained as expected displacements with respect to the
    transition matrix :math:`\\tilde \\pi_{ij}` as

    .. math::
        \\tilde \\nu_i = E_{\\tilde \\pi_{i\\cdot}} [\\tilde \\delta_{i \\cdot}]
        = \\sum_{j \\neq i} \\left( \\tilde \\pi_{ij} - \\frac1n \\right)
        \\tilde \\delta_{ij}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    basis: `str` (default: `None`)
        Which embedding to use. If `None`, it is determined from the embeddings
        found in `adata.obsm`.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `int` (default: 10)
        Scale parameter of gaussian kernel for transition matrix.
    self_transitions: `bool` (default: `True`)
        Whether to allow self transitions, based on the confidences of transitioning
        to neighboring cells.
    use_negative_cosines: `bool` (default: `True`)
        Whether to project cell-to-cell transitions with negative cosines into
        negative/opposite direction.
    direct_pca_projection: `bool` (default: `None`)
        Whether to directly project the velocities into PCA space, thus skipping
        the velocity graph.
    retain_scale: `bool` (default: `False`)
        Whether to retain scale from high dimensional space in embedding.
    autoscale: `bool` (default: `True`)
        Whether to scale the embedded velocities by a scalar multiplier, which
        simply ensures that the arrows in the embedding are properly scaled.
    all_comps: `bool` (default: `True`)
        Whether to compute the velocities on all embedding components.
    T: `csr_matrix` (default: `None`)
        Allows the user to directly pass a transition matrix.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity_umap: `.obsm`
        coordinates of velocity projection on embedding (e.g., basis='umap')
    """
    adata = data.copy() if copy else data

    if basis is None:
        keys = [
            key for key in ["pca", "tsne", "umap"] if f"X_{key}" in adata.obsm.keys()
        ]
        if len(keys) > 0:
            basis = "pca" if direct_pca_projection else keys[-1]
        else:
            raise ValueError("No basis specified")

    if f"X_{basis}" not in adata.obsm_keys():
        raise ValueError("You need to compute the embedding first.")

    if direct_pca_projection and "pca" in basis:
        logg.warn(
            "Directly projecting velocities into PCA space is for exploratory analysis "
            "on principal components.\n"
            "         It does not reflect the actual velocity field from high "
            "dimensional gene expression space.\n"
            "         To visualize velocities, consider applying "
            "`direct_pca_projection=False`.\n"
        )

    logg.info("computing velocity embedding", r=True)

    V = np.array(adata.layers[vkey])
    vgenes = np.ones(adata.n_vars, dtype=bool)
    if f"{vkey}_genes" in adata.var.keys():
        vgenes &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    vgenes &= ~np.isnan(V.sum(0))
    V = V[:, vgenes]

    if direct_pca_projection and "pca" in basis:
        PCs = adata.varm["PCs"] if all_comps else adata.varm["PCs"][:, :2]
        PCs = PCs[vgenes]

        X_emb = adata.obsm[f"X_{basis}"]
        V_emb = (V - V.mean(0)).dot(PCs)
    else:
        X_emb = (
            adata.obsm[f"X_{basis}"] if all_comps else adata.obsm[f"X_{basis}"][:, :2]
        )
        V_emb = np.zeros(X_emb.shape)

        T = (
            transition_matrix(
                adata,
                vkey=vkey,
                scale=scale,
                self_transitions=self_transitions,
                use_negative_cosines=use_negative_cosines,
            )
            if T is None
            else T
        )
        T.setdiag(0)
        T.eliminate_zeros()

        densify = adata.n_obs < 1e4
        TA = T.A if densify else None

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for i in range(adata.n_obs):
                indices = T[i].indices
                dX = X_emb[indices] - X_emb[i, None]  # shape (n_neighbors, 2)
                if not retain_scale:
                    dX /= l2_norm(dX)[:, None]
                dX[np.isnan(dX)] = 0  # zero diff in a steady-state
                probs = TA[i, indices] if densify else T[i].data
                V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0)

    if retain_scale:
        X = (
            adata.layers["Ms"]
            if "Ms" in adata.layers.keys()
            else adata.layers["spliced"]
        )
        delta = T.dot(X[:, vgenes]) - X[:, vgenes]
        if issparse(delta):
            delta = delta.A
        cos_proj = (V * delta).sum(1) / l2_norm(delta)
        V_emb *= np.clip(cos_proj[:, None] * 10, 0, 1)

    if autoscale:
        V_emb /= 3 * quiver_autoscale(X_emb, V_emb)

    if f"{vkey}_params" in adata.uns.keys():
        adata.uns[f"{vkey}_params"]["embeddings"] = (
            []
            if "embeddings" not in adata.uns[f"{vkey}_params"]
            else list(adata.uns[f"{vkey}_params"]["embeddings"])
        )
        adata.uns[f"{vkey}_params"]["embeddings"].extend([basis])

    vkey += f"_{basis}"
    adata.obsm[vkey] = V_emb

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added\n" f"    '{vkey}', embedded velocity vectors (adata.obsm)")

    return adata if copy else None
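
# Usage sketch (illustrative): project velocities onto a precomputed UMAP; the
# result lands in `adata.obsm['velocity_umap']`.
#
#   scv.tl.velocity_graph(adata)
#   scv.tl.velocity_embedding(adata, basis="umap")
#   adata.obsm["velocity_umap"][:5]
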
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and
    yield the same result as scanpy [Wolf18]_. Connectivities are computed with
    adaptive kernel width as proposed in Haghverdi et al. 2016
    (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise 'X_pca' is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'hnsw', 'sklearn'}}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data points.
        Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """
    adata = adata.copy() if copy else adata

    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

    n_duplicate_cells = len(get_duplicate_cells(adata))
    if n_duplicate_cells > 0:
        logg.warn(
            f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
            "Consider removing these via pp.remove_duplicate_cells.",
        )

    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )
    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )
    else:
        logg.switch_verbosity("off", module="scanpy")
        with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(
                n_neighbors=n_neighbors,
                knn=knn,
                n_pcs=n_pcs,
                method=method,
                use_rep=use_rep,
                random_state=random_state,
                metric=metric,
                metric_kwds=metric_kwds,
                write_knn_indices=True,
            )
        logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None
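
# Usage sketch (illustrative): the hnsw backend is fastest but needs `hnswlib`;
# all methods write the same `.obsp` matrices.
#
#   scv.pp.neighbors(adata, n_neighbors=30, method="umap")
#   adata.obsp["distances"], adata.obsp["connectivities"]
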
def moments(
    data,
    n_neighbors=30,
    n_pcs=None,
    mode="connectivities",
    method="umap",
    use_rep=None,
    use_highly_variable=True,
    copy=False,
):
    """Computes moments for velocity estimation.

    First-/second-order moments are computed for each cell across its nearest
    neighbors, where the neighbor graph is obtained from euclidean distances in
    PCA space.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    mode: `'connectivities'` or `'distances'`  (default: `'connectivities'`)
        Type of neighbor graph weights to use for moment computation.
    method : {{'umap', 'hnsw', 'sklearn', `None`}}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        Connectivities are computed with adaptive kernel width as proposed in
        Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise 'X_pca' is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """
    adata = data.copy() if copy else data

    layers = [layer for layer in {"spliced", "unspliced"} if layer in adata.layers]
    if any([not_yet_normalized(adata.layers[layer]) for layer in layers]):
        normalize_per_cell(adata)

    if n_neighbors is not None and n_neighbors > get_n_neighs(adata):
        neighbors(
            adata,
            n_neighbors=n_neighbors,
            use_rep=use_rep,
            use_highly_variable=use_highly_variable,
            n_pcs=n_pcs,
            method=method,
        )
    verify_neighbors(adata)

    if "spliced" not in adata.layers.keys() or "unspliced" not in adata.layers.keys():
        logg.warn("Skipping moments, because un/spliced counts were not found.")
    else:
        logg.info(f"computing moments based on {mode}", r=True)
        connectivities = get_connectivities(
            adata, mode, n_neighbors=n_neighbors, recurse_neighbors=False
        )

        adata.layers["Ms"] = (
            csr_matrix.dot(connectivities, csr_matrix(adata.layers["spliced"]))
            .astype(np.float32)
            .A
        )
        adata.layers["Mu"] = (
            csr_matrix.dot(connectivities, csr_matrix(adata.layers["unspliced"]))
            .astype(np.float32)
            .A
        )
        # if renormalize: normalize_per_cell(adata, layers={'Ms', 'Mu'}, enforce=True)

        logg.info(
            "    finished", time=True, end=" " if settings.verbosity > 2 else "\n"
        )
        logg.hint(
            "added \n"
            "    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)"
        )
    return adata if copy else None
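
# Usage sketch (illustrative): moments are the standard preprocessing step before
# `tl.velocity`; normalization and neighbors are run internally if missing.
#
#   scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
#   adata.layers["Ms"].shape, adata.layers["Mu"].shape
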
def paga(
    adata,
    groups=None,
    vkey="velocity",
    use_time_prior=True,
    root_key=None,
    end_key=None,
    threshold_root_end_prior=None,
    minimum_spanning_tree=True,
    copy=False,
):
    """PAGA graph with velocity-directed edges.

    Mapping out the coarse-grained connectivity structures of complex manifolds
    [Wolf19]_. By quantifying the connectivity of partitions (groups, clusters) of
    the single-cell graph, partition-based graph abstraction (PAGA) generates a
    much simpler abstracted graph (*PAGA graph*) of partitions, in which edge
    weights represent confidence in the presence of connections.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An annotated data matrix.
    groups : key for categorical in `adata.obs`, optional (default: 'louvain')
        You can pass your predefined groups by choosing any categorical annotation
        of observations (`adata.obs`).
    vkey: `str` (default: `'velocity'`)
        Key for velocity estimates used to direct the edges.
    use_time_prior : `str` or bool, optional (default: True)
        Obs key for pseudo-time values.
        If True, 'velocity_pseudotime' is used if available.
    root_key : `str` or bool, optional (default: None)
        Obs key for root states.
    end_key : `str` or bool, optional (default: None)
        Obs key for end states.
    threshold_root_end_prior : `float` (default: 0.9)
        Threshold for root and final states priors, to be in the range of [0,1].
        Values above the threshold will be considered as terminal and included as
        prior.
    minimum_spanning_tree : bool, optional (default: True)
        Whether to prune the tree such that a path from A-to-B is removed if
        another more confident path exists.
    copy : `bool`, optional (default: `False`)
        Copy `adata` before computation and return a copy.
        Otherwise, perform computation inplace and return `None`.

    Returns
    -------
    connectivities: `.uns`
        The full adjacency matrix of the abstracted graph, weights correspond to
        confidence in the connectivities of partitions.
    connectivities_tree: `.uns`
        The adjacency matrix of the tree-like subgraph that best explains the
        topology.
    transitions_confidence: `.uns`
        The adjacency matrix of the abstracted directed graph, weights correspond
        to confidence in the transitions between partitions.
    """
    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )

    adata = adata.copy() if copy else adata
    strings_to_categoricals(adata)

    if groups is None:
        groups = (
            "clusters"
            if "clusters" in adata.obs.keys()
            else "louvain"
            if "louvain" in adata.obs.keys()
            else None
        )
    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys():
        velocity_clusters(adata)

    if use_time_prior and not isinstance(use_time_prior, str):
        use_time_prior = "velocity_pseudotime"
        if use_time_prior not in adata.obs.keys():
            velocity_pseudotime(adata, vkey=vkey, root_key=root_key, end_key=end_key)

    priors = [p for p in [use_time_prior, root_key, end_key] if p in adata.obs.keys()]
    logg.info(
        "running PAGA",
        f"using priors: {priors}" if len(priors) > 0 else "",
        r=True,
    )

    paga = PAGA_tree(
        adata,
        groups,
        vkey=vkey,
        use_time_prior=use_time_prior,
        root_key=root_key,
        end_key=end_key,
        threshold_root_end_prior=threshold_root_end_prior,
        minimum_spanning_tree=minimum_spanning_tree,
    )

    if "paga" not in adata.uns:
        adata.uns["paga"] = {}

    paga.compute_connectivities()
    adata.uns["paga"]["connectivities"] = paga.connectivities
    adata.uns["paga"]["connectivities_tree"] = paga.connectivities_tree
    adata.uns[f"{groups}_sizes"] = np.array(paga.ns)

    paga.compute_transitions()
    adata.uns["paga"]["transitions_confidence"] = paga.transitions_confidence
    adata.uns["paga"]["threshold"] = paga.threshold
    adata.uns["paga"]["groups"] = groups

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'paga/connectivities', connectivities adjacency (adata.uns)\n"
        "    'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
        "    'paga/transitions_confidence', velocity transitions (adata.uns)"
    )

    return adata if copy else None
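
# Usage sketch (illustrative): velocity-directed PAGA on existing clusters,
# visualized with the matching plotting function.
#
#   scv.tl.paga(adata, groups="clusters")
#   scv.pl.paga(adata, basis="umap")
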
def velocity(
    data,
    vkey="velocity",
    mode="stochastic",
    fit_offset=False,
    fit_offset2=False,
    filter_genes=False,
    groups=None,
    groupby=None,
    groups_for_fit=None,
    constrain_ratio=None,
    use_raw=False,
    use_latent_time=None,
    perc=[5, 95],
    min_r2=1e-2,
    min_likelihood=1e-3,
    r2_adjusted=None,
    use_highly_variable=True,
    diff_kinetics=None,
    copy=False,
    **kwargs,
):
    """Estimates velocities in a gene-specific manner.

    The steady-state model [Manno18]_ determines velocities by quantifying how
    observations deviate from a presumed steady-state equilibrium ratio of
    unspliced to spliced mRNA levels. This steady-state ratio is obtained by
    performing a linear regression restricting the input data to the extreme
    quantiles. By including second-order moments, the stochastic model [Bergen19]_
    exploits not only the balance of unspliced to spliced mRNA levels but also
    their covariation. By contrast, the likelihood-based dynamical model
    [Bergen19]_ solves the full splicing kinetics and generalizes RNA velocity
    estimation to transient systems. It is also capable of capturing non-observed
    steady states.

    .. image:: https://user-images.githubusercontent.com/31883718/69636491-ff057100-1056-11ea-90b7-d04098112ce1.png

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities
        for `velocity_graph` and `velocity_embedding`.
    mode: `'deterministic'`, `'stochastic'` or `'dynamical'` (default: `'stochastic'`)
        Whether to run the estimation using the steady-state/deterministic,
        stochastic or dynamical model of transcriptional dynamics.
        The dynamical model requires to run `tl.recover_dynamics` first.
    fit_offset: `bool` (default: `False`)
        Whether to fit with offset for first order moment dynamics.
    fit_offset2: `bool`, (default: `False`)
        Whether to fit with offset for second order moment dynamics.
    filter_genes: `bool` (default: `False`)
        Whether to remove genes that are not used for further velocity analysis.
    groups: `str`, `list` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which velocity analysis shall be restricted.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    groups_for_fit: `str`, `list` or `np.ndarray` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which steady-state fitting shall be restricted.
    constrain_ratio: `float` or tuple of type `float` or None: (default: `None`)
        Bounds for the steady-state ratio.
    use_raw: `bool` (default: `False`)
        Whether to use raw data for estimation.
    use_latent_time: `bool` or `None` (default: `None`)
        Whether to use latent time as a regularization for velocity estimation.
    perc: `float` (default: `[5, 95]`)
        Percentile, e.g. 98, for extreme quantile fit.
    min_r2: `float` (default: 0.01)
        Minimum threshold for coefficient of determination.
    min_likelihood: `float` (default: 0.001)
        Minimal likelihood for velocity genes to fit the model on.
    r2_adjusted: `bool` (default: `None`)
        Whether to compute coefficient of determination on full data fit (adjusted)
        or extreme quantile fit (None).
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    diff_kinetics: `str` or `bool` (default: `None`)
        Whether to zero out velocities in clusters with differential kinetics,
        stored in .var['fit_diff_kinetics'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity: `.layers`
        velocity vectors for each individual cell
    velocity_genes, velocity_beta, velocity_gamma, velocity_r2: `.var`
        parameters
    """  # noqa E501
    adata = data.copy() if copy else data
    if not use_raw and "Ms" not in adata.layers.keys():
        moments(adata)

    logg.info("computing velocities", r=True)

    strings_to_categoricals(adata)

    if mode is None or (mode == "dynamical" and "fit_alpha" not in adata.var.keys()):
        mode = "stochastic"
        logg.warn(
            "Falling back to stochastic model. "
            "For the dynamical model run tl.recover_dynamics first."
        )

    if mode in {"dynamical", "dynamical_residuals"}:
        from .dynamical_model_utils import get_divergence, get_reads, get_vars

        gene_subset = ~np.isnan(adata.var["fit_alpha"].values)
        vdata = adata[:, gene_subset]
        alpha, beta, gamma, scaling, t_ = get_vars(vdata)

        connect = not adata.uns["recover_dynamics"]["use_raw"]
        kwargs_ = {
            "kernel_width": None,
            "normalized": True,
            "var_scale": True,
            "reg_par": None,
            "min_confidence": 1e-2,
            "constraint_time_increments": False,
            "fit_steady_states": True,
            "fit_basal_transcription": None,
            "use_connectivities": connect,
            "time_connectivities": connect,
            "use_latent_time": use_latent_time,
        }
        kwargs_.update(adata.uns["recover_dynamics"])
        kwargs_.update(**kwargs)

        if "residuals" in mode:
            u, s = get_reads(vdata, use_raw=adata.uns["recover_dynamics"]["use_raw"])
            if kwargs_["fit_basal_transcription"]:
                u, s = u - adata.var["fit_u0"], s - adata.var["fit_s0"]
            o = vdata.layers["fit_t"] < t_
            vt = u * beta - s * gamma  # ds/dt
            wt = (alpha * o - beta * u) * scaling  # du/dt
        else:
            vt, wt = get_divergence(vdata, mode="velocity", **kwargs_)

        vgenes = adata.var.fit_likelihood > min_likelihood
        if min_r2 is not None:
            if "fit_r2" not in adata.var.keys():
                velo = Velocity(
                    adata,
                    groups_for_fit=groups_for_fit,
                    groupby=groupby,
                    constrain_ratio=constrain_ratio,
                    min_r2=min_r2,
                    use_highly_variable=use_highly_variable,
                    use_raw=use_raw,
                )
                velo.compute_deterministic(fit_offset=fit_offset, perc=perc)
                adata.var["fit_r2"] = velo._r2
            vgenes &= adata.var.fit_r2 > min_r2

        lb, ub = np.nanpercentile(adata.var.fit_scaling, [10, 90])
        vgenes = (
            vgenes
            & (adata.var.fit_scaling > np.min([lb, 0.03]))
            & (adata.var.fit_scaling < np.max([ub, 3]))
        )

        adata.var[f"{vkey}_genes"] = vgenes

        adata.layers[vkey] = np.ones(adata.shape) * np.nan
        adata.layers[vkey][:, gene_subset] = vt

        adata.layers[f"{vkey}_u"] = np.ones(adata.shape) * np.nan
        adata.layers[f"{vkey}_u"][:, gene_subset] = wt

        if filter_genes and len(set(vgenes)) > 1:
            adata._inplace_subset_var(vgenes)

    elif mode in {"steady_state", "deterministic", "stochastic"}:
        categories = (
            adata.obs[groupby].cat.categories
            if groupby is not None and groups is None and groups_for_fit is None
            else [None]
        )

        for cat in categories:
            groups = cat if cat is not None else groups

            cell_subset = groups_to_bool(adata, groups, groupby)
            _adata = adata if groups is None else adata[cell_subset]
            velo = Velocity(
                _adata,
                groups_for_fit=groups_for_fit,
                groupby=groupby,
                constrain_ratio=constrain_ratio,
                min_r2=min_r2,
                r2_adjusted=r2_adjusted,
                use_highly_variable=use_highly_variable,
                use_raw=use_raw,
            )
            velo.compute_deterministic(fit_offset=fit_offset, perc=perc)

            if mode == "stochastic":
                if filter_genes and len(set(velo._velocity_genes)) > 1:
                    adata._inplace_subset_var(velo._velocity_genes)
                    residual = velo._residual[:, velo._velocity_genes]
                    _adata = adata if groups is None else adata[cell_subset]
                    velo = Velocity(
                        _adata,
                        residual=residual,
                        groups_for_fit=groups_for_fit,
                        groupby=groupby,
                        constrain_ratio=constrain_ratio,
                        use_highly_variable=use_highly_variable,
                    )
                velo.compute_stochastic(fit_offset, fit_offset2, mode, perc=perc)

            write_residuals(adata, vkey, velo._residual, cell_subset)
            write_residuals(adata, f"variance_{vkey}", velo._residual2, cell_subset)
            write_pars(adata, vkey, velo.get_pars(), velo.get_pars_names(), add_key=cat)

            if filter_genes and len(set(velo._velocity_genes)) > 1:
                adata._inplace_subset_var(velo._velocity_genes)

    else:
        raise ValueError(
            "Mode can only be one of these: deterministic, stochastic or dynamical."
        )

    if f"{vkey}_genes" in adata.var.keys() and np.sum(adata.var[f"{vkey}_genes"]) < 10:
        logg.warn(
            "Too few genes are selected as velocity genes. "
            "Consider setting a lower threshold for min_r2 or min_likelihood."
        )

    if diff_kinetics:
        if not isinstance(diff_kinetics, str):
            diff_kinetics = "fit_diff_kinetics"
        if diff_kinetics in adata.var.keys():
            if diff_kinetics in adata.uns["recover_dynamics"]:
                groupby = adata.uns["recover_dynamics"]["fit_diff_kinetics"]
            else:
                groupby = "clusters"
            clusters = adata.obs[groupby]
            for i, v in enumerate(np.array(adata.var[diff_kinetics].values, dtype=str)):
                if len(v) > 0 and v != "nan":
                    idx = 1 - clusters.isin([a.strip() for a in v.split(",")])
                    adata.layers[vkey][:, i] *= idx
                    if mode == "dynamical":
                        adata.layers[f"{vkey}_u"][:, i] *= idx

    adata.uns[f"{vkey}_params"] = {"mode": mode, "fit_offset": fit_offset, "perc": perc}

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}', velocity vectors for each individual cell (adata.layers)"
    )

    return adata if copy else None
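
# Usage sketch (illustrative): the stochastic model is the default; the dynamical
# model additionally requires `tl.recover_dynamics` to have been run.
#
#   scv.tl.velocity(adata, mode="stochastic")
#
#   scv.tl.recover_dynamics(adata)
#   scv.tl.velocity(adata, mode="dynamical")
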
def cell_origin(
    data,
    groupby="clusters",
    disconnected_groups=None,
    self_transitions=False,
    n_neighbors=None,
    copy=False,
):
    """Computes individual cell root points.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    groupby: `str` (default: `'clusters'`)
        Key to which to assign the fates.
    disconnected_groups: list of `str` (default: `None`)
        Which groups to treat as disconnected for fate assignment.
    n_neighbors: `int` (default: `None`)
        Number of neighbors to restrict transitions to.
    self_transitions: `bool` (default: `False`)
        Whether to include self-transitions.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    cell_origin: `.obs`
        most likely cell origin for each individual cell
    cell_origin_confidence: `.obs`
        confidence of coming from assigned origin
    """
    adata = data.copy() if copy else data
    logg.info("computing cell origins", r=True)

    n_neighbors = 10 if n_neighbors is None else n_neighbors
    _adata = adata.copy()
    vgraph = VelocityGraph(
        _adata, n_neighbors=n_neighbors, approx=True, n_recurse_neighbors=1
    )
    vgraph.compute_cosines()
    _adata.uns["velocity_graph"] = vgraph.graph
    _adata.uns["velocity_graph_neg"] = vgraph.graph_neg

    T = transition_matrix(_adata, self_transitions=self_transitions, backward=True)
    fate = np.linalg.inv(np.eye(_adata.n_obs) - T)
    if issparse(T):
        fate = fate.A
    cell_fates = np.array(_adata.obs[groupby][fate.argmax(1)])
    if disconnected_groups is not None:
        idx = _adata.obs[groupby].isin(disconnected_groups)
        cell_fates[idx] = _adata.obs[groupby][idx]

    adata.obs["cell_origin"] = cell_fates
    adata.obs["cell_origin_confidence"] = fate.max(1) / fate.sum(1)
    strings_to_categoricals(adata)

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'cell_origin', most likely cell origin (adata.obs)\n"
        "    'cell_origin_confidence', confidence of assigned origin (adata.obs)"
    )
    return adata if copy else None
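
# Usage sketch (illustrative): origins are assigned per cell from the backward
# transition matrix; requires velocities to be present.
#
#   scv.tl.velocity(adata)
#   scv.tl.cell_origin(adata, groupby="clusters")
#   adata.obs[["cell_origin", "cell_origin_confidence"]].head()
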
def terminal_states(
    data,
    vkey="velocity",
    modality="Ms",
    groupby=None,
    groups=None,
    self_transitions=False,
    eps=1e-3,
    random_state=0,
    copy=False,
    **kwargs,
):
    """Computes terminal states (root and end points).

    The end points and root cells are obtained as stationary states of the
    velocity-inferred transition matrix and its transpose, respectively, which is
    given by the left eigenvectors corresponding to an eigenvalue of 1, i.e.

    .. math::
        \\mu^{\\textrm{end}} = \\mu^{\\textrm{end}} \\pi, \\quad
        \\mu^{\\textrm{root}} = \\mu^{\\textrm{root}} \\pi^{\\small \\textrm{T}}.

    .. code:: python

        scv.tl.terminal_states(adata)
        scv.pl.scatter(adata, color=['root_cells', 'end_points'])

    .. image:: https://user-images.githubusercontent.com/31883718/69496183-bcfdf300-0ecf-11ea-9aae-685300a0b1ba.png

    Alternatively, we recommend to use :func:`cellrank.tl.terminal_states`
    providing an improved/generalized approach of identifying terminal states.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    modality: `str` (default: `'Ms'`)
        Layer used to calculate terminal states.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider. Only to be set, if each group is
        assumed to have a distinct lineage with an independent root and end point.
    groups: `str`, `list` or `np.ndarray` (default: `None`)
        Groups selected to find terminal states on. Must be an element of
        .obs[groupby]. To be specified only for very distinct/disconnected clusters.
    self_transitions: `bool` (default: `False`)
        Allow transitions from one node to itself.
    eps: `float` (default: 1e-3)
        Tolerance for eigenvalue selection.
    random_state: `int` or None (default: 0)
        Seed used by the random number generator.
        If `None`, use the `RandomState` instance by `np.random`.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.
    **kwargs:
        Passed to scvelo.tl.transition_matrix(), e.g. basis, weight_diffusion.

    Returns
    -------
    root_cells: `.obs`
        root cells of Markov diffusion process
    end_points: `.obs`
        end points of Markov diffusion process
    """  # noqa E501
    adata = data.copy() if copy else data
    verify_neighbors(adata)

    logg.info("computing terminal states", r=True)

    strings_to_categoricals(adata)
    if groupby is not None:
        logg.warn(
            "Only set groupby, when you have evident distinct clusters/lineages,"
            " each with an own root and end point."
        )

    kwargs.update({"self_transitions": self_transitions})
    categories = [None]
    if groupby is not None and groups is None:
        categories = adata.obs[groupby].cat.categories
    for cat in categories:
        groups = cat if cat is not None else groups
        cell_subset = groups_to_bool(adata, groups=groups, groupby=groupby)
        _adata = adata if groups is None else adata[cell_subset]
        connectivities = get_connectivities(_adata, "distances")

        T = transition_matrix(_adata, vkey=vkey, backward=True, **kwargs)
        eigvecs_roots = eigs(T, eps=eps, perc=[2, 98], random_state=random_state)[1]
        roots = csr_matrix.dot(connectivities, eigvecs_roots).sum(1)
        roots = scale(np.clip(roots, 0, np.percentile(roots, 98)))
        roots = verify_roots(_adata, roots, modality)
        write_to_obs(adata, "root_cells", roots, cell_subset)

        T = transition_matrix(_adata, vkey=vkey, backward=False, **kwargs)
        eigvecs_ends = eigs(T, eps=eps, perc=[2, 98], random_state=random_state)[1]
        ends = csr_matrix.dot(connectivities, eigvecs_ends).sum(1)
        ends = scale(np.clip(ends, 0, np.percentile(ends, 98)))
        write_to_obs(adata, "end_points", ends, cell_subset)

        n_roots, n_ends = eigvecs_roots.shape[1], eigvecs_ends.shape[1]
        groups_str = f" ({groups})" if isinstance(groups, str) else ""
        roots_str = f"{n_roots} {'regions' if n_roots > 1 else 'region'}"
        ends_str = f"{n_ends} {'regions' if n_ends > 1 else 'region'}"
        logg.info(
            f"    identified {roots_str} of root cells "
            f"and {ends_str} of end points {groups_str}."
        )

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'root_cells', root cells of Markov diffusion process (adata.obs)\n"
        "    'end_points', end points of Markov diffusion process (adata.obs)"
    )
    return adata if copy else None