def fit(self, X, V, k, s=None, tol=1e-4):
    """Fit the continuous-time transition rate matrix ``self.P``.

    Column ``i`` of ``self.P`` receives the values returned by
    ``compute_markov_trans_prob(..., cont_time=True)`` for the neighbors of
    cell ``i``; the diagonal entry ``P[i, i]`` is set to minus their sum.

    Parameters
    ----------
    X:
        Cell coordinates, indexed as ``X[i]`` and ``X[index_array]``.
    V:
        Velocities, one row per cell.
    k:
        Number of neighbors per cell; ``k + 1`` neighbors are queried and the
        first column is dropped before caching.
    s:
        Forwarded unchanged to ``compute_markov_trans_prob``.
    tol:
        Entries less than or equal to this value are set to zero.
    """
    self.__reset__()

    # knn search, skipped when neighbor indices are already cached
    if self.nbrs_idx is not None:
        # NOTE(review): when the cache was filled by this method its first
        # column is already stripped, yet the loop below drops column 0 once
        # more -- confirm whether that is intended.
        Idx = self.nbrs_idx
    else:
        n_query = k + 1
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            index = NNDescent(
                X,
                metric='euclidean',
                n_neighbors=n_query,
                n_jobs=-1,
                random_state=19491001,
            )
            Idx, _ = index.query(X, k=n_query)
        else:
            tree = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            index = NearestNeighbors(n_neighbors=n_query, algorithm=tree, n_jobs=-1).fit(X)
            _, Idx = index.kneighbors(X)
        self.nbrs_idx = Idx[:, 1:]

    # compute transition rates, one column per cell
    n_cells = X.shape[0]
    self.P = np.zeros((n_cells, n_cells))
    for cell in range(n_cells):
        position = X[cell]
        velocity = V[cell]
        nbr_rows = Idx[cell, 1:]
        rates = compute_markov_trans_prob(position, velocity, X[nbr_rows], s, cont_time=True)
        rates[rates <= tol] = 0  # tolerance check
        self.P[nbr_rows, cell] = rates
        self.P[cell, cell] = -np.sum(rates)
def get_Xss_confidence(self):
    """Score each row of ``self.Xss.get_X()`` by its proximity to the data.

    For every such row the mean distance to its ``min(self.k, n - 1)``
    nearest rows of ``self.X_data`` is computed (``n`` being the number of
    data rows). The score is ``1 - mean_dist / max(mean_dist)``, so the row
    with the largest mean distance scores 0.

    Returns
    -------
    confidence:
        One value per row of ``self.Xss.get_X()``.
    """
    X = self.X_data
    if sp.issparse(X):
        # ``toarray`` works for both sparse matrices and sparse arrays,
        # whereas the ``.A`` shortcut is not available on sparse arrays.
        X = X.toarray()
    Xss = self.Xss.get_X()

    n_neighbors = min(self.k, X.shape[0] - 1)
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric='euclidean',
            n_neighbors=n_neighbors,
            n_jobs=-1,
            random_state=19491001,
        )
        _, dist = nbrs.query(Xss, k=n_neighbors)
    else:
        alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=-1).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    confidence = 1 - dist_m / dist_m.max()
    return confidence
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True, n_int_steps=20, cores=1):
    """Accumulate the per-cell results of ``construct_v`` into one matrix.

    Parameters
    ----------
    func:
        Forwarded to ``construct_v`` (presumably the vector field function --
        confirm against ``construct_v``).
    X:
        2-D array of cell coordinates.
    nbrs_idx:
        Neighbor indices, one row per cell. When None, a ``k + 1`` nearest
        neighbor search is run on ``X`` and also supplies ``dist``.
    dist:
        Neighbor distances, forwarded to ``construct_v``.
    k:
        Number of neighbors for the search (only used when ``nbrs_idx`` is
        None).
    distance_free:
        Forwarded to ``construct_v``. When False and ``dist`` is None, the
        condensed pairwise distances ``pdist(X)`` are computed and forwarded.
    n_int_steps:
        Forwarded to ``construct_v``.
    cores:
        1 runs serially; any other value uses a thread pool of that size.

    Returns
    -------
    V:
        Sum of all ``construct_v`` results.
    nbrs:
        The fitted neighbor index, or None when ``nbrs_idx`` was supplied.
    """
    n = X.shape[0]

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric='euclidean',
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=k + 1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    D = pdist(X) if (dist is None and not distance_free) else None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(
            enumerate(nbrs_idx),
            desc='Constructing diffusion graph from reconstructed vector field',
        ):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)
    else:
        pool = ThreadPool(cores)
        try:
            res = pool.starmap(
                construct_v,
                zip(
                    itertools.repeat(X),
                    np.arange(len(nbrs_idx)),
                    nbrs_idx,
                    itertools.repeat(n_int_steps),
                    itertools.repeat(func),
                    itertools.repeat(distance_free),
                    itertools.repeat(dist),
                    itertools.repeat(D),
                    itertools.repeat(n),
                ),
            )
        finally:
            # Release the worker threads even when a task raises.
            pool.close()
            pool.join()
        V = functools.reduce((lambda a, b: a + b), res)
    return V, nbrs
def compute_tau(X, V, k=100, nbr_idx=None):
    """Compute a per-cell time scale as mean neighbor distance over speed.

    Parameters
    ----------
    X:
        2-D array of cell coordinates.
    V:
        2-D array of velocities, one row per cell.
    k:
        Number of neighbors to query when ``nbr_idx`` is None.
    nbr_idx:
        Optional 2-D integer array of neighbor indices; row ``i`` lists the
        neighbors of ``X[i]``. When given, Euclidean distances are computed
        directly from ``X`` instead of running a neighbor search.

    Returns
    -------
    tau:
        ``d / v`` where ``d`` is the mean distance over all neighbor columns
        except the first one.
    v:
        Euclidean norm of each row of ``V``.
    """
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            # Bind to ``dists`` (not ``dist``) so the shared code below finds it.
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)
    else:
        # One vectorized pass per neighbor column keeps temporaries at the
        # size of X while avoiding a Python-level loop over every pair.
        queries = X[: nbr_idx.shape[0]]
        dists = np.zeros(nbr_idx.shape)
        for j in range(nbr_idx.shape[1]):
            diff = queries - X[nbr_idx[:, j]]
            dists[:, j] = np.sqrt(np.einsum("ij,ij->i", diff, diff))

    # The first column is skipped (for a self-inclusive search it is the cell itself).
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
def get_Xss_confidence(self, k=50):
    """Score each row of ``self.Xss.get_X()`` by its proximity to the data.

    The per-column median of the data is appended as an extra reference row.
    For every row the mean distance to its ``min(k, n - 1)`` nearest data
    rows is passed through ``gaussian_1d`` and the result is divided by its
    maximum. The score of the reference row is dropped before returning.

    Parameters
    ----------
    k:
        Upper bound on the number of neighbors used for the mean distance.

    Returns
    -------
    confidence:
        One value per row of ``self.Xss.get_X()``.
    """
    X = self.X_data
    if sp.issparse(X):
        # ``toarray`` works for both sparse matrices and sparse arrays,
        # whereas the ``.A`` shortcut is not available on sparse arrays.
        X = X.toarray()
    Xss = self.Xss.get_X()
    Xref = np.median(X, 0)
    Xss = np.vstack((Xss, Xref))

    n_neighbors = min(k, X.shape[0] - 1)
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=n_neighbors,
            n_jobs=-1,
            random_state=19491001,
        )
        _, dist = nbrs.query(Xss, k=n_neighbors)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=-1).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    # Kernel width: 10% of the average extent of the first two data columns.
    # NOTE(review): only columns 0 and 1 are used even when X has more -- confirm.
    sigma = 0.1 * 0.5 * (np.max(X[:, 0]) - np.min(X[:, 0]) + np.max(X[:, 1]) - np.min(X[:, 1]))
    confidence = gaussian_1d(dist_m, sigma=sigma)
    confidence /= np.max(confidence)
    return confidence[:-1]
def bandwidth_selector(X):
    """
    Compute an empirical bandwidth for a Gaussian kernel.

    Every sample's ``max(2, int(0.2 * n))`` nearest neighbors are found; the
    mean of all neighbor distances except the first column is divided by 1.5
    and scaled by ``sqrt(2)``.
    """
    n_samples, n_features = X.shape
    n_nbrs = max(2, int(0.2 * n_samples))

    use_approximate = n_samples > 200000 and n_features > 2
    if not use_approximate:
        tree = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        index = NearestNeighbors(n_neighbors=n_nbrs, algorithm=tree, n_jobs=-1).fit(X)
        distances, _ = index.kneighbors(X)
    else:
        from pynndescent import NNDescent

        index = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=n_nbrs,
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = index.query(X, k=n_nbrs)

    scaled_mean = np.mean(distances[:, 1:]) / 1.5
    return np.sqrt(2) * scaled_mean
def fit(self, X, V, k, s=None, method="qp", eps=None, tol=1e-4):  # pass index
    """Fit the discrete-time transition matrix ``self.P``.

    Column ``i`` of ``self.P`` holds the transition probabilities out of cell
    ``i`` onto its neighbors.

    Parameters
    ----------
    X:
        2-D array of cell coordinates.
    V:
        Velocities, one row per cell.
    k:
        Number of neighbors queried per cell (the cell itself is expected in
        the first column). It will be replaced by a connectivity matrix in
        the future.
    s:
        For ``method="qp"`` forwarded to ``compute_markov_trans_prob``; for
        ``method="kernel"`` a square matrix whose inverse is forwarded to
        ``compute_drift_kernel``.
    method:
        ``"qp"`` or ``"kernel"``.
    eps:
        Only used with ``method="kernel"``. When not None, a density kernel
        ``self.Kd`` is built with ``compute_density_kernel`` and the drift
        kernel is divided by its column sums.
    tol:
        Probabilities less than or equal to this value are zeroed.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"qp"`` nor ``"kernel"``.
    """
    # Fail early and clearly: any other value used to fall into the kernel
    # branch and crash on an unbound ``inv_s``.
    if method not in ("qp", "kernel"):
        raise ValueError("method must be either 'qp' or 'kernel', got %r." % (method,))

    self.__reset__()

    # knn search
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X, metric="euclidean", n_neighbors=k, n_jobs=-1, random_state=19491001)
        Idx, _ = nbrs.query(X, k=k)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
        _, Idx = nbrs.kneighbors(X)

    # compute transition prob.
    n = X.shape[0]
    self.P = np.zeros((n, n))
    if method == "kernel":
        inv_s = np.linalg.inv(s)
        # compute density kernel
        if eps is not None:
            self.Kd = np.zeros((n, n))
            inv_eps = 1 / eps
            for i in range(n):
                self.Kd[i, Idx[i]] = compute_density_kernel(X[i], X[Idx[i]], inv_eps)
            D = np.sum(self.Kd, 0)

    for i in range(n):
        y = X[i]
        v = V[i]
        if method == "qp":
            # Column 0 is skipped; the remaining mass stays on the diagonal.
            Y = X[Idx[i, 1:]]
            p = compute_markov_trans_prob(y, v, Y, s)
            p[p <= tol] = 0  # tolerance check
            self.P[Idx[i, 1:], i] = p
            self.P[i, i] = 1 - np.sum(p)
        else:
            Y = X[Idx[i]]
            # Local name chosen so the ``k`` parameter is not rebound.
            drift = compute_drift_kernel(y, v, Y, inv_s)
            if eps is not None:
                drift = drift / D[Idx[i]]
            p = drift / np.sum(drift)
            p[p <= tol] = 0  # tolerance check
            p = p / np.sum(p)  # renormalize after thresholding
            self.P[Idx[i], i] = p
def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):
    """Build a regular grid over an embedding and weight nearby cells.

    Parameters
    ----------
    X_emb:
        2-D array of embedding coordinates, one row per cell.
    xy_grid_nums:
        Number of grid points per embedding dimension (before scaling by
        ``density``).
    density:
        Multiplier on the grid resolution. Defaults to 1.
    smooth:
        Multiplier on the Gaussian kernel width. Defaults to 0.5.
    n_neighbors:
        Number of nearest cells per grid point. Defaults to
        ``max(10, n_obs // 50)``.

    Returns
    -------
    X_grid:
        Grid point coordinates, one row per grid point.
    p_mass:
        Sum of the kernel weights for each grid point.
    neighs:
        Indices of the nearest cells for each grid point.
    weight:
        Gaussian kernel weights matching ``neighs``.
    """
    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs, scale = [], 0
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        # NOTE(review): the upper padding is computed from the already
        # lowered ``m`` and is therefore slightly larger than the lower one;
        # kept as is so existing grids do not shift.
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        # ``np.linspace`` needs an integer sample count; a fractional
        # ``density`` would otherwise produce a float here.
        gr = np.linspace(m, M, int(xy_grid_nums[dim_i] * density))
        scale += gr[1] - gr[0]
        grs.append(gr)

    # Kernel width: mean grid spacing across dimensions, scaled by ``smooth``.
    scale = scale / n_dim * smooth

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = np.max([10, int(n_obs / 50)])

    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
        from pynndescent import NNDescent

        nn = NNDescent(
            X_emb,
            metric='euclidean',
            n_neighbors=n_neighbors,
            n_jobs=-1,
            random_state=19491001,
        )
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if X_emb.shape[1] > 10 else 'kd_tree'
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1, algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    """Run a ``TRNET`` on ``X`` and return its nodes or their nearest data rows.

    Parameters
    ----------
    X:
        2-D data array.
    n:
        First argument of ``TRNET`` (presumably the number of nodes -- confirm).
    return_index:
        When False, return ``TRNET.W`` directly. When True, return for each
        row of ``W`` the index of its nearest row in ``X``.
    seed:
        Seed passed to ``TRNET`` and to the approximate neighbor search.
    kwargs:
        Forwarded to ``TRNET.run``.
    """
    net = TRNET(n, X, seed)
    net.run(**kwargs)

    if not return_index:
        return net.W

    prefer_approximate = X.shape[0] > 200000 and X.shape[1] > 2
    if prefer_approximate:
        from pynndescent import NNDescent

        index = NNDescent(X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed)
        nearest, _ = index.query(net.W, k=1)
    else:
        tree = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        index = NearestNeighbors(n_neighbors=1, algorithm=tree, n_jobs=-1).fit(X)
        _, nearest = index.kneighbors(net.W)
    return nearest[:, 0]
def cluster_field(adata, basis="pca", embedding_basis=None, normalize=True, method="leiden", cores=1, copy=False, **kwargs):
    """Cluster cells based on vector field features.

    We would like to see whether the vector field can be used to better define cell state/types. This can be accessed
    via characterizing critical points (attractor/saddle/repressor, etc.) and characteristic curves (nullcline,
    separatrix). However, the calculation of those is not easy, for example, a strict definition of an attractor is
    states where velocity is 0 and the eigenvalue of the jacobian matrix at that point is all negative. Under this
    strict definition, we may sometimes find the attractors are very far away from our sampled cell states which makes
    them less meaningful although this can be largely avoided when we decide to remove the density correction during the
    velocity projection. This is not unexpected as the vector field we learned is defined via a set of basis functions
    based on gaussian kernels and thus it is hard to satisfy that strict definition.

    Fortunately, we can handle this better with the help of a different set of ideas. Instead of using critical points
    by the classical dynamic system methods, we can use some machine learning approaches that are based on extracting
    geometric features of streamline to "cluster vector field space" for define cell states/type. This requires
    calculating, potential (ordered pseudotime), speed, curliness, divergence, acceleration, curvature, etc. Thanks to
    the fact that we can analytically calculate the Jacobian matrix, those quantities of the vector field function
    can be conveniently and efficiently calculated.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`.
        adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
        the object should include both unspliced and spliced gene expression of cells.
    basis: `str` (default: `pca`)
        The space that will be used for calculating vector field features. Valid names includes, for example, `pca`,
        `umap`, etc.
    embedding_basis: `str` or None (default: `None`)
        The embedding basis that will be combined with the vector field feature space for clustering. When None,
        `basis` is used.
    normalize: `bool` (default: `True`)
        Whether to mean center and scale every feature across all cells (subtract the mean, divide by the standard
        deviation).
    method: `str` (default: `leiden`)
        The method that will be used for clustering, one of `{'kmeans', 'hdbscan', 'louvain', 'leiden', 'infomap'}`.
        If `louvain` or `leiden` used, you need to have `cdlib` installed. Any other value performs no clustering.
    cores: `int` (default: 1)
        The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all processors.
    copy:
        Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments.
    kwargs:
        Any additional arguments that will be passed to the kmeans or hdbscan clustering algorithms.

    Returns
    -------
        The updated copy of `adata` when `copy` is True, otherwise None (the input `adata` is updated in place).
        kmeans/hdbscan labels are stored in `adata.obs["field_<method>"]`; for the graph based methods the kNN graph
        is stored in `adata.obsp["vf_feature_knn"]` and `field_<method>` is passed as the result key.
    """
    logger = LoggerManager.gen_logger("dynamo-cluster_field")
    logger.log_time()
    adata = copy_adata(adata) if copy else adata

    if method in ["louvain", "leiden"]:
        # Only probing that cdlib is importable; the module itself is not used here.
        try:
            from cdlib import algorithms  # noqa: F401
        except ImportError:
            raise ImportError(
                "You need to install the excellent package `cdlib` if you want to use louvain or leiden "
                "for clustering."
            )

    feature_key = [
        "speed_" + basis,
        basis + "_ddhodge_potential",
        "divergence_" + basis,
        "acceleration_" + basis,
        "curvature_" + basis,
    ]

    # Compute every feature that is not yet present in ``adata.obs``.
    if feature_key[0] not in adata.obs.keys():
        from ..vectorfield import speed

        speed(adata, basis=basis)
    if feature_key[1] not in adata.obs.keys():
        from ..ext import ddhodge

        ddhodge(adata, basis=basis)
    if feature_key[2] not in adata.obs.keys():
        from ..vectorfield import divergence

        divergence(adata, basis=basis)
    if feature_key[3] not in adata.obs.keys():
        from ..vectorfield import acceleration

        acceleration(adata, basis=basis)
    if feature_key[4] not in adata.obs.keys():
        from ..vectorfield import curvature

        curvature(adata, basis=basis)

    feature_data = adata.obs.loc[:, feature_key].values
    if embedding_basis is None:
        embedding_basis = basis
    X = np.hstack((feature_data, adata.obsm["X_" + embedding_basis]))

    if normalize:
        # X = (X - X.min(0)) / X.ptp(0)
        X = (X - X.mean(0)) / X.std(0)

    if method in ["hdbscan", "kmeans"]:
        if method == "hdbscan":
            key = "field_hdbscan"
            hdbscan(adata, X_data=X, result_key=key, **kwargs)
        else:
            from sklearn.cluster import KMeans

            key = "field_kmeans"
            kmeans = KMeans(random_state=0, **kwargs).fit(X)
            adata.obs[key] = kmeans.labels_.astype("str")

        # clusters need to be categorical variables
        if key in adata.obs.keys():
            adata.obs[key] = adata.obs[key].astype("category")

    elif method in ["louvain", "leiden", "infomap"]:
        # 31 neighbors are queried: the first column is used as the source
        # node and the remaining 30 as its targets.
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=31,
                n_jobs=cores,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=31)
        else:
            nbrs = NearestNeighbors(n_neighbors=31, n_jobs=cores).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        row = np.repeat(nbrs_idx[:, 0], 30)
        col = nbrs_idx[:, 1:].flatten()
        graph = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )
        adata.obsp["vf_feature_knn"] = graph

        if method == "leiden":
            leiden(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_leiden",
            )
        elif method == "louvain":
            louvain(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_louvain",
            )
        else:
            infomap(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_infomap",
            )

    logger.finish_progress(progress_name="clustering_field")

    if copy:
        return adata
    return None
def score_cells(
    adata,
    genes=None,
    layer=None,
    basis=None,
    n_neighbors=30,
    beta=0.1,
    iteration=5,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    return_score=True,
    **kwargs,
):
    """Score cells based on a set of genes.

    The raw score of a cell is the mean expression of the selected genes. It is then smoothed `iteration` times over
    the kNN graph of the chosen basis: each pass replaces a cell's score by `beta` times the score of its first
    neighbor (normally the cell itself) plus `1 - beta` times the mean score of its remaining neighbors.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        AnnData object with the requested basis in `.obsm`.
    genes: `list` or None (default: None)
        The gene names whose expression will be averaged. By default (when genes is set to None), the genes flagged
        in `adata.var.use_for_pca` are used. Given names are matched to the capitalization style of the first
        entry of `adata.var_names`.
    layer: `str` or None (default: None)
        Which layer of the data will be used. `None` or `'X'` uses `adata.X`.
    basis: `str` or None (default: `None`)
        The embedding used for the neighbor search. `None` uses `X_pca`.
    n_neighbors: `int` (default: `30`)
        Number of nearest neighbors.
    beta: `float` (default: `0.1`)
        The weight that will apply to the current query cell.
    iteration: `int` (default: `5`)
        Number of smooth iterations. With 0 the unsmoothed mean expression is returned.
    metric: `str` or callable, default='euclidean'
        The distance metric used by the approximate (pynndescent) search, which is chosen for more than 5000 cells
        in more than two dimensions. The exact search always uses its default metric.
    metric_kwds : dict, default=None
        Additional keyword arguments for the metric function (approximate search only).
    cores: `int` (default: 1)
        The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all processors.
    seed: `int` (default `19491001`)
        Random seed to ensure the reproducibility of each run.
    return_score: `bool` (default: `True`)
        Whether to return the score. If False, save the smoothed score to the `cell_score` column in the `.obs`
        attribute and also to the dictionary corresponding to the `score_cells` key in the .uns attribute.
    kwargs:
        Additional arguments that will be passed to the approximate nearest neighbor search.

    Returns
    -------
        The smoothed cell scores when `return_score` is True; otherwise None, with `adata` updated in place.

    Raises
    ------
    ValueError
        If the requested basis or the `use_for_pca` column is missing, or if no gene is left after matching.
    """
    if basis is None and "X_pca" not in adata.obsm.keys():
        raise ValueError("Your adata doesn't have 'X_pca' basis in .obsm.")
    elif basis is not None and "X_" + basis not in adata.obsm.keys():
        raise ValueError(f"Your adata doesn't have the {basis} you inputted in .obsm attribute of your adata.")

    # ``use_for_pca`` is a per-gene flag and therefore lives in ``.var``.
    if genes is None and "use_for_pca" not in adata.var.keys():
        raise ValueError("Your adata doesn't have 'use_for_pca' column in .var.")

    if genes is None:
        genes = adata.var_names[adata.var["use_for_pca"].values]
    else:
        # Match the capitalization style of the gene names stored in adata.
        first_name = adata.var_names[0]
        if first_name.isupper():
            candidates = genes
        elif first_name[0].isupper() and first_name[1:].islower():
            candidates = [i.capitalize() for i in genes]
        else:
            candidates = [i.lower() for i in genes]
        genes = list(adata.var_names.intersection(candidates))

    if len(genes) < 1:
        raise ValueError("Your inputted gene list doesn't overlap any gene in your adata object.")

    X_basis = adata.obsm["X_pca"] if basis is None else adata.obsm["X_" + basis]

    if X_basis.shape[0] > 5000 and X_basis.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X_basis,
            metric=metric,
            metric_kwds=metric_kwds,
            n_neighbors=n_neighbors,
            n_jobs=cores,
            random_state=seed,
            **kwargs,
        )
        knn, distances = nbrs.query(X_basis, k=n_neighbors)
    else:
        alg = "ball_tree" if X_basis.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=cores).fit(X_basis)
        distances, knn = nbrs.kneighbors(X_basis)

    X_data = adata[:, genes].X if layer in [None, "X"] else adata[:, genes].layers[layer]

    # ``asarray(...).ravel()`` flattens both the matrix returned for sparse
    # input and the plain array returned for dense input.
    score = np.asarray(X_data.mean(1), dtype=np.float64).ravel()

    for _ in range(iteration):
        # A new array is built on every pass so that all cells are updated
        # from the previous pass only (no in-place aliasing).
        score = beta * score[knn[:, 0]] + (1 - beta) * score[knn[:, 1:]].mean(axis=1)

    smoothed_score = score

    if return_score:
        return smoothed_score
    else:
        adata.uns["score_cells"] = {
            "smoothed_score": smoothed_score,
            "genes": genes,
            "layer": layer,
            "basis": basis,
        }
        adata.obs["cell_score"] = smoothed_score
def fit(
    self,
    X,
    V,
    M_diff,
    neighbor_idx=None,
    n_recurse_neighbors=None,
    k=30,
    epsilon=None,
    adaptive_local_kernel=False,
    tol=1e-4,
    sparse_construct=True,
    sample_fraction=None,
):
    """Fit the kernel-based transition matrix ``self.P`` (stored as CSC).

    Column ``i`` of ``self.P`` holds the normalized drift-kernel weights of
    cell ``i`` over its neighbors ``self.Idx[i]``.

    Parameters
    ----------
    X:
        2-D array of cell coordinates.
    V:
        Velocities, one row per cell.
    M_diff:
        Scalar or square matrix; its (matrix) inverse is forwarded to the
        drift kernel.
    neighbor_idx:
        Optional precomputed neighbor indices. When None, a ``k`` nearest
        neighbor search is run on ``X``.
    n_recurse_neighbors:
        When not None, the indices are expanded with
        ``append_iterative_neighbor_indices``.
    k:
        Number of neighbors for the search.
    epsilon:
        When not None, a density kernel ``self.Kd`` is built with
        ``compute_density_kernel`` and the drift kernel is divided by its
        column sums.
    adaptive_local_kernel:
        Use ``compute_drift_local_kernel`` instead of
        ``compute_drift_kernel``.
    tol:
        Probabilities less than or equal to this value are zeroed.
    sparse_construct:
        Assemble ``self.P`` (and ``self.Kd``) as LIL matrices instead of
        dense arrays.
    sample_fraction:
        When not None, every cell keeps only a random subset of its
        neighbor columns (adapted from velocyto); later columns are more
        likely to be kept.

    Raises
    ------
    ValueError
        If ``sample_fraction`` is given but there are fewer than three
        neighbor columns to sample from.
    """
    # compute connectivity
    if neighbor_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            neighbor_idx, _ = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            _, neighbor_idx = nbrs.kneighbors(X)

    if n_recurse_neighbors is not None:
        self.Idx = append_iterative_neighbor_indices(neighbor_idx, n_recurse_neighbors)
    else:
        self.Idx = neighbor_idx

    # apply kNN downsampling to accelerate calculation (adapted from velocyto)
    if sample_fraction is not None:
        neighbor_idx = self.Idx
        n_rows, n_cols = neighbor_idx.shape[0], neighbor_idx.shape[1]

        # Candidate columns exclude the first and the last one, as before.
        candidates = np.arange(1, n_cols - 1)
        if candidates.size == 0:
            raise ValueError("sample_fraction requires at least three neighbor columns to sample from.")

        # The weights must have exactly one entry per candidate, otherwise
        # ``np.random.choice`` rejects the call.
        p = np.linspace(0.5, 1, candidates.size)
        p = p / p.sum()

        # Sampling without replacement cannot draw more than there is.
        n_sampled = min(int(sample_fraction * (n_cols + 1)), candidates.size)

        # ``np.stack`` needs a real sequence, not a generator.
        sampling_ixs = np.stack(
            [np.random.choice(candidates, size=n_sampled, replace=False, p=p) for _ in range(n_rows)],
            0,
        )
        self.Idx = self.Idx[np.arange(n_rows)[:, None], sampling_ixs]

    n = X.shape[0]
    if sparse_construct:
        self.P = sp.lil_matrix((n, n))
    else:
        self.P = np.zeros((n, n))

    # compute density kernel
    if epsilon is not None:
        if sparse_construct:
            self.Kd = sp.lil_matrix((n, n))
        else:
            self.Kd = np.zeros((n, n))
        inv_eps = 1 / epsilon
        for i in range(n):
            self.Kd[i, self.Idx[i]] = compute_density_kernel(X[i], X[self.Idx[i]], inv_eps)
        self.Kd = sp.csc_matrix(self.Kd)
        D = np.sum(self.Kd, 0)  # 1 x n matrix of column sums

    # compute transition prob.
    if np.isscalar(M_diff):
        inv_s = 1 / M_diff
    else:
        inv_s = np.linalg.inv(M_diff)

    for i in tqdm(range(n), desc="compute transiton matrix"):
        y = X[i]
        v = V[i]
        Y = X[self.Idx[i]]
        # Local name chosen so the ``k`` parameter is not rebound.
        if adaptive_local_kernel:
            kern = compute_drift_local_kernel(y, v, Y, inv_s)
        else:
            kern = compute_drift_kernel(y, v, Y, inv_s)
        if epsilon is not None:
            kern = kern / D[0, self.Idx[i]]
        else:
            kern = np.matrix(kern)
        # Fall back to a uniform 1/n weight when the kernel vanishes.
        p = kern / np.sum(kern) if np.sum(kern) > 0 else np.ones_like(kern) / n
        p[p <= tol] = 0  # tolerance check
        p = p / np.sum(p)
        self.P[self.Idx[i], i] = p.A[0]

    self.P = sp.csc_matrix(self.P)
def graphize_velocity(V, X, nbrs_idx=None, k=30, normalize_v=False, E_func=None):
    """
    The function generates a graph based on the velocity data. The flow from i- to j-th
    node is returned as the edge matrix E[i, j], and E[i, j] = -E[j, i].

    The flow along an edge is the mean of the two end points' velocities
    projected onto the unit vector pointing from node i to node j. The inputs
    ``V`` and ``X`` are not modified.

    Arguments
    ---------
        V: :class:`~numpy.ndarray`
            The velocities for all cells.
        X: :class:`~numpy.ndarray`
            The coordinates for all cells.
        nbrs_idx: list (optional, default None)
            a list of neighbor indices for each cell. If None a KNN will be performed instead.
        k: int (optional, default 30)
            The number of neighbors for the KNN search.
        normalize_v: bool (optional, default False)
            Whether or not normalizing the velocity vectors.
        E_func: str, function, or None (optional, default None)
            A variance stabilizing function for reducing the variance of the flows.
            If a string is passed, there are two options:
                'sqrt': the numpy.sqrt square root function;
                'exp': the numpy.exp exponential function.

    Returns
    -------
        E: :class:`~numpy.ndarray`
            The edge matrix.
        nbrs_idx: list
            Neighbor indices.

    Raises
    ------
        NotImplementedError
            If ``E_func`` is a string other than 'sqrt' or 'exp'.
    """
    n, d = X.shape

    if nbrs_idx is None:
        if n > 200000 and d > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, _ = nbrs.query(X, k=k + 1)
        else:
            alg = "ball_tree" if d > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            _, nbrs_idx = nbrs.kneighbors(X)

    if isinstance(E_func, str):
        if E_func == "sqrt":
            E_func = np.sqrt
        elif E_func == "exp":
            E_func = np.exp
        else:
            raise NotImplementedError("The specified edge function is not implemented.")

    # E = sp.csr_matrix((n, n))  # Making E a csr_matrix will slow down this process. Try lil_matrix maybe?
    E = np.zeros((n, n))
    for i in range(n):
        x = flatten(X[i])
        idx = nbrs_idx[i]
        if len(idx) > 0 and idx[0] == i:  # excluding the node itself from the neighbors
            idx = idx[1:]

        vi = flatten(V[i])
        if normalize_v:
            vi_norm = np.linalg.norm(vi)
            if vi_norm > 0:
                # Out-of-place division: never writes through a view into V
                # and also works for integer input.
                vi = vi / vi_norm

        # normalized differences
        U = X[idx] - x
        U_norm = np.linalg.norm(U, axis=1)
        U_norm[U_norm == 0] = 1  # coincident points keep a zero direction
        U = U / U_norm[:, None]

        for jj, j in enumerate(idx):
            vj = flatten(V[j])
            if normalize_v:
                vj_norm = np.linalg.norm(vj)
                if vj_norm > 0:
                    vj = vj / vj_norm
            u = flatten(U[jj])
            v = np.mean((vi.dot(u), vj.dot(u)))

            if E_func is not None:
                # Apply the stabilizer to the magnitude and keep the sign.
                v = np.sign(v) * E_func(np.abs(v))
            E[i, j] = v
            E[j, i] = -v

    return E, nbrs_idx
def extract_structural_backbone(t, data, s, max_angle=90, relaxation=0):
    """
    Construct simplified graphs connecting data points that have been projected to density ridges.

    Two graphs are constructed for different purposes. The first graph (g_simple) is constructed with two major steps:
    1. Construct nearest neighbor graph on both ridge positions and raw data positions and combine.
    2. Simplify graph so that each point is only connected to up to 2^ridge_dimensionality points with a set of
    filtering criteria.

    The second graph (g_mst) has two extra steps:
    3. If the graph is not fully connected, connect the components by nearest neighbors between all pairs of
    components.
    4. Construct a minimum spanning tree of the graph.

    Parameters
    -----------
    t : 2D array
        Density ridge positions. Typically projected to density ridges with quasildr.dridge.Scms.
    data : 2D array
        Original data points.
    s : object
        `quasildr.dridge.Scms` object which was used to produce `t`. Only its `_nlocal_inv_cov` method is called here.
    max_angle : float, optional
        Maximum angle in degree for filtering graph edges. Default is 90.
    relaxation : float, optional
        Threshold on the relative eigenvalue gap below which a point is assigned ridge dimensionality 2 instead of 1.
        Presumably the relaxation parameter used to produce `t`; see the `quasildr.dridge.Scms.scms` documentation.

    Returns
    -----------
    g_simple : sparse matrix
        Simplified graph constructed without explicit shape or connectivity constraints (from step 2). Edge values
        are Euclidean distances between ridge positions.
    g_mst : sparse matrix
        A tree-shaped graph connecting all points (from step 4).
    ridge_dims : 1D array
        Estimated ridge dimensionality (1 or 2) of every point.

    """
    # The first value returned by `_nlocal_inv_cov` is eigen-decomposed per point; the eigenvalues are
    # negated twice so that they keep their sign but are ordered from largest to smallest.
    h, _, _, _, _ = s._nlocal_inv_cov(t)
    eigvals, eigvecs = np.linalg.eigh(-h)
    eigvals = -eigvals
    # boolean + 1: dimensionality 2 when the relative gap between the two leading eigenvalues is below
    # `relaxation`, otherwise 1
    ridge_dims = ((eigvals[:, 0] - eigvals[:, 1]) / (eigvals[:, 0] - eigvals[:, -1]) < relaxation) + 1

    # step 1: symmetrized union of the kNN graphs on ridge positions and on raw data
    gknn, _ = neighbors.neighbors(t, n_neighbors=50, smoothknn=False)
    gknnZ, _ = neighbors.neighbors(data, n_neighbors=50, smoothknn=False)
    gknn = gknn + gknn.T + gknnZ + gknnZ.T
    gknn.setdiag(0)
    gknn.eliminate_zeros()

    # remove edge connecting structures of different dimensionality
    gknn.data[ridge_dims[gknn.nonzero()[0]] != ridge_dims[gknn.nonzero()[1]]] = 0
    gknn.eliminate_zeros()

    # filter edges
    edges = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
    edges_norm = edges / (np.linalg.norm(edges, axis=1)[:, np.newaxis])
    angles = np.zeros(len(gknn.nonzero()[0]))
    angles_edge = np.zeros(len(gknn.nonzero()[0]))
    for d in np.unique(ridge_dims):
        if d == 1:
            ind = ridge_dims[gknn.nonzero()[0]] == 1
            # angle between the leading eigenvectors at the two end points of each edge
            angles[ind] = np.arccos(
                np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * eigvecs[gknn.nonzero()[1][ind], :, 0], axis=1),
                        -1, 1)) / np.pi * 180
            # angle between the leading eigenvector at the source point and the edge direction
            angles_edge[ind] = np.arccos(
                np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * edges_norm[ind, :], axis=1), -1,
                        1)) / np.pi * 180
            # eigenvectors have no preferred sign, so fold the angles into [0, 90]
            angles[ind] = np.minimum(angles[ind], 180 - angles[ind])
            angles_edge[ind] = np.minimum(angles_edge[ind], 180 - angles_edge[ind])
            # an edge is dropped only when both angles exceed the limit
            gknn.data[(angles > max_angle) * (angles_edge > max_angle)] = 0
        else:
            # calculate mean principal angles
            ind = ridge_dims[gknn.nonzero()[0]] == d
            angles[ind] = np.mean(
                subspace_angles(eigvecs[gknn.nonzero()[0][ind], :, :d], eigvecs[gknn.nonzero()[1][ind], :, :d]),
                axis=1)
            gknn.data[(angles > max_angle)] = 0
    gknn.eliminate_zeros()
    # gknn = gknn + gknn.T
    n_components, labels = scipy.sparse.csgraph.connected_components(gknn)

    # simplify graph by connecting only closest nodes in the subspace
    edges_vecs = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
    edges_dist = np.linalg.norm(edges_vecs, axis=1)
    rowinds = []
    colinds = []
    # orient eigen vectors in the same directions
    for d in range(eigvecs.shape[2]):
        eigvecs[eigvecs[:, 0, d] < 0, :, d] *= -1

    for d in np.unique(ridge_dims):
        if d == 1:
            ind = ridge_dims[gknn.nonzero()[0]] == 1
            # projection of every edge vector onto the leading eigenvector of its source point
            proj_vecs = np.sum(edges_vecs[ind, :] * eigvecs[gknn.nonzero()[0][ind], :, 0], axis=1)[:, np.newaxis] * \
                        eigvecs[gknn.nonzero()[0][ind], :, 0]
        else:
            ind = ridge_dims[gknn.nonzero()[0]] == d
            # projection of every edge vector onto the subspace of the d leading eigenvectors
            proj_vecs = matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d], \
                                        matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d].transpose((0, 2, 1)),
                                                        edges_vecs[ind, :, np.newaxis]))
        proj_dists = edges_dist[ind]
        gknn.data[ind] = proj_dists
        # one copy of the graph per direction component, holding that component of the projection
        gknn_directions = []
        for k in range(d):
            gknn_directions.append(gknn.copy())
            gknn_directions[k].data[ind] = proj_vecs[:, k].squeeze()

        from itertools import product
        # for every sign pattern, connect each point to its closest neighbor lying in that direction
        for directions in list(product([-1, 1], repeat=len(gknn_directions))):
            for i in np.where(ridge_dims == d)[0]:
                dist_data = gknn[i, :].data
                direction_datas = [gd[i, :].data for gd in gknn_directions]
                # the comprehension has its own scope, so the outer `i` is not rebound here
                conditions = [(directions[i] * direction_datas[i]) > 0 for i in range(len(directions))]
                conditions = np.all(np.vstack(conditions).T, axis=1)
                if np.any(conditions):
                    min_dist = np.min(dist_data[conditions])
                    # edge orientation follows the sign of the first direction component
                    if directions[0] < 0:
                        rowinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])
                        colinds.append(i)
                    else:
                        rowinds.append(i)
                        colinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])

    gedges = np.vstack([rowinds, colinds]).T
    gedges = np.unique(gedges, axis=0)
    g_simple = csr_matrix((np.repeat(1, gedges.shape[0]), (gedges[:, 0], gedges[:, 1])), shape=gknn.shape)
    n_components, labels = scipy.sparse.csgraph.connected_components(g_simple)

    # meta graph connecting each component.
    components_dimensionality = []
    for i in range(n_components):
        components_dimensionality.append(ridge_dims[labels == i][0])

    # To connect or not to connect
    fc_metaedges = []
    fc_edge_indices = []
    for i in range(n_components - 1):
        i_inds = np.where(labels == i)[0]
        # approximate search for large components, exact search otherwise
        if len(i_inds) > 1000:
            index_group_i = NNDescent(t[i_inds, :])
            index_group_data_i = NNDescent(data[i_inds, :])
        else:
            index_group_i = NearestNeighbors(n_neighbors=1).fit(t[i_inds, :])
            index_group_data_i = NearestNeighbors(n_neighbors=1).fit(data[i_inds, :])

        # for g_mst
        for j in range(i + 1, n_components):
            j_inds = np.where(labels == j)[0]
            # NOTE(review): the neighbor indices come from the ridge-position index while the
            # distances come from the raw-data index -- confirm that mixing the two is intended.
            if len(i_inds) > 1000:
                nn, _ = index_group_i.query(t[j_inds, :], k=1)
                _, dist = index_group_data_i.query(data[j_inds, :], k=1)
            else:
                _, nn = index_group_i.kneighbors(t[j_inds, :])
                dist, _ = index_group_data_i.kneighbors(data[j_inds, :])
            mindist = np.min(dist)
            fc_metaedges.append([i, j])
            fc_edge_indices.append([i_inds[nn[dist == mindist]][0], j_inds[np.where(dist == mindist)[0]][0]])

    if len(fc_edge_indices) > 0:
        fc_edge_indices = np.vstack(fc_edge_indices)

        g_fc_connections = csr_matrix(
            (np.repeat(2, fc_edge_indices.shape[0]), (fc_edge_indices[:, 0], fc_edge_indices[:, 1])), shape=gknn.shape)
        g_fc = g_simple + g_fc_connections
    else:
        g_fc = g_simple

    # edge values are replaced by Euclidean distances between ridge positions before building the tree
    g_fc.data = np.linalg.norm(t[g_fc.nonzero()[0], :] - t[g_fc.nonzero()[1], :], axis=1)
    g_mst = minimum_spanning_tree(g_fc)
    g_simple.data = np.linalg.norm(t[g_simple.nonzero()[0], :] - t[g_simple.nonzero()[1], :], axis=1)
    return g_simple, g_mst, ridge_dims
class KNNSearch:
    """Uniform front-end over several nearest-neighbour back-ends.

    The back-end is chosen by ``kwargs['algorithm']``, one of ``'datasketch'``,
    ``'annoy'``, ``'exact'``, ``'falconn'``, ``'descent'`` or ``'random'``.
    Call :meth:`fit` once, then :meth:`predict` for each query.

    Options read from ``kwargs`` (only those needed by the chosen back-end):
    ``normalize``, ``algorithm``, ``create``, ``file_path``, ``num_perm``,
    ``metric``, ``num_trees``, ``nb_tables``, ``number_bits``.
    """

    def __init__(self, features, kwargs):
        """Store the features (optionally L2-normalised) and the options.

        Args:
            features: the data set to search; kept untouched as
                ``self.org_features``.
            kwargs: option dictionary, see the class docstring.
        """
        self.org_features = features
        if kwargs["normalize"]:
            self.features = preprocessing.normalize(features, norm='l2')
        else:
            self.features = features
        self.kwargs = kwargs
        self.predictor = None

    @staticmethod
    def _is_index(value):
        """Return True when *value* is a row index rather than a query vector."""
        # NumPy integers (e.g. taken from an index array) count as indices too.
        return isinstance(value, (int, np.integer))

    def fit(self):
        """Build (or load) the index of the configured back-end.

        Raises:
            Exception: if ``kwargs['algorithm']`` is not a known back-end.
        """
        fitters = {
            'datasketch': self.__datasketch_fit,
            'annoy': self.__annoy_fit,
            'exact': self.__exhaustive_fit,
            'falconn': self.__falconn_fit,
            'descent': self.__descent_fit,
            'random': self.__random_fit,
        }
        algorithm = self.kwargs['algorithm']
        fitter = fitters.get(algorithm)
        if fitter is None:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                algorithm))
        fitter()

    def predict(self, input, k):
        """Return the ``k`` nearest neighbours of ``input``.

        Args:
            input: either the integer row index of a stored item or a query
                vector.
            k: number of neighbours requested.

        Returns:
            Whatever the selected back-end returns for the query.

        Raises:
            Exception: if ``kwargs['algorithm']`` is not a known back-end.
        """
        predictors = {
            'datasketch': self.__datasketch_predict,
            'annoy': self.__annoy_predict,
            'exact': self.__exhaustive_predict,
            'falconn': self.__falconn_predict,
            'descent': self.__descent_predict,
            'random': self.__random_predict,
        }
        algorithm = self.kwargs['algorithm']
        predictor = predictors.get(algorithm)
        if predictor is None:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                algorithm))
        return predictor(input, k)

    def __datasketch_fit(self):
        """Build a MinHash LSH forest, or load a previously pickled one."""
        if self.kwargs['create']:
            num_perm = self.kwargs['num_perm']
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=num_perm)
            for i, row in enumerate(self.features):
                min_hash = MinHash(num_perm=num_perm)
                for d in row:
                    min_hash.update(d)
                min_hash_obj_list.append(min_hash)
                forest.add(i, min_hash)
            # IMPORTANT: must call index() otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs['file_path'], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
        else:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file; only point 'file_path' at index files you created yourself.
            with open(self.kwargs['file_path'], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
        self.predictor = [forest, min_hash_obj_list]

    def __datasketch_predict(self, query, k):
        """Query the LSH forest with a stored item (index) or a new item."""
        forest, min_hash_obj_list = self.predictor
        if self._is_index(query):
            return forest.query(min_hash_obj_list[query], k)
        min_hash_obj = MinHash(num_perm=self.kwargs['num_perm'])
        for d in query:
            min_hash_obj.update(d)
        return forest.query(min_hash_obj, k)

    def __annoy_fit(self):
        """Build and save an Annoy index, or load one from ``file_path``."""
        indexer = AnnoyIndex(self.features.shape[1], self.kwargs['metric'])
        if self.kwargs['create']:
            for i, f in enumerate(self.features):
                indexer.add_item(i, f)
            indexer.build(self.kwargs['num_trees'])
            indexer.save(self.kwargs['file_path'])
        else:
            indexer.load(self.kwargs['file_path'])
        self.predictor = indexer

    def __annoy_predict(self, query, k):
        """Query the Annoy index by item index or by vector."""
        annoy_forest = self.predictor
        if self._is_index(query):
            return annoy_forest.get_nns_by_item(query,
                                                k,
                                                search_k=-1,
                                                include_distances=False)
        return annoy_forest.get_nns_by_vector(query,
                                              k,
                                              search_k=-1,
                                              include_distances=False)

    def __exhaustive_fit(self):
        """Fit a ball-tree ``NearestNeighbors`` model on the features."""
        self.predictor = NearestNeighbors(algorithm='ball_tree')
        self.predictor.fit(self.features)

    def __exhaustive_predict(self, query, k):
        """Return the indices of the ``k`` nearest stored rows."""
        point = self.features[query] if self._is_index(query) else query
        return self.predictor.kneighbors(point.reshape(1, -1),
                                         n_neighbors=k,
                                         return_distance=False)[0]

    def __falconn_fit(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.
        """
        import falconn
        dimension = self.features.shape[1]
        nb_tables = self.kwargs['nb_tables']
        number_bits = self.kwargs['number_bits']

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        # Out-of-place on purpose: when 'normalize' is off, self.features is
        # the very array the caller passed in (and self.org_features), and an
        # in-place `-=` would silently shift the caller's data.
        self.features = self.features - self.center

        # add features to falconn table
        self._falconn_table.setup(self.features)

    def __falconn_predict(self, query, k):
        """Query the FALCONN table by item index or by vector."""
        if self._is_index(query):
            # Stored rows were already normalised (if requested) and centred
            # in __falconn_fit, so they are used as-is; centring again would
            # both skew the query and write through the view into the index.
            point = self.features[query]
        else:
            # Work on a private copy so the caller's vector is left untouched.
            point = np.array(query, copy=True)
            # Normalize input if you care about the cosine similarity
            if self.kwargs['normalize']:
                point /= np.linalg.norm(point)
            # Center the query the same way the dataset was centred.
            point -= self.center

        # Late falconn query_object construction
        # Since I suppose there might be an error
        # if table.setup() will be called after
        if self._falconn_query_object is None:
            self._falconn_query_object = self._falconn_table.construct_query_object(
            )
            self._falconn_query_object.set_num_probes(self._FALCONN_NB_TABLES)

        query_res = self._falconn_query_object.find_k_nearest_neighbors(
            point, k)
        return query_res

    def __descent_fit(self):
        """Build an NNDescent index on the features."""
        self.predictor = NNDescent(data=self.features,
                                   metric=self.kwargs['metric'])

    def __descent_predict(self, query, k):
        """Return the neighbour indices found by NNDescent for one query."""
        if self._is_index(query):
            # Accept a row index like the other back-ends do.
            query = self.features[query]
        # query() expects an array of search points
        points = np.expand_dims(query, axis=0)
        # query() returns (indices, distances); keep the indices of the
        # single search point.
        return self.predictor.query(points, k)[0][0]

    def __random_fit(self):
        """The random baseline needs no index."""
        pass

    def __random_predict(self, query, k):
        """Return ``k`` uniformly drawn row indices (with replacement)."""
        last = len(self.features) - 1
        return [random.randint(0, last) for _ in range(k)]
def cluster_field(adata,
                  basis='pca',
                  embedding_basis=None,
                  normalize=True,
                  method='louvain',
                  cores=1,
                  **kwargs):
    """Cluster cells based on vector field features.

    We would like to see whether the vector field can be used to better define cell state/types. This can be accessed
    via characterizing critical points (attractor/saddle/repressor, etc.) and characteristic curves (nullcline,
    separatrix). However, the calculation of those is not easy, for example, a strict definition of an attractor is
    states where velocity is 0 and the eigenvalue of the jacobian matrix at that point is all negative. Under this
    strict definition, we may sometimes find the attractors are very far away from our sampled cell states which makes
    them less meaningful. This is not unexpected as the vector field we learned is defined via a set of basis functions
    based on gaussian kernels and thus it is hard to satisfy that strict definition.

    Fortunately, we can handle this better with the help of a different set of ideas. Instead of using critical points
    by the classical dynamic system methods, we can use some machine learning approaches that are based on extracting
    geometric features of streamline to "cluster vector field space" for define cell states/type. This requires
    calculating, potential (ordered pseudotime), speed, curliness, divergence, acceleration, curvature, etc. Thanks to
    the fact that we can analytically calculate the Jacobian matrix, those quantities of the vector field function
    can be conveniently and efficiently calculated.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`.
        adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
        the object should include both unspliced and spliced gene expression of cells.
    basis: `str` (default: `pca`)
        The space that will be used for calculating vector field features. Valid names includes, for example, `pca`,
        `umap`, etc.
    embedding_basis: `str` or None (default: `None`)
        The embedding basis that will be combined with the vector field feature space for clustering. When `None`,
        `basis` is used.
    normalize: `bool` (default: `True`)
        Whether to mean center and scale each feature across all cells (z-score). Features that are constant across
        cells are only centered, because their standard deviation is zero.
    method: `str` (default: `louvain`)
        The method that will be used for clustering, one of `{'kmeans', 'hdbscan', 'louvain', 'leiden'}`. If `louvain`
        or `leiden` used, you need to have `scanpy` installed. Any other value computes the features but performs no
        clustering.
    cores: `int` (default: 1)
        The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.
    kwargs:
        Any additional arguments that will be passed to either kmeans, hdbscan, louvain or leiden clustering algorithms.

    Returns
    -------
        Nothing; `adata` is updated in place. `kmeans` writes the labels to `adata.obs['kmeans']`; `louvain` and
        `leiden` write the kNN graph of the feature space to `adata.obsp['feature_knn']` and then run the
        corresponding scanpy clustering on it; `hdbscan` delegates to the `hdbscan` function.
    """

    if method in ['louvain', 'leiden']:
        try:
            import scanpy as sc
        except ImportError as err:
            raise ImportError(
                "You need to install the excellent package `scanpy` if you want to use louvain or leiden "
                "for clustering.") from err

    feature_key = [
        'speed_' + basis, basis + '_ddhodge_potential', 'divergence_' + basis,
        'acceleration_' + basis, 'curvature_' + basis
    ]

    # Compute whichever vector field features are not cached in adata.obs yet.
    if feature_key[0] not in adata.obs.keys():
        from .vector_calculus import speed
        speed(adata, basis=basis)
    if feature_key[1] not in adata.obs.keys():
        from ..ext import ddhodge
        ddhodge(adata, basis=basis)
    if feature_key[2] not in adata.obs.keys():
        from .vector_calculus import divergence
        divergence(adata, basis=basis)
    if feature_key[3] not in adata.obs.keys():
        from .vector_calculus import acceleration
        acceleration(adata, basis=basis)
    if feature_key[4] not in adata.obs.keys():
        from .vector_calculus import curvature
        curvature(adata, basis=basis)

    feature_data = adata.obs.loc[:, feature_key].values
    if embedding_basis is None:
        embedding_basis = basis
    X = np.hstack((feature_data, adata.obsm['X_' + embedding_basis]))

    if normalize:
        spread = X.std(0)
        # A constant feature has zero spread; dividing by it would fill the
        # column with NaN and break every clustering back-end downstream.
        spread[spread == 0] = 1.0
        X = (X - X.mean(0)) / spread

    if method in ['hdbscan', 'kmeans']:
        if method == 'hdbscan':
            hdbscan(adata, X_data=X, **kwargs)
        elif method == 'kmeans':
            from sklearn.cluster import KMeans

            kmeans = KMeans(random_state=0, **kwargs).fit(X)
            adata.obs['kmeans'] = kmeans.labels_.astype('str')

    elif method in ['louvain', 'leiden']:
        # 30 neighbours plus the query point itself, capped so that data sets
        # with fewer than 31 cells do not make the neighbour search fail.
        k = min(31, X.shape[0])
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent
            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=k,
                             n_jobs=cores,
                             random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=k)
        else:
            nbrs = NearestNeighbors(n_neighbors=k, n_jobs=cores).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        # Column 0 is used as the source cell of every edge.
        # NOTE(review): this assumes each point's first neighbour is itself,
        # which may not hold for duplicated points — confirm.
        row = np.repeat(nbrs_idx[:, 0], k - 1)
        col = nbrs_idx[:, 1:].flatten()
        g = csr_matrix((np.repeat(1, len(col)), (row, col)),
                       shape=(adata.n_obs, adata.n_obs))
        adata.obsp['feature_knn'] = g

        if method == 'louvain':
            sc.tl.louvain(adata, obsp='feature_knn', **kwargs)
        elif method == 'leiden':
            sc.tl.leiden(adata, obsp='feature_knn', **kwargs)
def diffusionMatrix(
    adata,
    X_data=None,
    V_data=None,
    genes=None,
    layer=None,
    basis="umap",
    dims=None,
    n=30,
    VecFld=None,
    residual="vector_field",
):
    """Calculate the diffusion matrix from the estimated velocity vector and the reconstructed vector field.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        X_data: `np.ndarray` (default: `None`)
            The user supplied expression (embedding) data that will be used for calculating diffusion matrix directly.
            Only used when `V_data` is supplied as well; otherwise both are taken from `adata`.
        V_data: `np.ndarray` (default: `None`)
            The user supplied velocity data that will be used for calculating diffusion matrix directly.
            Only used when `X_data` is supplied as well; otherwise both are taken from `adata`.
        genes: `list` or None (default: `None`)
            The list of genes that will be used to subset the data. If `None`, all genes will be used.
        layer: `str` or None (default: None)
            Which layer of the data will be used for diffusion matrix calculation.
        basis: `str` (default: `umap`)
            Which basis of the data will be used for diffusion matrix calculation.
        dims: `list` or None (default: `None`)
            The list of dimensions that will be selected for diffusion matrix calculation. If `None`, all dimensions will
            be used.
        n: `int` (default: `30`)
            Number of nearest neighbors when the nearest neighbor graph is not included.
        VecFld: `dictionary` or None (default: None)
            The reconstructed vector field function. When `None` and `residual` is `vector_field`, it is looked up in
            `adata` for `basis`.
        residual: `str` or None (default: `vector_field`)
            Method to calculate residual velocity vectors for diffusion matrix calculation. If `average`, all velocity
            of the nearest neighbor cells will be minused by its average velocity; if `vector_field`, all velocity will
            be minused by the predicted velocity from the reconstructed deterministic velocity vector field.

    Returns
    -------
        Nothing; `adata` is updated in place with the `diffusion_matrix` key in the `uns` attribute, which is a list
        of the diffusion matrix for each cell. A column `diffusion` in `obs`, the square root of the sum of all
        elements of each cell's diffusion matrix, is also added.

    Raises
    ------
        ValueError
            If `residual` is not supported, or the requested genes / layer / basis / velocity key cannot be found in
            `adata`.
    """

    if residual not in ("average", "vector_field"):
        # Fail before the neighbour search rather than after it.
        raise ValueError(
            f"The method for calculate residual {residual} is not supported. "
            f'Currently only {"average", "vector_field"} supported.'
        )

    # `basis` is rewritten with its "X_"/layer prefix below; the vector field
    # lookup keeps using the value the caller passed.
    vecfld_basis = basis

    # Derive the data from adata unless the caller supplied BOTH matrices.
    # (The guard used to read `V_data is not None`, which threw away
    # user-supplied data and let a lone X_data through with V_data unset.)
    if X_data is None or V_data is None:
        if genes is not None:
            genes = adata.var_names.intersection(genes).to_list()
            if len(genes) == 0:
                raise ValueError(f"no genes from your genes list appear in your adata object.")
        if layer is not None:
            if layer not in adata.layers.keys():
                raise ValueError(f"the layer {layer} you provided is not included in the adata object!")

            if basis is None:
                vkey = "velocity_" + layer[0].upper()
                if vkey not in adata.obsm.keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

        prefix = "X_" if layer is None else layer + "_"

        if basis is not None:
            if basis.split(prefix)[-1] not in [
                "pca",
                "umap",
                "trimap",
                "tsne",
                "diffmap",
            ]:
                raise ValueError(
                    f"basis (or the suffix of basis) can only be one of "
                    f"['pca', 'umap', 'trimap', 'tsne', 'diffmap']."
                )
            if basis.startswith(prefix):
                vkey = "velocity_" + basis.split(prefix)[-1]
            else:
                vkey = "velocity_" + basis
                basis = prefix + basis

            if vkey not in adata.obsm_keys():
                raise ValueError(
                    f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                )

        if basis is None:
            # NOTE(review): the gene-space branches read the velocity from
            # `.uns`, so subsetting adata by genes does not subset it —
            # confirm that this is where the velocity is stored.
            if layer is None:
                vkey = "velocity_S"
                if vkey not in adata.uns_keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

                if genes is not None:
                    X_data, V_data = (
                        adata[:, genes].X,
                        adata[:, genes].uns[vkey],
                    )
                else:
                    if "use_for_dynamics" not in adata.var.keys():
                        X_data, V_data = adata.X, adata.uns[vkey]
                    else:
                        X_data, V_data = (
                            adata[:, adata.var.use_for_dynamics].X,
                            adata[:, adata.var.use_for_dynamics].uns[vkey],
                        )
            else:
                vkey = "velocity_" + layer[0].upper()
                if vkey not in adata.uns_keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

                if genes is not None:
                    X_data, V_data = (
                        adata[:, genes].layers[layer],
                        adata[:, genes].uns[vkey],
                    )
                else:
                    if "use_for_dynamics" not in adata.var.keys():
                        X_data, V_data = adata.layers[layer], adata.uns[vkey]
                    else:
                        X_data, V_data = (
                            adata[:, adata.var.use_for_dynamics].layers[layer],
                            adata[:, adata.var.use_for_dynamics].uns[vkey],
                        )
                X_data = log1p_(adata, X_data)
        else:
            X_data, V_data = adata.obsm[basis], adata.obsm[vkey]

        if dims is not None:
            X_data, V_data = X_data[:, dims], V_data[:, dims]

    neighbor_result_prefix = "" if layer is None else layer
    conn_key, dist_key, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix)
    # NOTE(review): X_data and V_data are always set by now, so this condition
    # is always true and the stored neighbor graph in the `else` branch is
    # never used. Kept as-is to preserve the computed neighborhoods.
    if neighbor_key not in adata.uns_keys() or (X_data is not None and V_data is not None):
        if X_data.shape[0] > 200000 and X_data.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X_data,
                metric="euclidean",
                n_neighbors=n,
                n_jobs=-1,
                random_state=19491001,
            )
            Idx, _ = nbrs.query(X_data, k=n)
        else:
            alg = "ball_tree" if X_data.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n, algorithm=alg, n_jobs=-1).fit(X_data)
            _, Idx = nbrs.kneighbors(X_data)
    else:
        check_and_recompute_neighbors(adata, result_prefix=layer)
        conn_key = "connectivities" if layer is None else layer + "_connectivities"
        neighbors = adata.obsp[conn_key]
        Idx = neighbors.tolil().rows

    if residual == "average":
        V_ave = np.zeros_like(V_data)
        for i in range(X_data.shape[0]):
            vv = V_data[Idx[i]]
            V_ave[i] = vv.mean(0)
    else:
        # residual == "vector_field". The vector field function is resolved
        # here, where it is needed, so that it is also available when the
        # caller supplied X_data / V_data directly.
        if VecFld is None:
            VecFld, func = vecfld_from_adata(adata, vecfld_basis)
        else:
            func = lambda x: vector_field_function(x, VecFld)
        V_ave = func(X_data)

    V_diff = V_data - V_ave

    val = np.zeros((V_data.shape[0], 1))
    dmatrix = [None] * V_data.shape[0]

    for i in tqdm(range(X_data.shape[0]), "calculating diffusion matrix for each cell."):
        vv = V_diff[Idx[i]]
        d = np.cov(vv.T)
        # np.sum also copes with the 0-d covariance of one-dimensional data.
        val[i] = np.sqrt(np.sum(d))
        dmatrix[i] = d

    adata.obs["diffusion"] = val
    adata.uns["diffusion_matrix"] = dmatrix
def cell_velocities(adata,
                    ekey=None,
                    vkey=None,
                    X=None,
                    V_mat=None,
                    X_embedding=None,
                    use_mnn=False,
                    neighbors_from_basis=False,
                    n_pca_components=None,
                    min_r2=0.01,
                    min_alpha=0.01,
                    min_gamma=0.01,
                    min_delta=0.01,
                    basis="umap",
                    method="pearson",
                    neg_cells_trick=True,
                    calc_rnd_vel=False,
                    xy_grid_nums=(50, 50),
                    correct_density=True,
                    scale=True,
                    sample_fraction=None,
                    random_seed=19491001,
                    other_kernels_dict={},
                    enforce=False,
                    key=None,
                    preserve_len=False,
                    **kmc_kwargs):
    """Compute transition probability and project high dimension velocity vector to existing low dimension embedding.

    It is powered by the Itô kernel that not only considers the correlation between the vector from any cell to its
    nearest neighbors and its velocity vector but also the corresponding distances. We expect this new kernel will enable
    us to visualize more intricate vector flow or steady states in low dimension. We also expect it will improve the
    calculation of the stationary distribution or source states of sampled cells. The original "correlation/cosine"
    velocity projection method is also supported. Kernels based on the reconstructed velocity field is also possible.

    With the `key` argument, `cell_velocities` can be called by `cell_accelerations` to calculate RNA acceleration vector
    for each cell.

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        ekey: `str` or None (optional, default `None`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, ekey and vkey
            will be automatically detected from the adata object.
        vkey: 'str' or None (optional, default `None`)
            The dictionary key that corresponds to the estimated velocity values in the layers attribute.
        X: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single cells)
        V_mat: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of single
            cells). Note that X, V_mat need to have the exact dimensionalities.
        X_embedding: 'str' or None (optional, default `None`)
            The low expression reduced space (pca, umap, tsne, etc.) of single cells that RNA velocity will be projected
            onto. Note X_embedding, X and V_mat has to have the same cell/sample dimension and X_embedding should have
            less feature dimension comparing that of X or V_mat.
        use_mnn: `bool` (optional, default `False`)
            Whether to use mutual nearest neighbors for projecting the high dimensional velocity vectors. By default, we
            don't use the mutual nearest neighbors. Mutual nearest neighbors are calculated from nearest neighbors across
            different layers, which accounts for cases where, for example, the cells from spliced expression may be
            nearest neighbors but far from nearest neighbors on unspliced data. Using mnn assumes your data from different
            layers are reliable (otherwise it will destroy real signals).
        neighbors_from_basis: `bool` (optional, default `False`)
            Whether to construct nearest neighbors from low dimensional space as defined by the `basis`, instead of using
            that calculated during UMAP process.
        n_pca_components: `int` (optional, default `None`)
            The number of pca components to project the high dimensional X, V before calculating transition matrix for
            velocity visualization. By default it is None and if method is `kmc`, n_pca_components will be reset to 30;
            otherwise use all high dimensional data for velocity projection.
        min_r2: `float` (optional, default `0.01`)
            The minimal value of r-squared of the parameter fits for selecting velocity genes.
        min_alpha: `float` (optional, default `0.01`)
            The minimal value of alpha kinetic parameter for selecting velocity genes.
        min_gamma: `float` (optional, default `0.01`)
            The minimal value of gamma kinetic parameter for selecting velocity genes.
        min_delta: `float` (optional, default `0.01`)
            The minimal value of delta kinetic parameter for selecting velocity genes.
        basis: `str` (optional, default `umap`)
            The dictionary key that corresponds to the reduced dimension in `.obsm` attribute.
        method: `string` (optional, default `pearson`)
            The method to calculate the transition matrix and project high dimensional vector to low dimension, either `kmc`,
            `cosine`, `pearson`, or `transform`. "kmc" is our new approach to learn the transition matrix via diffusion
            approximation or an Itô kernel. "cosine" or "pearson" are the methods used in the original RNA velocity paper
            or the scvelo paper (Note that scVelo implementation actually centers both dX and V, so its cosine kernel is
            equivalent to pearson correlation kernel but we also provide the raw cosine kernel). "kmc" option is arguable
            better than "correlation" or "cosine" as it not only considers the correlation but also the distance of the
            nearest neighbors to the high dimensional velocity vector. Finally, the "transform" method uses umap's transform
            method to transform new data points to the UMAP space. "transform" method is NOT recommended; it produces no
            transition matrix, so it cannot be combined with `calc_rnd_vel` or `preserve_len`. Kernels that
            are based on the reconstructed vector field in high dimension is also possible.
        neg_cells_trick: 'bool' (optional, default `True`)
            Whether we should handle cells having negative correlations in gene expression difference with high dimensional
            velocity vector separately. This option was borrowed from scVelo package (https://github.com/theislab/scvelo)
            and use in conjunction with "pearson" and "cosine" kernel. Not required if method is set to be "kmc".
        calc_rnd_vel: `bool` (default: `False`)
            A logic flag to determine whether we will calculate the random velocity vectors which can be plotted
            downstream as a negative control and used to adjust the quiver scale of the velocity field.
        xy_grid_nums: `tuple` (default: `(50, 50)`).
            A tuple of number of grids on each dimension.
        correct_density: `bool` (default: `True`)
            Whether to correct density when calculating the markov transition matrix, applicable to the `kmc` kernel.
        scale: `bool` (default: `True`)
            Whether to scale velocity when calculating the markov transition matrix, applicable to the `kmc` kernel.
        sample_fraction: `None` or `float` (default: `None`)
            The downsampled fraction of kNN for the purpose of acceleration, applicable to the `kmc` kernel.
        random_seed: `int` (default: `19491001`)
            The random seed for numba to ensure consistency of the random velocity vectors. Default value 19491001 is a
            special day for those who care.
        other_kernels_dict: `dict` (default: `{}`)
            A dictionary of paramters that will be passed to the cosine/correlation kernel.
        enforce: `bool` (default: `False`)
            Whether to enforce 1) redefining use_for_velocity column in obs attribute;
                               2) recalculation of transition matrix.
        key: `str` or None (default: `None`)
            The prefix key that will be prefixed to the keys for storing calculated transition matrix, projection vectors, etc.
        preserve_len: `bool` (default: `False`)
            Whether to preserve the length of high dimension vector length. When set to be True, the length  of low
            dimension projected vector will be proportionally scaled to that of the high dimensional vector.
        kmc_kwargs:
            Additional keyword arguments merged into the defaults passed to `KernelMarkovChain.fit`.

    Returns
    -------
        Adata: :class:`~anndata.AnnData`
            Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension velocity
            vectors in the existing embeddings of current cell state, calculated using either the Itô kernel method
            (default) or the diffusion approximation or the method from (La Manno et al. 2018).

    Raises
    ------
        ValueError
            If the velocity layer cannot be found, `method` is unknown, or `method="transform"` is combined with
            `calc_rnd_vel` or `preserve_len`.
        Exception
            If X / V_mat / X_embedding have inconsistent shapes.
    """

    mapper_r = get_mapper_inverse()
    layer = mapper_r[ekey] if (ekey is not None
                               and ekey in mapper_r.keys()) else ekey
    ekey, vkey, layer = (get_ekey_vkey_from_adata(adata) if
                         (ekey is None or vkey is None) else
                         (ekey, vkey, layer))

    if calc_rnd_vel:
        numba_random_seed(random_seed)

    # NOTE(review): `indices` (and, for the pearson/cosine kernels,
    # `neighbors`) are only bound by this block or by the
    # `neighbors_from_basis` block further down; with neither, or with
    # neighbors_from_basis=True and a correlation kernel, the later use
    # raises NameError — confirm the intended fallback.
    if (not neighbors_from_basis) and ("neighbors" in adata.uns.keys()):
        if use_mnn:
            neighbors = adata.uns["mnn"]
            indices, dist = extract_indices_dist_from_graph(
                neighbors, adata.uns["neighbors"]["indices"].shape[1])
            indices, dist = indices[:, 1:], dist[:, 1:]
        else:
            if adata.obsp["distances"].shape[0] == adata.obsp[
                    "distances"].shape[1]:
                knn_indices, knn_dists = extract_indices_dist_from_graph(
                    adata.obsp["distances"],
                    30  # np.min((adata.uns["neighbors"]["connectivities"] > 0).sum(1).A)
                )
                knn_dists = build_distance_graph(knn_indices, knn_dists)

                adata.uns["neighbors"]["indices"], adata.obsp[
                    "distances"] = knn_indices, knn_dists
            neighbors, dist, indices = (
                adata.obsp["connectivities"],
                adata.obsp["distances"],
                adata.uns["neighbors"]["indices"],
            )
            # drop the first column (the cell itself)
            indices, dist = indices[:, 1:], dist[:, 1:]

    if 'use_for_velocity' not in adata.var.keys() or enforce:
        use_for_dynamics = "use_for_dynamics" in adata.var.keys()
        adata = set_velocity_genes(
            adata,
            vkey="velocity_S",
            min_r2=min_r2,
            use_for_dynamics=use_for_dynamics,
            min_alpha=min_alpha,
            min_gamma=min_gamma,
            min_delta=min_delta,
        )

    X = adata[:, adata.var.use_for_velocity.
              values].layers[ekey] if X is None else X
    V_mat = (adata[:, adata.var.use_for_velocity.values].layers[vkey] if vkey
             in adata.layers.keys() else None) if V_mat is None else V_mat

    if V_mat is None:
        # Previously surfaced as an AttributeError on `V_mat.shape`.
        raise ValueError(
            f"the velocity key {vkey} is not included in the layers of the adata object and no V_mat was provided!"
        )

    # Either mismatch is fatal (the message says "or"; the check used "and").
    if X.shape != V_mat.shape or X.shape[0] != adata.n_obs:
        raise Exception(
            f"X and V_mat doesn't have the same dimensionalities or X/V_mat doesn't {adata.n_obs} rows!"
        )

    if X_embedding is None:
        if vkey == "velocity_S":
            X_embedding = adata.obsm["X_" + basis]
        else:
            adata = reduceDimension(adata, layer=layer, reduction_method=basis)
            X_embedding = adata.obsm[layer + "_" + basis]

    # Only the sample dimension is enforced: the old condition also required X
    # to be *wider* than the embedding before raising, so a plain row mismatch
    # slipped through. The feature-dimension clause of the message is advisory.
    if X.shape[0] != X_embedding.shape[0]:
        raise Exception(
            f"X and X_embedding doesn't have the same sample dimension or "
            f"X doesn't have the higher feature dimension!")

    # `.toarray()` instead of the `.A` shorthand, which newer SciPy drops.
    V_mat = V_mat.toarray() if issparse(V_mat) else V_mat
    X = X.toarray() if issparse(X) else X
    finite_inds = get_finite_inds(V_mat)
    X, V_mat = X[:, finite_inds], V_mat[:, finite_inds]

    if method == 'kmc' and n_pca_components is None:
        n_pca_components = 30
    if n_pca_components is not None:
        X = log1p_(adata, X)
        X_plus_V = log1p_(adata, X + V_mat)
        if ("velocity_pca_fit" not in adata.uns_keys()
                or isinstance(adata.uns["velocity_pca_fit"], str)):
            pca = PCA(
                n_components=min(n_pca_components, X.shape[1] - 1),
                svd_solver="arpack",
                random_state=0,
            )
            pca_fit = pca.fit(X)
            X_pca = pca_fit.transform(X)

            adata.uns["velocity_pca_fit"] = pca_fit
            adata.uns["velocity_PCs"] = pca_fit.components_.T
            adata.obsm["X_velocity_pca"] = X_pca

        X_pca, pca_fit = (
            adata.obsm["X_velocity_pca"],
            adata.uns["velocity_pca_fit"],
        )

        # Velocity in PCA space = projection of (X + V) minus projection of X.
        Y_pca = pca_fit.transform(X_plus_V)
        V_pca = Y_pca - X_pca

        adata.obsm["velocity_pca_raw"] = V_pca
        X, V_mat = X_pca[:, :n_pca_components], V_pca[:, :n_pca_components]

    if neighbors_from_basis:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            # Fixed: metric was misspelled 'eulcidean' and an undefined
            # `**kwargs` was forwarded, so this branch could never run.
            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=30,
                             n_jobs=-1,
                             random_state=19491001)
            indices, _ = nbrs.query(X, k=30)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg,
                                    n_jobs=-1).fit(X)
            _, indices = nbrs.kneighbors(X)

    # add both source and sink distribution
    if method == "kmc":
        if method + '_transition_matrix' in adata.uns_keys() and not enforce:
            T = adata.uns[method + '_transition_matrix']
            kmc = KernelMarkovChain(P=T)
        else:
            kmc = KernelMarkovChain()
        kmc_args = {
            "n_recurse_neighbors": 2,
            "M_diff": 2,
            "epsilon": None,
            "adaptive_local_kernel": True,
            "tol": 1e-7,
        }
        kmc_args = update_dict(kmc_args, kmc_kwargs)

        # Always fit. The old guard (`... not in uns or not enforce`) skipped
        # the fit only when a cached matrix existed AND enforce=True, i.e.
        # exactly when a fresh, empty chain had just been created above.
        kmc.fit(X,
                V_mat,
                neighbor_idx=indices,
                sample_fraction=sample_fraction,
                **kmc_args)
        # The transition matrix stored below; it used to be left unbound
        # whenever no cached matrix was available. Read the same way as
        # `T_rnd` in the random-control block.
        T = kmc.P

        if correct_density:
            delta_X = kmc.compute_density_corrected_drift(
                X_embedding, kmc.Idx, normalize_vector=True, scale=scale)
        else:
            delta_X = kmc.compute_drift(X_embedding, num_prop=1, scale=scale)

        X_grid, V_grid, D = velocity_on_grid(X_embedding,
                                             delta_X,
                                             xy_grid_nums=xy_grid_nums)

        if calc_rnd_vel:
            kmc = KernelMarkovChain()
            permute_rows_nsign(V_mat)
            kmc.fit(X, V_mat, **kmc_args)
            T_rnd = kmc.P
            if correct_density:
                delta_X_rnd = kmc.compute_density_corrected_drift(
                    X_embedding, kmc.Idx, normalize_vector=True)
            else:
                delta_X_rnd = kmc.compute_drift(X_embedding)
            X_grid_rnd, V_grid_rnd, D_rnd = velocity_on_grid(
                X_embedding, delta_X_rnd, xy_grid_nums=xy_grid_nums)
        # NOTE(review): with calc_rnd_vel=True this stores the chain fitted on
        # the permuted velocities, not the real one — confirm this is intended.
        adata.uns["kmc"] = kmc
    elif method in ["pearson", "cosine"]:
        vs_kwargs = {
            "n_recurse_neighbors": 2,
            "max_neighs": None,
            "transform": 'sqrt',
            "use_neg_vals": True,
        }
        vs_kwargs = update_dict(vs_kwargs, other_kernels_dict)

        if method + '_transition_matrix' in adata.uns_keys() and not enforce:
            # Reuse the cached transition matrix.
            T = adata.uns[method + '_transition_matrix']
            delta_X = projection_with_transition_matrix(
                X.shape[0], T, X_embedding)
            X_grid, V_grid, D = velocity_on_grid(
                X_embedding[:, :2], (X_embedding + delta_X)[:, :2],
                xy_grid_nums=xy_grid_nums)
        else:
            T, delta_X, X_grid, V_grid, D = kernels_from_velocyto_scvelo(
                X, X_embedding, V_mat, indices, neg_cells_trick, xy_grid_nums,
                neighbors, method, **vs_kwargs)

        if calc_rnd_vel:
            permute_rows_nsign(V_mat)
            T_rnd, delta_X_rnd, X_grid_rnd, V_grid_rnd, D_rnd = kernels_from_velocyto_scvelo(
                X, X_embedding, V_mat, indices, neg_cells_trick, xy_grid_nums,
                neighbors, method, **vs_kwargs)
    elif method == "transform":
        # This method yields no transition matrix, so anything that depends on
        # one is rejected up front instead of failing later with a NameError.
        if calc_rnd_vel:
            raise ValueError(
                "calc_rnd_vel is not supported when method is 'transform'.")
        if preserve_len:
            raise ValueError(
                "preserve_len is not supported when method is 'transform' "
                "because no transition matrix is calculated.")
        T = None

        umap_trans, n_pca_components = (
            adata.uns["umap_fit"]["fit"],
            adata.uns["umap_fit"]["n_pca_components"],
        )

        # Bound unconditionally: it is needed below even when the PCA fit is
        # already cached.
        CM = adata.X[:, adata.var.use_for_dynamics.values]
        if "pca_fit" not in adata.uns_keys() or isinstance(
                adata.uns["pca_fit"], str):
            from ..preprocessing.utils import pca

            adata, pca_fit, _ = pca(adata, CM, n_pca_components, "X")
            adata.uns["pca_fit"] = pca_fit

        pca_fit = adata.uns["pca_fit"]
        # NOTE(review): V is None when vkey is not in adata.layers, which
        # fails on the NaN masking below — confirm the expected input.
        V = (adata[:, adata.var.use_for_dynamics.values].layers[vkey]
             if vkey in adata.layers.keys() else None)
        CM = CM.toarray() if issparse(CM) else CM
        V = V.toarray() if issparse(V) else V
        V[np.isnan(V)] = 0
        Y_pca = pca_fit.transform(CM + V)

        Y = umap_trans.transform(Y_pca)

        delta_X = Y - X_embedding

        # A stray trailing comma used to wrap this call in a 1-tuple, which
        # made the three-way unpacking fail.
        X_grid, V_grid, D = velocity_on_grid(X_embedding,
                                             delta_X,
                                             xy_grid_nums=xy_grid_nums)
    else:
        raise ValueError(
            f"method {method} is not supported. Currently only 'kmc', 'cosine', 'pearson' and 'transform' "
            f"are supported.")

    if preserve_len:
        basis_len, high_len = np.linalg.norm(delta_X, axis=1), np.linalg.norm(
            V_mat, axis=1)
        scaler = np.nanmedian(basis_len) / np.nanmedian(high_len)
        for i in tqdm(range(adata.n_obs), desc=f"rescaling velocity norm..."):
            # NOTE(review): `.indices` / `.data` presume T is a scipy sparse
            # matrix — confirm for every kernel.
            idx = T[i].indices
            high_len_ = high_len[idx]
            T_i = T[i].data
            delta_X[i] *= T_i.dot(high_len_) / basis_len[i] * scaler

    if key is None:
        if T is not None:
            adata.uns[method + "_transition_matrix"] = T
        adata.obsm["velocity_" + basis] = delta_X
        adata.uns["grid_velocity_" + basis] = {
            "X_grid": X_grid,
            "V_grid": V_grid,
            "D": D
        }
    else:
        if T is not None:
            adata.uns[key + '_' + method + "_transition_matrix"] = T
        adata.obsm[key + '_' + basis] = delta_X
        adata.uns["grid_" + key + '_' + basis] = {
            "X_grid": X_grid,
            "V_grid": V_grid,
            "D": D
        }

    if calc_rnd_vel:
        if key is None:
            adata.uns[method + "_transition_matrix_rnd"] = T_rnd
            adata.obsm["X_" + basis + "_rnd"] = X_embedding
            adata.obsm["velocity_" + basis + "_rnd"] = delta_X_rnd
            adata.uns["grid_velocity_" + basis + "_rnd"] = {
                "X_grid": X_grid_rnd,
                "V_grid": V_grid_rnd,
                "D": D_rnd,
            }
        else:
            adata.uns[key + '_' + method + "_transition_matrix_rnd"] = T_rnd
            adata.obsm["X_" + key + "_" + basis + "_rnd"] = X_embedding
            adata.obsm[key + "_" + basis + "_rnd"] = delta_X_rnd
            adata.uns["grid_" + key + '_' + basis + "_rnd"] = {
                "X_grid": X_grid_rnd,
                "V_grid": V_grid_rnd,
                "D": D_rnd,
            }

    return adata
def cell_wise_confidence(
    adata,
    X_data=None,
    V_data=None,
    ekey="M_s",
    vkey="velocity_S",
    neighbors_from_basis=False,
    method="jaccard",
):
    """Calculate the cell-wise velocity confidence metric.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        X_data: `np.ndarray` or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single
            cells). When given it is used as is (no inverse normalization is applied).
        V_data: `np.ndarray` or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of
            single cells). Note that X and V need to have the exact same dimensionalities.
        ekey: `str` (optional, default `M_s`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, it is the
            smoothed expression `M_s`. The special value `"X"` reads `adata.X` instead of a layer.
        vkey: `str` (optional, default `velocity_S`)
            The dictionary key that corresponds to the estimated velocity values in layers attribute.
        neighbors_from_basis: `bool` (optional, default `False`)
            If `False`, the neighbor graph stored in `adata.obsp["connectivities"]` is used. If `True`, a 30-nearest
            neighbor graph is built directly on the expression matrix `X` selected above.
        method: `str` (optional, default `jaccard`)
            Which method will be used for calculating the cell wise velocity confidence metric. One of `jaccard`,
            `hybrid`, `cosine`, `consensus` or `correlation`. By default it uses `jaccard` index, which measures how
            well each velocity vector meets the geometric constraints defined by the local neighborhood structure.
            Jaccard index is calculated as the fraction of the number of the intersected set of nearest neighbors
            from each cell at current expression state (X) and that from the future expression state (X + V) over the
            number of the union of these two sets. `hybrid` multiplies the jaccard index by the mean consensus score
            against the intersected neighbors. The `cosine` or `correlation` method is similar to that used by scVelo
            (https://github.com/theislab/scvelo).

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            The same object, updated with `.obs[method + "_velocity_confidence"]` holding the cell-wise velocity
            confidence.

    Raises
    ------
        NotImplementedError
            If `method` is not one of the supported methods (this includes the placeholder `divergence`). The check
            happens before any computation or modification of `adata`.
    """
    supported_methods = ("jaccard", "hybrid", "cosine", "consensus", "correlation")
    if method not in supported_methods:
        # Fail fast: "divergence" used to fall through and crash on an
        # undefined `confidence`; other names were only rejected after the
        # (expensive) neighbor computation.
        raise NotImplementedError(
            "The input {} method for cell-wise velocity confidence calculation is not implemented"
            " yet".format(method))

    if method in ["cosine", "consensus", "correlation"]:
        if "indices" not in adata.uns["neighbors"].keys():
            adata.uns["neighbors"]["indices"], _ = adj_to_knn(
                adata.obsp["connectivities"],
                n_neighbors=adata.uns["neighbors"]["params"]["n_neighbors"],
            )

    V = adata.layers[vkey] if V_data is None else V_data
    if X_data is not None:
        X = X_data
    elif ekey == "X":
        # Temporarily pretend the data were log1p-normalized so that
        # inverse_norm undoes it; always restore the recorded setting.
        # Rebinding the key does not mutate the old value, so no copy is needed.
        saved_norm_method = adata.uns["pp"]["norm_method"]
        adata.uns["pp"]["norm_method"] = "log1p"
        try:
            X = inverse_norm(adata, adata.X)
        finally:
            adata.uns["pp"]["norm_method"] = saved_norm_method
    else:
        X = inverse_norm(adata, adata.layers[ekey])

    if not neighbors_from_basis:
        check_and_recompute_neighbors(adata, result_prefix="")
        n_neigh = adata.uns["neighbors"]["params"]["n_neighbors"]
        X_neighbors = adata.obsp["connectivities"]
    else:
        n_neigh = 30
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=n_neigh + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n_neigh + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        # Column 0 is used as the source cell, the remaining columns as its neighbors.
        row = np.repeat(nbrs_idx[:, 0], n_neigh)
        col = nbrs_idx[:, 1:].flatten()
        X_neighbors = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )

    n_neigh = n_neigh[0] if isinstance(n_neigh, np.ndarray) else n_neigh
    n_pca_components = adata.obsm["X"].shape[1]

    finite_inds = get_finite_inds(V, 0)
    X, V = X[:, finite_inds], V[:, finite_inds]

    V_is_sparse = issparse(V)

    def flat_row(j):
        """Return the velocity vector of cell `j` as a flattened dense array."""
        row = V[j]
        return row.toarray().flatten() if V_is_sparse else row.flatten()

    if method == "jaccard":
        confidence, _, _ = jaccard(X, V, n_pca_components, n_neigh, X_neighbors)
    elif method == "hybrid":
        # this is inspired from the locality preservation paper
        jac, intersect_, _ = jaccard(X, V, n_pca_components, n_neigh,
                                     X_neighbors)
        intersect_is_sparse = issparse(intersect_)

        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating hybrid method (jaccard + consensus) based cell wise confidence",
        ):
            shared = intersect_[i].toarray() if intersect_is_sparse else intersect_[i]
            # flatnonzero yields neighbor (column) ids for both the (1, n)
            # sparse-row case and the 1-D dense case; np.where(...)[0] on a
            # (1, n) array would return row ids, i.e. all zeros.
            neigh_ids = np.flatnonzero(shared)
            v_i = flat_row(i)
            confidence[i] = jac[i] * np.mean(
                [consensus(v_i, flat_row(j)) for j in neigh_ids])
    else:
        # cosine / consensus / correlation all compare each cell's velocity
        # with that of its stored nearest neighbors.
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        corr_type = "cosine" if method == "cosine" else "pearson"  # "correlation" is equivalent to scVelo

        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc=f"calculating {method} based cell wise confidence",
        ):
            v_i = flat_row(i)
            if method == "consensus":
                scores = [consensus(v_i, flat_row(j)) for j in indices[i]]
            else:
                scores = [
                    einsum_correlation(v_i[None, :], flat_row(j), type=corr_type)[0, 0]
                    for j in indices[i]
                ]
            confidence[i] = np.mean(scores)

    adata.obs[method + "_velocity_confidence"] = confidence

    return adata
class NearestNeighbors:
    """Greedy algorithm to balance a K-nearest neighbour graph.

    It has an API similar to scikit-learn.

    Parameters
    ----------
    k : int (default=50)
        the number of neighbours in the final graph
    sight_k : int (default=100)
        the number of neighbours in the initialization graph.
        It corresponds to the farthest neighbour that a sample is allowed to connect to
        when no closer neighbours are allowed. If sight_k is reached then the matrix is filled
        with the sample itself
    maxl : int (default=200)
        max degree of connectivity allowed. Avoids the presence of hubs in the graph, it is the
        maximum number of neighbours that are allowed to contact a node before the node is blocked
    mode : str (default="distance")
        which kind of output is wanted, "distance" or "connectivity". The value is only stored on
        the instance; `fit` always builds the balanced graph in "distance" mode.
    metric : str (default="euclidean")
        distance metric of the initial search. "correlation" uses a brute-force search, "js" uses
        NNDescent with the Jensen-Shannon distance, anything else is handed to the underlying
        neighbour searcher.
    minkowski_p : int (default=20)
        passed as `p` to the underlying neighbour searcher
    n_jobs : int (default=-1)
        parallelization of the standard KNN search performed at initialization
    """
    def __init__(self,
                 k: int = 50,
                 sight_k: int = 100,
                 maxl: int = 200,
                 mode: str = "distance",
                 metric: str = "euclidean",
                 minkowski_p: int = 20,
                 n_jobs: int = -1) -> None:
        # input parameters
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.minkowski_p = minkowski_p
        self.n_jobs = n_jobs

        # NN graphs
        self.data = None
        self._nn = None  # raw KNN
        self.bknn = None  # balanced KNN
        self.dist = None  # balanced KNN distances
        self.dsi = None  # balanced KNN neighbor index
        self.l = None  # balanced KNN degree of connectivity
        self.mknn = None  # mutual KNN based on bknn
        self.rnn = None  # radius NN based on mknn

    @property
    def n_samples(self) -> int:
        """Number of rows in the currently stored data."""
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fit the initial neighbour search and build the balanced KNN graph.

        Parameters
        ----------
        data: np.ndarray (samples, features)
            the data to index
        sight_k: int
            the farthest point that a node is allowed to connect to when its closest neighbours
            are not allowed. If given, it overrides the value set at construction.

        Returns
        -------
        self
        """
        self.data = data
        if sight_k is not None:
            self.sight_k = sight_k

        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples}"
        )
        # NOTE: this seeds NumPy's global random state (a process-wide side effect).
        np.random.seed(13)
        if self.metric == "correlation":
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         algorithm="brute")
            self._nn.fit(self.data)
        elif self.metric == "js":
            self._nn = NNDescent(data=self.data,
                                 metric=jensen_shannon_distance)
        else:
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         leaf_size=30)
            self._nn.fit(self.data)

        # call this to calculate bknn
        self.kneighbors_graph(mode='distance')
        return self

    def kneighbors(self,
                   X: np.ndarray = None,
                   maxl: int = None,
                   mode: str = "distance"
                   ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Find the balanced K-neighbors of each point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points. If provided, it replaces the stored data;
            otherwise the data passed to `fit` is queried.
        maxl: int
            max degree of connectivity allowed. If provided, it replaces the stored value.
        mode : "distance" or "connectivity"
            Decides the kind of output. In "connectivity" mode every neighbour gets
            distance 1 and the first column gets 0.

        Returns
        -------
        dist_new : np.ndarray (samples, k+1)
            distances to the NN
        dsi_new : np.ndarray (samples, k+1)
            indexes of the NN, first column is the sample itself
        l: np.ndarray (samples)
            l[i] is the number of connections from other samples to the sample i

        Raises
        ------
        ValueError
            If called before `fit`.

        NOTE:
        First column (0) correspond to the sample itself, the nearest neighbour is at the second column (1)
        """
        if self._nn is None:
            raise ValueError('must fit() before generating kneighbors graphs')

        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl

        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self._nn.query(self.data,
                                                     k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self._nn.kneighbors(self.data,
                                                          return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self._nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self._nn.kneighbors(self.data,
                                               return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0

        logging.debug(
            f"Using the initialization network to find a {self.k}-NN "
            f"graph with maximum connectivity of {self.maxl}")
        self.dist, self.dsi, self.l = knn_balance(self.dsi,
                                                  self.dist,
                                                  maxl=self.maxl,
                                                  k=self.k)
        return self.dist, self.dsi, self.l

    def kneighbors_graph(self,
                         X: np.ndarray = None,
                         maxl: int = None,
                         mode: str = "distance") -> sparse.csr_matrix:
        """Return the balanced K-neighbors graph as a sparse csr matrix.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points. If provided, it replaces the stored data.
        maxl: int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        neighbor_graph : scipy.sparse.csr_matrix
            The values are either distances or connectivity depending on the mode parameter.
            Explicit zeros are eliminated before returning.
        """
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        n_rows, n_cols = dist_new.shape
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new), np.ravel(dsi_new),
             np.arange(0, n_rows * n_cols + 1, n_cols)),
            (self.n_samples, self.n_samples))
        self.bknn.eliminate_zeros()
        # The derived graphs were computed from the previous bknn; drop them so
        # that mnn_graph()/rnn_graph() do not hand back stale results.
        self.mknn = None
        self.rnn = None
        return self.bknn

    def mnn_graph(self):
        """Get the mutual nearest neighbor graph from bknn (computed once, then cached)."""
        if self.mknn is None:
            if self.bknn is None:
                raise ValueError(
                    'must fit() before generating kneighbors graphs')
            # element-wise minimum between bknn and bknn.T, so non-mutual value will be 0
            self.mknn = self.bknn.minimum(self.bknn.transpose())
        return self.mknn

    def rnn_graph(self):
        """Get the radius NN graph from mknn, returned as a sparse COO matrix of similarities."""
        # Convert distances to similarities
        if self.mknn is None:
            self.mnn_graph()
        mknn_sim = self.mknn.copy()
        bknn_sim = self.bknn.copy()
        max_d = self.bknn.data.max()
        bknn_sim.data = (max_d - bknn_sim.data) / max_d
        mknn_sim.data = (max_d - mknn_sim.data) / max_d
        mknn_sim = mknn_sim.tocoo()
        mknn_sim.setdiag(0)

        # Compute the effective resolution
        d = 1 - bknn_sim.data
        radius = np.percentile(d, 90)
        logging.info(f"  90th percentile radius: {radius:.02}")
        # keep only mutual edges whose similarity exceeds the radius cutoff
        inside = mknn_sim.data > 1 - radius
        self.rnn = sparse.coo_matrix(
            (mknn_sim.data[inside],
             (mknn_sim.row[inside], mknn_sim.col[inside])),
            shape=mknn_sim.shape)
        return self.rnn
def fate_bias(
    adata,
    group,
    basis="umap",
    inds=None,
    speed_percentile=5,
    dist_threshold=None,
    source_groups=None,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    **kwargs,
):
    """Calculate the lineage (fate) bias of states whose trajectory are predicted.

    Fate bias is currently calculated as the percentage of points along the predicted cell fate trajectory whose
    distance to their 0-th nearest neighbors on the data are close enough (determined by median 1-st nearest neighbors
    of all observed cells and the dist_threshold) to any cell from each group specified by `group` key. The details are
    described as following:

    Cell fate predicted by our vector field method sometimes end up in regions that are not sampled with cells. We
    thus developed a heuristic method to iteratively walk backward the integration path to assign cell fate. We first
    identify the regions with small velocity in the tail of the integration path (determined by `speed_percentile`),
    then we check whether the distance of 0-th nearest points on the observed data to all those points are far away
    from the observed data (determined by `dist_threshold`). If they are not all close to data, we then walk backwards
    along the trajectory by one time step until the distance of any currently visited integration path's data points'
    0-th nearest points to the observed cells is close enough. In order to calculate the cell fate probability, we
    diffuse one step further of the identified nearest neighbors from the integration to identify more nearest observed
    cells, especially those from terminal cell types in case nearby cells first identified are all close to some random
    progenitor cells. Then we use group information of those observed cells to define the fate probability.

    `fate_bias` calculates a confidence score for the calculated fate probability with a simple metric, defined as
        :math:`1 - (sum(distances >= dist_threshold * median_dist) + walk_back_steps) / (len(distances) + walk_back_steps)`

    The `distances` are those from the currently visited integration path's data points to their nearest observed
    cells. `median_dist` is the median distance of the 1-st nearest cell of all observed cells. `walk_back_steps` is
    the number of steps walked backward along the integration path until some currently visited integration point
    satisfies the distance threshold. (Note when walking backward, the visited points do not necessarily have small
    velocity anymore.) If the walk reaches the start of the trajectory, the fate probabilities of that state are NaN
    and its confidence stays 0.

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            AnnData object that contains the predicted fate trajectories in the `uns` attribute.
        group: `str`
            The column key that corresponds to the cell type or other group information for quantifying the bias of
            cell state.
        basis: `str` or None (default: `umap`)
            The embedding data space where cell fates were predicted and cell fates bias will be quantified. If None,
            `adata.X` and the `fate` key of `.uns` are used.
        inds: `list` or `float` or None (default: `None`)
            The indices of the time steps that will be used for calculating fate bias. If inds is None, the last few
            steps of the fate prediction, determined from `speed_percentile` (but at least 10 steps), will be used.
            If inds is a float (between 0 and 1), it will be regarded as a percentage, and the last percentage of
            steps will be used for fate bias calculation. Otherwise inds needs to be a list of integers of the time
            steps.
        speed_percentile: `float` (default: `5`)
            The percentile of speed that will be used to determine the terminal cells (or sink region on the
            prediction path where speed is smaller than this speed percentile).
        dist_threshold: `float` or `None` (default: `None`)
            A multiplier of the median nearest cell distance on the embedding to determine cells that are outside the
            sampled domain of cells. If none of the identified "terminal cells" is within this distance of the data,
            we will look backward along the trajectory (by decreasing all indices by 1) until it finds cells that
            satisfy this threshold. By default it is set to be 1 to ensure only considering points that are very
            close to observed data points.
        source_groups: `list` or `None` (default: `None`)
            The groups that correspond to progenitor groups. They have to have at least one intersection with the
            groups from the `group` column. If not `None`, the fate probability assigned to those groups is moved to
            the group that has the highest fate probability among the other, non source_groups groups.
        metric: `str` or callable, default='euclidean'
            The distance metric to use for the neighbor search. Only used by the NNDescent search (more than 5000
            cells and more than 2 dimensions).
        metric_kwds : dict, default=None
            Additional keyword arguments for the metric function (NNDescent search only).
        cores: `int` (default: 1)
            The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
            :obj:`joblib.parallel_backend` context. ``-1`` means using all processors.
        seed: `int` (default `19491001`)
            Random seed to ensure the reproducibility of each run (NNDescent search only).
        kwargs:
            Additional arguments that will be passed to the NNDescent nearest neighbor search.

    Returns
    -------
        fate_bias: `pandas.DataFrame`
            A DataFrame with a `confidence` column followed by the fate bias of each cell state (row) to each cell
            group (column).

    Raises
    ------
        ValueError
            If `group`, the basis key or the fate key is missing from `adata`, or if `source_groups` shares no group
            with the `group` column.
    """

    if dist_threshold is None:
        dist_threshold = 1

    if group not in adata.obs.keys():
        raise ValueError(
            f"The group {group} you provided is not a key of .obs attribute.")
    clusters = adata.obs[group]

    basis_key = "X_" + basis if basis is not None else "X"
    fate_key = "fate_" + basis if basis is not None else "fate"

    if basis_key not in adata.obsm.keys():
        raise ValueError(
            f"The basis {basis_key} you provided is not a key of .obsm attribute."
        )
    if fate_key not in adata.uns.keys():
        raise ValueError(
            f"The {fate_key} key is not existed in the .uns attribute of the adata object. You need to run "
            f"dyn.pd.fate(adata, basis='{basis}') before calculate fate bias.")

    if source_groups is not None:
        if isinstance(source_groups, str):
            source_groups = [source_groups]
        # keep the user's input around so the error message can show it
        requested_groups = list(source_groups)
        source_groups = list(set(requested_groups).intersection(clusters))
        if len(source_groups) == 0:
            raise ValueError(
                f"the {requested_groups} you provided doesn't intersect with any groups in the {group} column."
            )

    X = adata.obsm[basis_key] if basis_key != "X" else adata.X

    if X.shape[0] > 5000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X,
                         metric=metric,
                         metric_kwds=metric_kwds,
                         n_neighbors=30,
                         n_jobs=cores,
                         random_state=seed,
                         **kwargs)
        knn, distances = nbrs.query(X, k=30)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg,
                                n_jobs=cores).fit(X)
        distances, knn = nbrs.kneighbors(X)

    def query_knn(points):
        """Return (neighbor indices, distances) of the 30 nearest observed cells of `points`."""
        if hasattr(nbrs, "query"):
            return nbrs.query(points, k=30)
        dist_, idx_ = nbrs.kneighbors(points)
        return idx_, dist_

    # column 0 is each cell itself, so column 1 is the nearest other cell
    median_dist = np.median(distances[:, 1])

    pred_dict = {}
    cell_predictions = adata.uns[fate_key]["prediction"]
    cell_indx = adata.uns[fate_key]["init_cells"]
    t = adata.uns[fate_key]["t"]
    confidence = np.zeros(len(t))

    for i, prediction in tqdm(enumerate(cell_predictions),
                              desc="calculating fate distributions"):
        cur_t, n_steps = t[i], len(t[i])

        # ensure to identify sink where the speed is very slow if inds is not provided.
        # if inds is the percentage, use the last percentage of steps to check for cell fate bias.
        # otherwise inds need to be a list.
        if inds is None:
            avg_speed = np.array(
                [np.linalg.norm(step)
                 for step in np.diff(prediction, 1).T]) / np.diff(cur_t)
            sink_checker = np.where(
                avg_speed[::-1] > np.percentile(avg_speed, speed_percentile)
            )[0]
            # number of trailing steps slower than the percentile; if no step
            # is faster than the percentile the whole path counts as slow
            n_tail = int(sink_checker.min()) if sink_checker.size else len(avg_speed)
            # clamp at 0 so short trajectories do not wrap around via negative indices
            start = max(n_steps - max(n_tail, 10), 0)
            indices = np.arange(start, n_steps)
        elif isinstance(inds, (float, np.floating)):
            indices = np.arange(int(n_steps - inds * n_steps), n_steps)
        else:
            # an array is required for the `indices - 1` walk-back arithmetic below
            indices = np.asarray(inds)

        knn, distances = query_knn(prediction[:, indices].T)

        # if final steps too far away from observed cells, ignore them
        walk_back_steps = 0
        while True:
            is_close = distances.flatten() < dist_threshold * median_dist
            if is_close.any():
                # let us diffuse one step further to identify cells from terminal cell types in case
                # cells with indices are all close to some random progenitor cells.
                knn, _ = query_knn(X[knn.flatten(), :])

                # knn holds row positions, hence positional (.iloc) lookup
                neighbor_ids = knn.flatten()
                fate_prob = clusters.iloc[neighbor_ids].value_counts() / len(
                    neighbor_ids)
                if source_groups is not None:
                    # a source group may have no neighbor among the counted cells
                    present = [g for g in source_groups if g in fate_prob.index]
                    source_p = fate_prob[present].sum()
                    if 1 > source_p > 0:
                        fate_prob[present] = 0
                        fate_prob[fate_prob.idxmax()] += source_p

                pred_dict[i] = fate_prob

                confidence[i] = 1 - (
                    np.sum(~is_close) + walk_back_steps) / (
                        len(is_close) + walk_back_steps)

                break

            walk_back_steps += 1

            if np.any(indices - 1 < 0):
                # reached the start of the trajectory without getting close to the data
                pred_dict[i] = clusters.iloc[
                    knn.flatten()].value_counts() * np.nan
                break

            knn, distances = query_knn(prediction[:, indices - 1].T)

            # after walking back only the nearest observed cell of each point is kept
            knn, distances = knn[:, 0], distances[:, 0]
            indices = indices - 1

    bias = pd.DataFrame(pred_dict).T
    conf = pd.DataFrame({"confidence": confidence}, index=bias.index)
    bias = pd.merge(conf, bias, left_index=True, right_index=True)

    if cell_indx is not None:
        bias.index = cell_indx

    return bias
def graphize_vecfld(
    func,
    X,
    nbrs_idx=None,
    dist=None,
    k=30,
    distance_free=True,
    n_int_steps=20,
    cores=1,
):
    """Build a sparse cell-by-cell graph from a vector field function.

    For every row of `nbrs_idx`, `construct_v` is called and the returned
    matrices are summed into one `(n, n)` sparse matrix, where `n` is the
    number of rows of `X`.

    Parameters
    ----------
        func:
            The vector field function; forwarded unchanged to `construct_v`.
        X: 2-D array, shape (n, d)
            Coordinates of the cells.
        nbrs_idx: array-like or None (default: `None`)
            Neighbor indices, one row per cell. If None, a (k + 1)-nearest
            neighbor search is run on `X` (NNDescent for more than 200000
            cells with more than 2 dimensions, a tree-based search otherwise)
            and `dist` is overwritten with the distances it returns.
        dist: array-like or None (default: `None`)
            Neighbor distances matching `nbrs_idx`; forwarded to `construct_v`.
        k: `int` (default: 30)
            Number of neighbors to search for when `nbrs_idx` is None.
        distance_free: `bool` (default: `True`)
            Forwarded to `construct_v`. When False and no `dist` is available,
            the condensed pairwise distances `pdist(X)` are computed and
            passed on instead.
        n_int_steps: `int` (default: 20)
            Forwarded to `construct_v`.
        cores: `int` (default: 1)
            With 1 the cells are processed serially; otherwise a thread pool
            with `cores` workers is used.

    Returns
    -------
        V: sparse matrix, shape (n, n)
            Sum of all per-cell `construct_v` results.
        nbrs:
            The neighbor-search object fitted here, or None if `nbrs_idx` was
            supplied by the caller.
    """
    n = X.shape[0]

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=k + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    # pairwise distances are only needed when nothing else provides distances
    D = pdist(X) if (dist is None and not distance_free) else None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in enumerate(
                LoggerManager.progress_logger(
                    nbrs_idx, progress_name="graphize_vecfld")):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free,
                             dist, D, n)
    else:
        pool = ThreadPool(cores)
        try:
            res = pool.starmap(
                construct_v,
                zip(
                    itertools.repeat(X),
                    np.arange(len(nbrs_idx)),
                    nbrs_idx,
                    itertools.repeat(n_int_steps),
                    itertools.repeat(func),
                    itertools.repeat(distance_free),
                    itertools.repeat(dist),
                    itertools.repeat(D),
                    itertools.repeat(n),
                ),
            )
        finally:
            # release the worker threads even if construct_v raised
            pool.close()
            pool.join()
        # start from the zero matrix so that an empty result list is handled
        # and the output format matches the serial branch
        V = functools.reduce((lambda a, b: a + b), res, V)
    return V, nbrs