def remove_duplicate_cells(adata):
    """Remove duplicate cells from the data matrix, in place.

    Duplicates are detected via :func:`get_duplicate_cells` in PCA space,
    so a PCA representation is computed first if not already present.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix. Subset in place; the neighbor graph is
        recomputed if one was present before subsetting.
    """
    # Duplicate detection operates on the PCA embedding.
    if 'X_pca' not in adata.obsm.keys():
        pca(adata)
    idx_duplicates = get_duplicate_cells(adata)
    if len(idx_duplicates) > 0:
        # Boolean mask keeping one representative per duplicate group.
        mask = np.ones(adata.n_obs, bool)
        mask[idx_duplicates] = 0
        logg.info('Removed', len(idx_duplicates), 'duplicate cells.')
        adata._inplace_subset_obs(mask)
        # Only refresh the neighbor graph when one already existed; do not
        # introduce neighbors into an object that never had them.
        if 'neighbors' in adata.uns.keys():
            neighbors(adata)
def remove_duplicate_cells(adata):
    """Remove duplicate cells from the data matrix, in place.

    Cells are fingerprinted by the sum of their PCA coordinates plus their
    total counts; cells with identical fingerprints are treated as duplicates
    and only the first occurrence is kept.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix with `.obs['n_counts']`. Subset in place and
        the neighbor graph recomputed when duplicates were removed.
    """
    if 'X_pca' not in adata.obsm.keys():
        pca(adata)
    # NOTE(review): float-sum fingerprints could in principle collide for
    # non-identical cells; this preserves the original detection criterion.
    fingerprints = np.asarray(
        np.sum(adata.obsm['X_pca'], 1) + adata.obs['n_counts']
    )
    # Order-preserving de-duplication in O(n). The previous implementation
    # iterated a set (arbitrary order, scrambling cell order on subsetting)
    # and used list.index per element (O(n^2)).
    seen = set()
    idx = []
    for i, fp in enumerate(fingerprints):
        if fp not in seen:
            seen.add(fp)
            idx.append(i)
    n_duplicates = adata.n_obs - len(idx)
    if n_duplicates > 0:
        logg.info('Removed ', n_duplicates, ' duplicate cells.')
        adata._inplace_subset_obs(idx)
        neighbors(adata)
def remove_duplicate_cells(adata):
    """Drop observations detected as duplicates in PCA space, in place.

    Keeps one representative per duplicate group and, when a neighbor graph
    was previously computed, recomputes it on the subset data.
    """
    if "X_pca" not in adata.obsm.keys():
        pca(adata)
    duplicate_idx = get_duplicate_cells(adata)
    if len(duplicate_idx) == 0:
        return
    keep = np.ones(adata.n_obs, bool)
    keep[duplicate_idx] = 0
    logg.info("Removed", len(duplicate_idx), "duplicate cells.")
    adata._inplace_subset_obs(keep)
    if "neighbors" in adata.uns.keys():
        neighbors(adata)
def recipe_pearson_residuals(
    adata: AnnData,
    *,
    theta: float = 100,
    clip: Optional[float] = None,
    n_top_genes: int = 1000,
    batch_key: Optional[str] = None,
    chunksize: int = 1000,
    n_comps: Optional[int] = 50,
    random_state: Optional[float] = 0,
    kwargs_pca: Optional[dict] = None,
    check_values: bool = True,
    inplace: bool = True,
) -> Optional[Tuple[AnnData, pd.DataFrame]]:
    """\
    Full pipeline for HVG selection and normalization by analytic Pearson
    residuals ([Lause21]_).

    Applies gene selection based on Pearson residuals. On the resulting subset,
    Pearson residual normalization and PCA are performed.

    Expects raw count input.

    Params
    ------
    {adata}
    {dist_params}
    {genes_batch_chunk}
    {pca_chunk}
    {check_values}
    {inplace}

    Returns
    -------
    If `inplace=False`, separately returns the gene selection results (as
    :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (as
    :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the
    following fields for gene selection results:

    `.var['highly_variable']` : bool
        boolean indicator of highly-variable genes.
    `.var['means']` : float
        means per gene.
    `.var['variances']` : float
        variances per gene.
    `.var['residual_variances']` : float
        Pearson residual variance per gene. Averaged in the case of multiple
        batches.
    `.var['highly_variable_rank']` : float
        Rank of the gene according to residual variance, median rank in the
        case of multiple batches.
    `.var['highly_variable_nbatches']` : int
        If batch_key is given, this denotes in how many batches genes are
        detected as HVG.
    `.var['highly_variable_intersection']` : bool
        If batch_key is given, this denotes the genes that are highly variable
        in all batches.

    The following fields contain Pearson residual-based PCA results and
    normalization settings:

    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
        The subset of highly variable genes, normalized by Pearson residuals.
    `.uns['pearson_residuals_normalization']['theta']`
        The used value of the overdisperion parameter theta.
    `.uns['pearson_residuals_normalization']['clip']`
        The used value of the clipping parameter.
    `.obsm['X_pca']`
        PCA representation of data after gene selection and Pearson residual
        normalization.
    `.varm['PCs']`
        The principal components containing the loadings. When `inplace=True`
        this will contain empty rows for the genes not selected during HVG
        selection.
    `.uns['pca']['variance_ratio']`
        Ratio of explained variance.
    `.uns['pca']['variance']`
        Explained variance, equivalent to the eigenvalues of the covariance
        matrix.
    """
    # Avoid a mutable default argument: a module-level `{}` default would be
    # shared (and possibly mutated) across calls.
    if kwargs_pca is None:
        kwargs_pca = {}

    hvg_args = dict(
        flavor='pearson_residuals',
        n_top_genes=n_top_genes,
        batch_key=batch_key,
        theta=theta,
        clip=clip,
        chunksize=chunksize,
        check_values=check_values,
    )

    if inplace:
        experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=True)
        # TODO: are these copies needed?
        adata_pca = adata[:, adata.var['highly_variable']].copy()
    else:
        hvg = experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=False)
        # TODO: are these copies needed?
        adata_pca = adata[:, hvg['highly_variable']].copy()

    experimental.pp.normalize_pearson_residuals(
        adata_pca, theta=theta, clip=clip, check_values=check_values
    )
    pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca)

    if inplace:
        normalization_param = adata_pca.uns['pearson_residuals_normalization']
        normalization_dict = dict(
            **normalization_param, pearson_residuals_df=adata_pca.to_df()
        )
        adata.uns['pca'] = adata_pca.uns['pca']
        # Loadings are scattered back to full gene space; non-HVG rows stay 0.
        adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
        adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs']
        adata.uns['pearson_residuals_normalization'] = normalization_dict
        adata.obsm['X_pca'] = adata_pca.obsm['X_pca']
        return None
    else:
        return adata_pca, hvg
def neighbors(adata, n_neighbors=30, n_pcs=None, use_rep=None, knn=True,
              random_state=0, method='umap', metric='euclidean',
              metric_kwds=None, num_threads=-1, copy=False):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime
    and yield the same result as scanpy [Wolf18]_. Connectivities are computed
    with adaptive kernel width as proposed in Haghverdi et al. 2016
    (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more
        local data being preserved. In general values should be in the range
        2 to 100. If `knn` is `True`, number of nearest neighbors to be
        searched. If `knn` is `False`, a Gaussian kernel width is set to the
        distance of the `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use. If not specified, the full
        space is used of a pre-computed PCA, or 30 components are used when
        PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is
        chosen automatically: for .n_vars < 50, .X is used, otherwise
        'X_pca' is used.
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a
        Gaussian Kernel to assign low weights to neighbors more distant than
        the `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'hnsw', 'sklearn'}} (default: `'umap'`)
        Method to compute neighbors, only differs in runtime. The 'hnsw'
        method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel width as proposed in
        Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric. `None` (default) is treated as `{{}}`.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:
    connectivities : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Weighted adjacency matrix of the neighborhood graph of data points.
        Weights should be interpreted as connectivities.
    distances : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    """
    # Avoid a mutable default argument (`metric_kwds={}`): a shared dict
    # default would persist across calls and could be mutated downstream.
    if metric_kwds is None:
        metric_kwds = {}
    adata = adata.copy() if copy else adata

    # Resolve the representation to compute neighbors on.
    if use_rep is None:
        use_rep = 'X' if adata.n_vars < 50 or n_pcs == 0 else 'X_pca'
        n_pcs = None if use_rep == 'X' else n_pcs
    elif use_rep not in adata.obsm.keys() and 'X_' + use_rep in adata.obsm.keys():
        use_rep = 'X_' + use_rep

    if use_rep == 'X_pca':
        # (Re)compute PCA when missing or when more components are requested
        # than are available in the precomputed embedding.
        if ('X_pca' not in adata.obsm.keys()
                or n_pcs is not None and n_pcs > adata.obsm['X_pca'].shape[1]):
            pca(adata,
                n_comps=min(30 if n_pcs is None else n_pcs, adata.n_vars - 1),
                svd_solver='arpack')
        elif n_pcs is None and adata.obsm['X_pca'].shape[1] < 10:
            logg.warn('Neighbors are computed on ',
                      adata.obsm['X_pca'].shape[1],
                      ' principal components only.')

    n_duplicate_cells = len(get_duplicate_cells(adata))
    if n_duplicate_cells > 0:
        logg.warn(
            'You seem to have {} duplicate cells in your data.'.format(
                n_duplicate_cells),
            'Consider removing these via pp.remove_duplicate_cells.')

    logg.info('computing neighbors', r=True)

    if method == 'sklearn':
        from sklearn.neighbors import NearestNeighbors
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        # kneighbors() excludes each point itself; ask for n_neighbors - 1 and
        # re-insert the self-edge via set_diagonal below.
        neighbors = NearestNeighbors(n_neighbors=n_neighbors - 1,
                                     metric=metric,
                                     metric_params=metric_kwds,
                                     n_jobs=num_threads)
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices)
        neighbors.distances, neighbors.connectivities = \
            compute_connectivities_umap(neighbors.knn_indices, knn_distances,
                                        X.shape[0], n_neighbors=n_neighbors)
    elif method == 'hnsw':
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors,
                                  num_threads=num_threads)
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs],
                      metric=metric,
                      random_state=random_state,
                      **metric_kwds)
    else:
        # Delegate to scanpy; silence its logging and numba warnings
        # (reported in umap/issues/252).
        logg.switch_verbosity('off', module='scanpy')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(
                n_neighbors=n_neighbors, knn=knn, n_pcs=n_pcs, method=method,
                use_rep=None if use_rep == 'X_pca' else use_rep,
                random_state=random_state, metric=metric,
                metric_kwds=metric_kwds, write_knn_indices=True)
        logg.switch_verbosity('on', module='scanpy')

    adata.uns['neighbors'] = {}
    try:
        # Newer anndata: store graphs in .obsp with key references in .uns.
        adata.obsp['distances'] = neighbors.distances
        adata.obsp['connectivities'] = neighbors.connectivities
        adata.uns['neighbors']['connectivities_key'] = 'connectivities'
        adata.uns['neighbors']['distances_key'] = 'distances'
    except Exception:
        # Older anndata without .obsp: fall back to storing in .uns.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        adata.uns['neighbors']['distances'] = neighbors.distances
        adata.uns['neighbors']['connectivities'] = neighbors.connectivities

    if hasattr(neighbors, 'knn_indices'):
        adata.uns['neighbors']['indices'] = neighbors.knn_indices
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method,
        'metric': metric,
        'n_pcs': n_pcs
    }

    logg.info(' finished', time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added \n'
        '    \'distances\' and \'connectivities\', weighted adjacency matrices (adata.obsp)'
    )

    return adata if copy else None
def neighbors(adata, n_neighbors=30, n_pcs=None, use_rep=None, knn=True,
              random_state=0, method='umap', metric='euclidean',
              metric_kwds=None, num_threads=-1, copy=False):
    """
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this heavily relies on UMAP
    [McInnes18]_, which also provides a method for estimating connectivities
    of data points - the connectivity of the manifold (`method=='umap'`).
    If `method=='diffmap'`, connectivities are computed according to
    [Coifman05]_, in the adaption of [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more
        local data being preserved. In general values should be in the range
        2 to 100. If `knn` is `True`, number of nearest neighbors to be
        searched. If `knn` is `False`, a Gaussian kernel width is set to the
        distance of the `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use. If not specified, the full
        space is used of a pre-computed PCA, or 30 components are used when
        PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is
        chosen automatically: for .n_vars < 50, .X is used, otherwise
        'X_pca' is used.
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a
        Gaussian Kernel to assign low weights to neighbors more distant than
        the `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'gauss', 'hnsw', 'sklearn', `None`}} (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following
        [Coifman05]_ with adaptive width [Haghverdi16]_) for computing
        connectivities.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric. `None` (default) is treated as `{{}}`.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:
    connectivities : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Weighted adjacency matrix of the neighborhood graph of data points.
        Weights should be interpreted as connectivities.
    distances : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    """
    # Avoid a mutable default argument (`metric_kwds={}`).
    if metric_kwds is None:
        metric_kwds = {}
    adata = adata.copy() if copy else adata
    if adata.isview:
        adata._init_as_actual(adata.copy())

    # NOTE: all literal comparisons below previously used `is`, which tests
    # object identity and only works by accident of CPython interning;
    # replaced with `==` throughout.
    if use_rep is None:
        use_rep = 'X' if adata.n_vars < 50 or n_pcs == 0 else 'X_pca'
        n_pcs = None if use_rep == 'X' else n_pcs
    elif use_rep not in adata.obsm.keys() and 'X_' + use_rep in adata.obsm.keys():
        use_rep = 'X_' + use_rep

    if use_rep == 'X_pca':
        if ('X_pca' not in adata.obsm.keys()
                or n_pcs is not None and n_pcs > adata.obsm['X_pca'].shape[1]):
            # Bound the number of components by n_vars - 1 so the internal
            # PCA cannot request more components than the data supports.
            pca(adata,
                n_comps=min(30 if n_pcs is None else n_pcs, adata.n_vars - 1),
                svd_solver='arpack')
        elif n_pcs is None and adata.obsm['X_pca'].shape[1] < 10:
            logg.warn('Neighbors are computed on ',
                      adata.obsm['X_pca'].shape[1],
                      ' principal components only.')

    logg.info('computing neighbors', r=True)

    if method == 'sklearn':
        from sklearn.neighbors import NearestNeighbors
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighbors = NearestNeighbors(n_neighbors=n_neighbors,
                                     metric=metric,
                                     metric_params=metric_kwds,
                                     n_jobs=num_threads)
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        # Pass the actual `n_neighbors` parameter instead of a hard-coded 30,
        # so connectivities match the requested neighborhood size.
        neighbors.distances, neighbors.connectivities = \
            compute_connectivities_umap(neighbors.knn_indices, knn_distances,
                                        X.shape[0], n_neighbors=n_neighbors)
    elif method == 'hnsw':
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors,
                                  num_threads=num_threads)
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs],
                      metric=metric,
                      random_state=random_state,
                      **metric_kwds)
    else:
        # Delegate to scanpy; silence its logging and numba warnings
        # (reported in umap/issues/252).
        logg.switch_verbosity('off', module='scanpy')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(n_neighbors=n_neighbors, knn=knn,
                                        n_pcs=n_pcs, use_rep=use_rep,
                                        method=method, metric=metric,
                                        metric_kwds=metric_kwds,
                                        random_state=random_state,
                                        write_knn_indices=True)
        logg.switch_verbosity('on', module='scanpy')

    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method
    }
    adata.uns['neighbors']['distances'] = neighbors.distances
    adata.uns['neighbors']['connectivities'] = neighbors.connectivities
    if hasattr(neighbors, 'knn_indices'):
        adata.uns['neighbors']['indices'] = neighbors.knn_indices

    logg.info(' finished', time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('added to `.uns[\'neighbors\']`\n'
              '    \'distances\', weighted adjacency matrix\n'
              '    \'connectivities\', weighted adjacency matrix')

    return adata if copy else None
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime
    and yield the same result as scanpy [Wolf18]_. Connectivities are computed
    with adaptive kernel width as proposed in Haghverdi et al. 2016
    (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more
        local data being preserved. In general values should be in the range
        2 to 100. If `knn` is `True`, number of nearest neighbors to be
        searched. If `knn` is `False`, a Gaussian kernel width is set to the
        distance of the `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use. If not specified, the full
        space is used of a pre-computed PCA, or 30 components are used when
        PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is
        chosen automatically: for .n_vars < 50, .X is used, otherwise
        'X_pca' is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in
        .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a
        Gaussian Kernel to assign low weights to neighbors more distant than
        the `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'hnsw', 'sklearn'}} (default: `'umap'`)
        Method to compute neighbors, only differs in runtime. The 'hnsw'
        method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """
    adata = adata.copy() if copy else adata

    # Resolve which representation the neighbor search runs on. `n_pcs == 0`
    # forces the raw .X representation.
    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        # Allow passing e.g. 'pca' as shorthand for the 'X_pca' obsm key.
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        # (Re)compute PCA when the embedding is missing or has fewer
        # components than requested.
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            # Effective feature count: restrict to HVGs when available.
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            # Components are bounded by both feature and observation counts.
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            # Only pass use_highly_variable=True when the annotation exists.
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

    # Duplicate cells produce zero distances, which distort the kernel width;
    # warn so the user can remove them.
    n_duplicate_cells = len(get_duplicate_cells(adata))
    if n_duplicate_cells > 0:
        logg.warn(
            f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
            "Consider removing these via pp.remove_duplicate_cells.",
        )

    # None default avoids the shared-mutable-default pitfall.
    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        # kneighbors() excludes each point itself; request n_neighbors - 1 and
        # re-insert the self-edge via set_diagonal below.
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )
    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )
    else:
        # Delegate to scanpy's Neighbors; silence its logging and the numba
        # warning (umap/issues/252) for the duration of the call.
        logg.switch_verbosity("off", module="scanpy")
        with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(
                n_neighbors=n_neighbors,
                knn=knn,
                n_pcs=n_pcs,
                method=method,
                use_rep=use_rep,
                random_state=random_state,
                metric=metric,
                metric_kwds=metric_kwds,
                write_knn_indices=True,
            )
        logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        # Newer anndata: graphs live in .obsp, referenced by key in .uns.
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        # Older anndata without .obsp: fall back to storing in .uns directly.
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None