def _regress_out_chunk(data):
    """Fit one Gaussian GLM per column of a data chunk and return residuals.

    Parameters
    ----------
    data : tuple
        ``(data_chunk, regressors, variable_is_categorical)``.
        `data_chunk` holds the selected columns of `adata.X`. When
        `variable_is_categorical` is truthy, column `j` of `regressors`
        (preceded by an intercept column of ones) is the design matrix for
        column `j` of `data_chunk`; otherwise `regressors` as a whole is the
        design matrix for every column.

    Returns
    -------
    The response residuals stacked row-wise, i.e. one row per column of
    `data_chunk`. A column whose fit raises `PerfectSeparationError`
    contributes a row of zeros.
    """
    chunk = data[0]
    regressors = data[1]
    per_column_regressor = data[2]

    import statsmodels.api as sm
    from statsmodels.tools.sm_exceptions import PerfectSeparationError

    residual_rows = []
    n_columns = chunk.shape[1]
    for j in range(n_columns):
        # categorical case: intercept plus the per-gene category means
        design = (
            np.c_[np.ones(regressors.shape[0]), regressors[:, j]]
            if per_column_regressor
            else regressors
        )
        try:
            fit = sm.GLM(
                chunk[:, j], design, family=sm.families.Gaussian()
            ).fit()
            residuals = fit.resid_response
        except PerfectSeparationError:  # this emulates R's behavior
            logg.warn(
                'Encountered PerfectSeparationError, setting to 0 as in R.')
            residuals = np.zeros(chunk.shape[0])
        residual_rows.append(residuals)

    return np.vstack(residual_rows)
def downsample_counts(adata, target_counts=20000, random_state=0, copy=False,
                      replace=False):
    """Downsample counts so that each cell has no more than `target_counts`.

    Cells with fewer counts than `target_counts` are unaffected by this.

    This has been implemented by M. D. Luecken.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    target_counts : `int` (default: 20,000)
        Target number of counts for downsampling. Cells with more counts than
        'target_counts' will be downsampled to have 'target_counts' counts.
    random_state : `int` or `None`, optional (default: 0)
        Random seed to change subsampling.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.
    replace : `bool`, optional (default: `False`)
        Whether the retained counts are drawn with replacement. Drawing
        without replacement guarantees that no gene ends up with more counts
        than it had before. Pass `True` to draw with replacement.

    Returns
    -------
    Depending on `copy` returns or updates an `adata` with downsampled `.X`.
    Also writes the per-cell totals (before downsampling) to
    `adata.obs['n_counts']`.

    Raises
    ------
    ValueError
        If `target_counts` is smaller than 1 or `adata` is not an `AnnData`.
    """
    if target_counts < 1:
        raise ValueError('`target_counts` must be a positive integer')
    if not isinstance(adata, AnnData):
        raise ValueError('`adata` must be an `AnnData` object')
    logg.msg('downsampling to {} counts'.format(target_counts), r=True)
    adata = adata.copy() if copy else adata
    np.random.seed(random_state)
    # NOTE(review): rows of `.X` are processed element-wise below, which
    # assumes a dense array -- confirm how sparse input should be handled.
    counts = adata.X.sum(axis=1)
    adata.obs['n_counts'] = counts
    gene_indices = np.arange(adata.n_vars)
    for icell in range(len(adata.obs_names)):
        if not counts[icell] > target_counts:
            continue
        # Negative entries contribute no counts (clipped so np.repeat
        # accepts the row).
        row = np.maximum(adata.X[icell].astype(int), 0)
        # One entry per individual count, labelled with its gene index.
        count_pool = np.repeat(gene_indices, row)
        if not replace and count_pool.size < target_counts:
            # Truncating to integers left fewer counts than requested, so
            # there is nothing to remove from this cell.
            continue
        kept = np.random.choice(count_pool, target_counts, replace=replace)
        adata.X[icell] = np.bincount(kept, minlength=adata.n_vars)
    logg.msg('finished', t=True)
    return adata if copy else None
def regress_out(adata, keys, n_jobs=None, copy=False):
    """Regress out unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15].

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix. Sparse `.X` is converted to a dense array.
    keys : `str` or list of `str`
        Keys for observation annotation on which to regress on. If the first
        key refers to a categorical annotation, it must be the only key; the
        data is then regressed on the per-gene mean within each category.
        If empty, all columns of `adata.obs` are used as regressors.
    n_jobs : `int` or `None`, optional (default: `None`)
        Number of jobs for parallel computation. If `None` is given, the
        `n_jobs` setting is used.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data
    matrix.

    Raises
    ------
    ValueError
        If a categorical key is given together with further keys.
    """
    logg.info('regressing out', keys, r=True)
    if issparse(adata.X):
        logg.info(' sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata
    if isinstance(keys, str):
        keys = [keys]
    if issparse(adata.X):
        adata.X = adata.X.toarray()
    n_jobs = sett.n_jobs if n_jobs is None else n_jobs
    sanitize_anndata(adata)

    variable_is_categorical = False
    # `keys` may be empty; guard before looking at its first element so the
    # "use all of adata.obs" branch below stays reachable.
    if (keys and keys[0] in adata.obs_keys()
            and is_categorical_dtype(adata.obs[keys[0]])):
        # regress on a single categorical variable
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.msg('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            if not mask.any():
                # unused category: no rows to fill
                continue
            # every cell of the category gets the per-gene category mean;
            # accumulate in float64 to avoid float32 round-off
            regressors[mask] = adata.X[mask].mean(axis=0, dtype=np.float64)
        variable_is_categorical = True
    else:
        # regress on one or several ordinal variables: build a data frame
        # with the selected keys (if given); copy so that `adata.obs` is
        # never touched by the insert below
        if keys:
            regressors = adata.obs[keys].copy()
        else:
            regressors = adata.obs.copy()
        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    n_vars = adata.X.shape[1]
    len_chunk = np.ceil(min(1000, n_vars) / n_jobs).astype(int)
    n_chunks = np.ceil(n_vars / len_chunk).astype(int)

    # split the adata.X matrix by columns into chunks (the last chunk could
    # be smaller than the others); each task is a tuple of a data chunk,
    # e.g. adata.X[:, 0:100], the matching regressors and the categorical flag
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
        tasks = [(data_chunk, regres, True)
                 for data_chunk, regres in zip(chunk_list, regressors_chunk)]
    else:
        tasks = [(data_chunk, regressors, False) for data_chunk in chunk_list]

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing
        # the context manager releases the worker processes even when a task
        # raises; the explicit timeout on `.get` is kept from the original
        # (presumably so that the wait stays interruptible -- confirm)
        with multiprocessing.Pool(n_jobs) as pool:
            res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of stacked residual vectors (one row per regressed gene
    # column); the transpose restores the observations x variables layout
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info(' finished', t=True)
    return adata if copy else None
def pca(data, n_comps=None, zero_center=True, svd_solver='auto',
        random_state=0, return_info=False, use_highly_variable=None,
        dtype='float32', copy=False, chunked=False, chunk_size=None):
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows
        correspond to cells and columns to genes.
    n_comps : `int`, optional (default: 50)
        Number of principal components to compute. If it exceeds the smaller
        dimension of the matrix that is actually decomposed, it is reduced to
        that dimension minus one.
    zero_center : `bool` or `None`, optional (default: `True`)
        If `True`, compute standard PCA from covariance matrix. If `False`,
        omit zero-centering variables (uses *TruncatedSVD* from
        scikit-learn), which allows to handle sparse input efficiently.
        If `None`, zero-center only when the input is not sparse.
    svd_solver : `str`, optional (default: 'auto')
        SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy
        (scipy.sparse.linalg.svds), or 'randomized' for the randomized
        algorithm due to Halko (2009). 'auto' chooses automatically depending
        on the size of the problem.
    random_state : `int`, optional (default: 0)
        Change to use different initial states for the optimization.
    return_info : `bool`, optional (default: `False`)
        Only relevant when not passing an :class:`~anndata.AnnData`: see
        "Returns".
    use_highly_variable : `bool`, optional (default: `None`)
        Whether to use highly variable genes only, stored in
        .var['highly_variable']. If `None`, they are used whenever that
        annotation is present.
    dtype : `str` (default: 'float32')
        Numpy data type string to which to convert the result.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked : `bool`, optional (default: `False`)
        If `True`, perform an incremental PCA on segments of `chunk_size`.
        The incremental PCA automatically zero centers and ignores settings
        of `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size : `int`, optional (default: `None`)
        Number of observations to include in each chunk. Required if
        `chunked` is `True`.

    Returns
    -------
    If `data` is array-like and `return_info == False`, only returns `X_pca`.
    If `data` is array-like and `return_info == True`, returns the tuple
    `(X_pca, components, variance_ratio, variance)`. Otherwise returns
    (if `copy`) or adds to `adata`:

    X_pca : `.obsm`
        PCA representation of data.
    PCs : `.varm`
        The principal components containing the loadings. When only highly
        variable genes are used, rows of the remaining genes are NaN.
    variance_ratio : `.uns['pca']`
        Ratio of explained variance.
    variance : `.uns['pca']`
        Explained variance, equivalent to the eigenvalues of the covariance
        matrix.

    Raises
    ------
    ValueError
        If `use_highly_variable` is `True` but `.var['highly_variable']` is
        missing.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    if n_comps is None:
        n_comps = N_PCS

    if isinstance(data, AnnData):
        data_is_AnnData = True
        adata = data.copy() if copy else data
    else:
        data_is_AnnData = False
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if (use_highly_variable is True
            and 'highly_variable' not in adata.var.keys()):
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    if use_highly_variable is None:
        use_highly_variable = 'highly_variable' in adata.var.keys()
    adata_comp = (adata[:, adata.var['highly_variable']]
                  if use_highly_variable else adata)

    # Clamp against the matrix that is actually decomposed: the highly
    # variable subset can have far fewer genes than `adata`, and the number
    # of components is bounded by the number of observations as well.
    min_dim = min(adata_comp.n_vars, adata_comp.n_obs)
    if min_dim < n_comps:
        n_comps = min_dim - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps, 'as dim of data is only', min_dim, v=4)

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        # Allocate in the requested output dtype; using the dtype of `X`
        # would truncate the projections for integer-valued input.
        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        # first pass fits the model chunk by chunk ...
        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        # ... second pass projects every chunk onto the fitted components
        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    ' as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption', v=4)
                # Copying the whole adata_comp.X here, could cause memory
                # problems
                X = adata_comp.X.toarray()
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps, svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                ' without zero-centering: \n'
                ' the explained variance does not correspond to the exact statistical defintion\n'
                ' the first component, e.g., might be heavily influenced by different means\n'
                ' the following components often resemble the exact PCA very closely', v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr:
        X_pca = X_pca.astype(dtype)

    if not data_is_AnnData:
        if return_info:
            return (X_pca, pca_.components_,
                    pca_.explained_variance_ratio_, pca_.explained_variance_)
        return X_pca

    adata.obsm['X_pca'] = X_pca
    if not use_highly_variable:
        adata.varm['PCs'] = pca_.components_.T
    else:
        # embed the loadings of the highly variable genes into a matrix
        # covering all genes; the remaining genes stay NaN
        PCs = np.full((n_comps, adata.shape[1]), np.nan)
        PCs[:, adata.var['highly_variable']] = pca_.components_
        adata.varm['PCs'] = PCs.T
    adata.uns['pca'] = {}
    adata.uns['pca']['variance'] = pca_.explained_variance_
    adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
    logg.msg(' finished', t=True, end=' ', v=4)
    logg.msg(
        'and added\n'
        ' \'X_pca\', the PCA coordinates (adata.obs)\n'
        ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
        ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
        ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4)
    return adata if copy else None