def test_log1p(self, adata, adata_dist):
    """Compare ``log1p`` applied to ``adata_dist`` with ``log1p`` applied to ``adata``.

    Both fixtures are transformed by ``log1p``; the matrix of ``adata_dist``
    is converted with ``materialize_as_ndarray`` and must match ``adata.X``
    in shape and (approximately) in value.
    """
    # Transform the first fixture and materialize its matrix as an ndarray.
    log1p(adata_dist)
    materialized = materialize_as_ndarray(adata_dist.X)

    # Apply the same transform to the reference fixture.
    log1p(adata)

    expected_shape = (adata.n_obs, adata.n_vars)
    assert materialized.shape == adata.shape
    assert materialized.shape == expected_shape
    npt.assert_allclose(materialized, adata.X)
def test(self, Xtest):
    """Return, for each row of ``Xtest``, the index of the closest row of ``self.xkibar``.

    ``Xtest`` is wrapped in an ``AnnData`` object and passed through
    ``normalize_per_cell(..., 1000, min_counts=0)`` and ``log1p`` -- the same
    calls ``train`` uses -- before distances are computed.

    Parameters
    ----------
    Xtest:
        Data accepted by the ``AnnData`` constructor.

    Returns
    -------
    The ``argmin`` along axis 1 of the pairwise distance matrix between the
    processed data and ``self.xkibar`` (``cdist`` with its default metric).
    """
    query = AnnData(Xtest)
    normalize_per_cell(query, 1000, min_counts=0)
    log1p(query)
    query.X = _toarray(query.X)

    # One row per query observation, one column per row of self.xkibar.
    distances = scipy.spatial.distance.cdist(query.X, self.xkibar)
    return distances.argmin(axis=1)
def train(self, adata):
    """Compute one mean expression vector per cluster and store it in ``self.xkibar``.

    A copy of ``adata`` is processed, so the object passed by the caller is
    not modified by the steps below.

    Parameters
    ----------
    adata:
        AnnData object to train on. After ``process_clusts`` it must provide
        ``uns["clusterindices"]`` and ``uns["num_clusts"]``.
    """
    work = adata.copy()
    work.X = _toarray(work.X)
    normalize_per_cell(work, 1000, min_counts=0)
    log1p(work)
    work = process_clusts(work)

    # Average the rows that belong to each cluster, cluster by cluster.
    cluster_means = []
    for k in range(work.uns["num_clusts"]):
        member_rows = work.uns["clusterindices"][k]
        cluster_means.append(work.X[member_rows].mean(axis=0).tolist())
    self.xkibar = np.array(cluster_means)
def highly_variable_genes(adata, batch_key=None, n_shared=2):
    """Calculate highly variable genes and return filtered adata containing only the HVGs.

    Note that ``log1p`` is applied to the passed ``adata`` object itself
    before the HVG calculation, and that a plot is produced via
    ``pl_highly_variable_genes`` (saved with the suffix ``.hvg.png``).

    Parameters
    ----------
    adata: `AnnData`
        AnnData object for which HVGs are to be calculated
    batch_key: `str` | default = None
        Specify adata.obs column to be used as batch. HVGs will then be
        calculated per batch.
    n_shared: `int` | default = 2
        requirement for selection of HVGs - HVGs shared in nr_samples/n_shared
        will be included. A higher value will result in a less stringent
        selection, e.g. with 2 HVGs need to be present in at least 50% of
        the samples.

    Returns
    -------
    returns an AnnData object with only HVG
    """
    start = time()

    # take log1p
    log1p(adata)
    print('log1p taken of adata')

    sc_highly_variable_genes(
        adata,
        min_mean=0.0125,
        max_mean=3,
        min_disp=0.5,
        inplace=True,
        batch_key=batch_key,
    )

    if batch_key is not None:
        # Additionally flag genes that are highly variable in at least
        # (number of batches / n_shared) batches.
        n_batches = len(set(adata.obs[batch_key]))
        shared_enough = adata.var['highly_variable_nbatches'] >= n_batches / n_shared
        hvglist = adata.var['highly_variable'].copy()
        hvglist.loc[shared_enough] = True
        adata.var['highly_variable'] = hvglist.copy()

    pl_highly_variable_genes(adata, save='.hvg.png', show=True)

    # Keep only the columns (genes) flagged as highly variable.
    adata = adata[:, adata.var['highly_variable']]

    # logging
    logging.info(
        'After feature selection of highly variable genes: %s cells, %s genes',
        adata.shape[0],
        adata.shape[1],
    )
    logging.info('\tTime for feature selection: %ss', round(time() - start, 3))

    return adata
def per_cell_normalize(adata, results_folder):
    """Normalize counts per cell, store a log1p copy in ``adata.raw`` and export the values.

    Parameters
    ----------
    adata:
        AnnData object; passed to ``normalize_per_cell`` with
        ``counts_per_cell_after=1e4`` and then given a new ``raw`` attribute.
    results_folder:
        Forwarded to ``export_cp10k`` as ``basepath``.

    Returns
    -------
    The ``adata`` object that was passed in.
    """
    t_normalize = time()

    # Normalization is done BEFORE the "raw" snapshot is stored, as
    # recommended in the scanpy tutorial.
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    # The log1p-transformed copy becomes adata.raw.
    log_transformed = log1p(adata, copy=True)
    adata.raw = log_transformed
    print('log1p values saved into adata.raw')

    logging.info('Per cell normalization completed successfully.')
    normalize_seconds = round(time() - t_normalize, 3)
    logging.info("\tTime for per-cell normalization: " + str(normalize_seconds) + 's')

    # Export to file, timed separately from the normalization above.
    t_export = time()
    export_cp10k(adata, basepath=results_folder)
    logging.info('cp10k values exported to file.')
    export_seconds = round(time() - t_export, 3)
    logging.info("\tTime for cp10k export: " + str(export_seconds) + 's')

    return adata
def test_write_zarr(self, adata, adata_dist):
    """Write log1p-transformed ``adata_dist.X`` to zarr and compare it with ``log1p(adata)``.

    The metadata is written from ``adata`` through ``write_zarr``; the ``X``
    array in the store is then written from ``adata_dist.X``. Reading the
    store back must give the same ``X`` as applying ``log1p`` to ``adata``.
    """
    import dask.array as da
    import zarr

    log1p(adata_dist)
    store = zarr.TempStore()

    chunk_spec = adata_dist.X.chunks
    if isinstance(chunk_spec[0], tuple):
        # Per-axis chunk tuples: take the first chunk size of axis 0 and
        # append the axis-1 tuple.
        # NOTE(review): this yields a 2-tuple only when axis 1 has a single
        # chunk -- confirm that is always the case for these fixtures.
        chunk_spec = (chunk_spec[0][0],) + chunk_spec[1]

    # write metadata using regular anndata
    adata.write_zarr(store, chunk_spec)

    if isinstance(adata_dist.X, da.Array):
        adata_dist.X.to_zarr(store.dir_path("X"), overwrite=True)
    else:
        adata_dist.X.to_zarr(store.dir_path("X"), chunk_spec)

    # read back as zarr directly and check it is the same as adata.X
    roundtripped = ad.read_zarr(store)
    log1p(adata)
    npt.assert_allclose(roundtripped.X, adata.X)
def preprocess_for_scrublet(adata):
    """Return a filtered, normalized copy of ``adata`` restricted to highly variable genes.

    The input object is copied first. Highly variable genes are selected on
    a separate log1p-transformed copy, so the returned object's ``X`` is
    normalized but not log-transformed; the pre-normalization matrix is kept
    in ``layers['raw']``.

    Parameters
    ----------
    adata:
        AnnData object to preprocess.

    Returns
    -------
    The processed copy, subset to the genes flagged ``highly_variable``.
    """
    filtered = adata.copy()
    pp.filter_genes(filtered, min_cells=3)
    pp.filter_cells(filtered, min_genes=3)

    # Snapshot the matrix before normalization changes it.
    filtered.layers['raw'] = filtered.X.copy()
    pp.normalize_total(filtered)

    # HVG selection runs on a log-transformed copy only.
    log_copy = pp.log1p(filtered, copy=True)
    pp.highly_variable_genes(log_copy)

    hvg_mask = log_copy.var['highly_variable']
    return filtered[:, hvg_mask]
def recluster(adata,
              celltype,
              celltype_label='leiden',
              min_mean=0.0125,
              max_mean=4,
              min_disp=0.5,
              resolution=1.0,
              regress_out_key=None,
              random_seed=0,
              show_plot_filter=False,
              method='leiden',
              batch_key=None):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset. This datasubset is initialized with the raw data contained
    in adata.raw. New highly variable genes are selected and a new
    clustering is performed. The function returns the adata subset with the
    new clustering annotation.

    This can be performed on leiden clusters by setting
    celltype_label = 'leiden' and passing the clusters that are to be
    selected for reclustering as strings or tuple of strings to the
    parameter celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more
        than one is to be selected please pass them as a tuple not as a list!
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with
        the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered
        highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered
        highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly
        variable
    resolution: `float` | default = 1.0
        resolution passed on to the selected clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be
        regressed out before performing clustering. If None then no
        regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering
        and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for
        highly variable gene detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset.
        Possible: louvain/leiden
    batch_key: `str` | default = None
        Specify a batch key if the HVG calculation should be done per batch

    Returns
    -------
    AnnData object containing the subcluster annotated with PCA, nearest
    neighbors, cluster assignment (leiden or louvain, depending on
    ``method``), and UMAP coordinates.

    Raises
    ------
    ValueError
        if ``method`` is neither 'leiden' nor 'louvain'.
    KeyError
        if ``celltype_label`` is not a column of ``adata.obs``.
    SystemExit
        if ``celltype`` is neither a string nor a tuple.

    Examples
    --------
    For a more detailed example of the entire reclustering process please
    refer to the code examples.

    >>> import besca as bc
    >>> import scanpy as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    if not isinstance(celltype, (str, tuple)):
        # NOTE: kept as sys.exit (SystemExit) so the exception type seen by
        # existing callers does not change.
        sys.exit('specify cluster input as a string or tuple')

    labels = adata.obs.get(celltype_label)
    if labels is None:
        raise KeyError(
            "celltype_label '" + str(celltype_label) + "' is not a column of adata.obs"
        )

    if isinstance(celltype, str):
        selected_cells = labels == celltype
    else:
        # Exact membership test: only the requested labels are selected.
        # Cells whose label happens to be the literal string 'NONE' are not
        # pulled in unless explicitly requested.
        selected_cells = labels.isin(list(celltype))

    cluster_subset = _subset_adata(adata, selected_cells)
    cluster_subset.raw = cluster_subset

    # identify highly variable genes
    filter_result = sc_highly_variable_genes(cluster_subset,
                                             min_mean=min_mean,
                                             max_mean=max_mean,
                                             min_disp=min_disp,
                                             inplace=False,
                                             batch_key=batch_key)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.highly_variable)),
          'highly variable genes selected within cluster')

    # apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   filter_result.highly_variable,
                                   axis=1,
                                   raw=False)

    # perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    # using `svd_solver='arpack'` ensures that the PCA leads to reproducible results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    # `method` was validated above, so it is exactly one of the two options.
    if method == 'louvain':
        louvain(cluster_subset, resolution=resolution, random_state=random_seed)
    else:
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset