def per_cell_normalize(adata, results_folder):
    """Normalize counts per cell, keep a log1p copy in ``adata.raw`` and export.

    Steps, in order:

    1. ``normalize_per_cell(adata, counts_per_cell_after=1e4)``.
    2. ``adata.raw`` is set to ``log1p(adata, copy=True)``.
    3. ``export_cp10k(adata, basepath=results_folder)``.

    Progress messages are printed to stdout; completion messages and the
    elapsed time of the normalization and export phases go to ``logging``.

    Parameters
    ----------
    adata
        Annotated data matrix. ``adata.raw`` is overwritten.
    results_folder
        Forwarded to ``export_cp10k`` as ``basepath``.

    Returns
    -------
    The ``adata`` object that was passed in.
    """
    # --- phase 1: normalization -------------------------------------------
    t_begin = time()

    # Normalization happens BEFORE the "raw" snapshot is taken, as
    # recommended in the scanpy tutorial.
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    log_snapshot = log1p(adata, copy=True)
    adata.raw = log_snapshot
    print('log1p values saved into adata.raw')

    logging.info('Per cell normalization completed successfully.')
    norm_seconds = round(time() - t_begin, 3)
    logging.info("\tTime for per-cell normalization: " + str(norm_seconds) + 's')

    # --- phase 2: export --------------------------------------------------
    t_begin = time()
    export_cp10k(adata, basepath=results_folder)
    logging.info('cp10k values exported to file.')
    export_seconds = round(time() - t_begin, 3)
    logging.info("\tTime for cp10k export: " + str(export_seconds) + 's')

    return adata
def normalize_layers(data, layers=['spliced', 'unspliced'], counts_per_cell_after=None,
                     max_proportion_per_cell=None, by_total_size=None, enforce=False, copy=False):
    """Normalize the given layers per cell by their total counts.

    Each layer is normalized independently with scanpy's
    ``normalize_per_cell``; the result replaces ``adata.layers[layer]``.

    Parameters
    ----------
    data
        Annotated data object exposing ``.layers`` and ``.copy()``.
    layers
        Keys of the layers to normalize. The default list is only iterated,
        never mutated.
    counts_per_cell_after
        Forwarded to scanpy's ``normalize_per_cell`` as the target total
        count per cell.
    max_proportion_per_cell
        If strictly between 0 and 1, the per-cell counts are recomputed with
        ``counts_per_cell_quantile`` before normalizing; otherwise ignored.
    by_total_size
        Forwarded unchanged to ``get_initial_size``.
    enforce
        Normalize a layer even if ``not_yet_normalized`` reports that it has
        already been normalized.
    copy
        Work on (and return) a copy of ``data`` instead of modifying it.

    Returns
    -------
    The normalized copy if ``copy`` is true, otherwise ``None``.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import normalize_per_cell

    # Loop-invariant: whether the quantile-based cell size should be used.
    use_quantile = max_proportion_per_cell is not None and 0 < max_proportion_per_cell < 1

    for layer in layers:
        X = adata.layers[layer]
        if not (not_yet_normalized(X) or enforce):
            continue

        counts_per_cell = get_initial_size(adata, layer, by_total_size)
        if use_quantile:
            # Fix: derive the quantile-based size from the layer that is being
            # normalized. The previous code passed ``adata.X`` here, so every
            # layer was scaled by counts taken from a different matrix.
            counts_per_cell = counts_per_cell_quantile(X, max_proportion_per_cell, counts_per_cell)

        # Replace zero totals by one so empty cells do not divide by zero.
        counts_per_cell += counts_per_cell == 0
        adata.layers[layer] = normalize_per_cell(X, counts_per_cell_after, counts_per_cell, copy=True)

    return adata if copy else None
def normalize_layers(data, layers={'spliced', 'unspliced'}, by_total_size=None,
                     max_proportion_per_cell=None, copy=False):
    """Normalize the given layers per cell by their total counts.

    A layer is only normalized while it still looks like raw counts (see
    ``not_normalized_yet`` below). Normalization uses scanpy's
    ``normalize_per_cell`` with ``counts_per_cell_after=None``; the result
    replaces ``adata.layers[layer]``.

    Parameters
    ----------
    data
        Annotated data object exposing ``.layers`` and ``.copy()``.
    layers
        Keys of the layers to normalize. The default set is only iterated,
        never mutated.
    by_total_size
        Forwarded unchanged to ``get_initial_size``.
    max_proportion_per_cell
        If strictly between 0 and 1, the per-cell counts are recomputed with
        ``counts_per_cell_quantile`` before normalizing; otherwise ignored.
    copy
        Work on (and return) a copy of ``data`` instead of modifying it.

    Returns
    -------
    The normalized copy if ``copy`` is true, otherwise ``None``.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import normalize_per_cell

    def not_normalized_yet(adata, layer):
        # Heuristic: the layer counts as un-normalized when a small sample of
        # it (first 10 stored values if sparse, first row if dense) is
        # integer-valued within a tolerance of 1e-3.
        X = adata.layers[layer]
        sample = X.data[:10] if issparse(X) else X[0]
        return np.allclose(sample % 1, 0, atol=1e-3)

    # Loop-invariant: whether the quantile-based cell size should be used.
    use_quantile = max_proportion_per_cell is not None and 0 < max_proportion_per_cell < 1

    for layer in layers:
        if not not_normalized_yet(adata, layer):
            continue

        counts_per_cell = get_initial_size(adata, layer, by_total_size)
        if use_quantile:
            # Fix: derive the quantile-based size from the layer that is being
            # normalized. The previous code passed ``adata.X`` here, so every
            # layer was scaled by counts taken from a different matrix.
            counts_per_cell = counts_per_cell_quantile(
                adata.layers[layer], max_proportion_per_cell, counts_per_cell)

        adata.layers[layer] = normalize_per_cell(adata.layers[layer], None, counts_per_cell, copy=True)

    return adata if copy else None
def normalize_layers(adata, layers={'spliced', 'unspliced'}, copy=False):
    """Normalize the given layers per cell by their total counts.

    For every layer the per-cell counts are obtained from scanpy's
    ``filter_cells`` (second element of its return value) and passed to
    scanpy's ``normalize_per_cell`` with ``counts_per_cell_after=None``.
    The result replaces ``adata.layers[layer]``.

    Parameters
    ----------
    adata
        Annotated data object exposing ``.layers`` and ``.copy()``.
    layers
        Keys of the layers to normalize. The default set is only iterated,
        never mutated.
    copy
        Work on (and return) a copy of ``adata`` instead of modifying it.

    Returns
    -------
    The normalized copy if ``copy`` is true, otherwise ``None``.
    """
    from scanpy.api.pp import normalize_per_cell, filter_cells

    # Fix: ``copy=True`` used to hand back the caller's own object after
    # mutating it in place; now the work really happens on a copy.
    if copy:
        adata = adata.copy()

    for layer in layers:
        # Only the counts are needed; the cell mask returned alongside them
        # is deliberately discarded (no cells are removed here).
        _, counts = filter_cells(adata.layers[layer], min_counts=1)
        adata.layers[layer] = normalize_per_cell(adata.layers[layer], None, counts, copy=True)

    return adata if copy else None
def filter_and_normalize(adata, min_counts=10, n_top_genes=None, log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            sc.pp.filter_genes_dispersion(adata, n_top_genes=n_top_genes)
        sc.pp.normalize_per_cell(adata)
        if log:
            sc.pp.log1p(adata)

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 10)
        Forwarded to `sc.pp.filter_genes` as `min_counts`.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep. The dispersion filter is skipped when this is
        `None` or not smaller than the current number of genes.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    # Fix: honour ``copy``. Previously the caller's object was modified in
    # place and then returned, so ``copy=True`` never produced a copy.
    if copy:
        adata = adata.copy()

    filter_genes(adata, min_counts=min_counts)

    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        # The data is normalized once before the dispersion-based gene
        # selection and once more afterwards on the reduced gene set.
        normalize_per_cell(adata)
        filter_genes_dispersion(adata, n_top_genes=n_top_genes)

    normalize_per_cell(adata)
    if log:
        log1p(adata)

    return adata if copy else None
def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None, key_n_counts=None,
                       max_proportion_per_cell=None, layers={'spliced', 'unspliced'}, copy=False):
    """Normalize each cell by total counts over all genes.

    First normalizes `adata.X` through scanpy's `normalize_per_cell`, then
    hands the requested layers to `normalize_layers`.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell. Overridden by the quantile-based counts
        when `max_proportion_per_cell` lies strictly between 0 and 1.
    key_n_counts : `str`, optional (default: `None`)
        Forwarded to scanpy as the name of the field in `adata.obs` where the
        total counts per cell are stored.
    max_proportion_per_cell : `float` (default: `None`)
        Exclude genes counts that account for more than a specific proportion
        of cell size, e.g. 0.05. Only used when strictly between 0 and 1.
    layers : `str` or `list` (default: `{'spliced', 'unspliced'}`)
        Keys for layers to be also considered for normalization.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original `adata.X`,
    depending on `copy`.
    """
    adata = data.copy() if copy else data
    # Imported under an alias so the scanpy routine is not confused with this
    # wrapper of the same name.
    from scanpy.api.pp import normalize_per_cell as scanpy_normalize_per_cell

    proportion_in_range = (max_proportion_per_cell is not None
                           and 0 < max_proportion_per_cell < 1)
    if proportion_in_range:
        counts_per_cell = counts_per_cell_quantile(adata.X, max_proportion_per_cell)

    scanpy_normalize_per_cell(adata, counts_per_cell_after, counts_per_cell, key_n_counts)

    # NOTE(review): `max_proportion_per_cell` is passed as the THIRD positional
    # argument - confirm this lines up with the signature of the
    # `normalize_layers` that is actually in scope.
    normalize_layers(adata, layers, max_proportion_per_cell)

    if copy:
        return adata
    return None
def Normalized_per_Cell(self):
    """Store a per-cell-normalized copy of ``self.scRNAseq_Counts``.

    Calls ``normalize_per_cell`` with ``copy=True`` and keeps the result in
    ``self.scRNAseq_Propcessed``.
    """
    normalized = normalize_per_cell(self.scRNAseq_Counts, copy=True)
    self.scRNAseq_Propcessed = normalized
def filter_and_normalize(data, min_counts=3, min_counts_u=3, min_cells=None, min_cells_u=None,
                         n_top_genes=None, log=True, plot=False, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, ...)          # by min_counts / min_cells
        # same thresholds on the 'unspliced' layer, if present
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            # keep genes selected by sc.pp.filter_genes_dispersion
        sc.pp.normalize_per_cell(adata)
        if log:
            sc.pp.log1p(adata)

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 3)
        Forwarded to `sc.pp.filter_genes` as `min_counts`; skipped if `None`.
    min_counts_u: `int` (default: 3)
        Minimum summed count per gene in the `'unspliced'` layer; skipped if `None`.
    min_cells: `int` (default: `None`)
        Forwarded to `sc.pp.filter_genes` as `min_cells`; skipped if `None`.
    min_cells_u: `int` (default: `None`)
        Minimum number of cells with a non-zero `'unspliced'` entry per gene;
        skipped if `None`.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    plot: `bool` (default: `False`)
        Plot the result of the dispersion filter.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    def keep_genes_by_unspliced(threshold, count_cells):
        # Subset the genes in place by a per-gene statistic of the
        # 'unspliced' layer: summed counts, or - when ``count_cells`` is
        # set - the number of cells with a non-zero entry.
        unspliced = adata.layers['unspliced']
        if count_cells:
            unspliced = unspliced > 0
        if issparse(unspliced):
            per_gene = unspliced.sum(0).A1
        else:
            per_gene = unspliced.sum(0)
        adata._inplace_subset_var(per_gene >= threshold)

    # Gene filters on the main matrix.
    if min_counts is not None:
        filter_genes(adata, min_counts=min_counts)
    if min_cells is not None:
        filter_genes(adata, min_cells=min_cells)

    # The same kind of filters on the unspliced layer, when there is one.
    if 'unspliced' in adata.layers.keys():
        if min_counts_u is not None:
            keep_genes_by_unspliced(min_counts_u, count_cells=False)
        if min_cells_u is not None:
            keep_genes_by_unspliced(min_cells_u, count_cells=True)

    # Optional dispersion-based selection of the top genes.
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        normalize_per_cell(adata)
        dispersion_result = filter_genes_dispersion(adata.X, n_top_genes=n_top_genes, log=False)
        if plot:
            from scanpy.plotting.preprocessing import filter_genes_dispersion as plot_filter_genes_dispersion
            plot_filter_genes_dispersion(dispersion_result, log=True)
        adata._inplace_subset_var(dispersion_result.gene_subset)

    normalize_per_cell(adata)
    if log:
        log1p(adata)

    if copy:
        return adata
    return None
def filter_and_normalize(data, min_counts=None, min_counts_u=None, min_cells=None, min_cells_u=None,
                         n_top_genes=None, flavor='seurat', log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    `X` is only logarithmized when its first stored values still equal those
    of the `'spliced'` layer, i.e. when it does not look preprocessed already.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.

    Raises
    ------
    ValueError
        If the `'spliced'` or `'unspliced'` layer is missing.
    """
    adata = data.copy() if copy else data

    # Guard: both count layers are required.
    has_both_layers = 'spliced' in adata.layers.keys() and 'unspliced' in adata.layers.keys()
    if not has_both_layers:
        raise ValueError('Could not find spliced / unspliced counts.')

    # Compare the first 100 stored values of X with those of the spliced
    # layer; identical values are taken to mean X is still unprocessed.
    x_head = adata.X.data[:100]
    spliced_head = adata.layers['spliced'].data[:100]
    X_not_yet_processed = np.all(x_head == spliced_head)

    filter_genes(adata, min_counts=min_counts, min_counts_u=min_counts_u,
                 min_cells=min_cells, min_cells_u=min_cells_u)
    normalize_per_cell(adata)

    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        if X_not_yet_processed:
            log1p(adata)
            logg.info('Logarithmized X.')
        else:
            logg.info('Did not modify X as it looks preprocessed already.')
    elif X_not_yet_processed:
        logg.info(
            'Consider logarithmizing adata.X with `scv.pp.log1p` for better results.'
        )

    if copy:
        return adata
    return None
def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None, key_n_counts=None,
                       max_proportion_per_cell=None, layers=['spliced', 'unspliced'], enforce=False,
                       copy=False):
    """Normalize each cell by total counts over all genes.

    `adata.X` and the requested layers are only normalized while
    `not_yet_normalized` reports them as un-normalized, unless `enforce` is set.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell. Overridden by the quantile-based counts
        when `max_proportion_per_cell` lies strictly between 0 and 1.
    key_n_counts : `str`, optional (default: `None`)
        Forwarded to scanpy as the name of the field in `adata.obs` where the
        total counts per cell are stored.
    max_proportion_per_cell : `float` (default: `None`)
        Exclude genes counts that account for more than a specific proportion
        of cell size, e.g. 0.05. Only used when strictly between 0 and 1.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization. When a
        list is given, keys missing from `adata.layers` are skipped.
    enforce : `bool`, optional (default: `False`)
        Normalize even if the data looks normalized already.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original `adata.X`,
    depending on `copy`.
    """
    adata = data.copy() if copy else data
    # Locally shadows this wrapper's own name with the scanpy routine.
    from scanpy.api.pp import normalize_per_cell

    if max_proportion_per_cell is not None and (0 < max_proportion_per_cell < 1):
        counts_per_cell = counts_per_cell_quantile(adata.X, max_proportion_per_cell)

    if not_yet_normalized(adata.X) or enforce:
        normalize_per_cell(adata, counts_per_cell_after, counts_per_cell, key_n_counts)
        add_msg_str = 'X and '
    else:
        add_msg_str = ''

    if isinstance(layers, str):
        layers = [layers]
    else:
        layers = [layer for layer in layers if layer in adata.layers.keys()]

    if all(not_yet_normalized(adata.layers[layer]) for layer in layers) or enforce:
        normalize_layers(adata, layers, counts_per_cell_after, max_proportion_per_cell)
        logg.info('Normalized ' + add_msg_str + 'spliced/unspliced count data.')
    else:
        # Fix: the two adjacent literals used to concatenate without a
        # separating space ("...normalized.If you want...").
        logg.info(
            'Looks like it\'s already normalized. '
            'If you want to (re-)normalize your data, use `scv.pp.normalize_per_cell(adata, enforce=True)`.'
        )

    return adata if copy else None
def filter_and_normalize(data, min_counts=None, min_counts_u=None, min_cells=None, min_cells_u=None,
                         n_top_genes=None, flavor='seurat', log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data

    # 1) gene filtering on spliced / unspliced thresholds
    gene_thresholds = dict(
        min_counts=min_counts,
        min_counts_u=min_counts_u,
        min_cells=min_cells,
        min_cells_u=min_cells_u,
    )
    filter_genes(adata, **gene_thresholds)

    # 2) per-cell normalization
    normalize_per_cell(adata)

    # 3) optional selection of the most variable genes
    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    # 4) optional log transform
    if log:
        log1p(adata)

    if copy:
        return adata
    return None