def test_log1p(self, adata, adata_dist):
    """Check that log1p on the distributed data matches log1p on the in-memory data.

    ``adata`` and ``adata_dist`` are test fixtures supplied by the caller;
    presumably they hold the same matrix in two representations (verify
    against the fixture definitions).
    """
    # Transform the distributed object first and pull its matrix into memory.
    log1p(adata_dist)
    distributed = materialize_as_ndarray(adata_dist.X)

    # Apply the same transform to the reference object.
    log1p(adata)

    assert distributed.shape == adata.shape
    expected_shape = (adata.n_obs, adata.n_vars)
    assert distributed.shape == expected_shape
    npt.assert_allclose(distributed, adata.X)
示例#2
0
 def test(self, Xtest):
     """Assign every row of ``Xtest`` to its closest stored cluster mean.

     The input is wrapped in an ``AnnData``, normalized per cell with the
     same arguments ``(1000, min_counts=0)`` and ``log1p`` transform, then
     compared against ``self.xkibar``.

     Returns the index (along axis 0 of ``self.xkibar``) of the row with the
     smallest ``cdist`` distance, one entry per row of ``Xtest``.
     """
     sample = AnnData(Xtest)
     normalize_per_cell(sample, 1000, min_counts=0)
     log1p(sample)
     # _toarray presumably densifies a sparse matrix so cdist can consume it.
     sample.X = _toarray(sample.X)

     distances = scipy.spatial.distance.cdist(sample.X, self.xkibar)
     nearest = distances.argmin(axis=1)
     return nearest
示例#3
0
 def train(self, adata):
     """Compute one mean expression profile per cluster and store it on ``self``.

     Works on a copy of ``adata``, so the caller's object is not modified
     here. After per-cell normalization and ``log1p``, ``process_clusts`` is
     expected to provide ``uns["num_clusts"]`` and ``uns["clusterindices"]``
     (defined elsewhere; verify there).

     Sets ``self.xkibar`` to an array with one entry per cluster ``k``: the
     mean over axis 0 of the rows of ``X`` selected by
     ``uns["clusterindices"][k]``.
     """
     work = adata.copy()
     work.X = _toarray(work.X)
     normalize_per_cell(work, 1000, min_counts=0)
     log1p(work)
     work = process_clusts(work)

     centroids = []
     for k in range(work.uns["num_clusts"]):
         members = work.uns["clusterindices"][k]
         centroids.append(work.X[members].mean(axis=0).tolist())
     self.xkibar = np.array(centroids)
示例#4
0
def highly_variable_genes(adata, batch_key=None, n_shared=2):
    """Calculate highly variable genes (HVGs) and return adata restricted to them.

    Parameters
    ----------
    adata: `AnnData`
        AnnData object for which HVGs are to be calculated. ``log1p`` is
        called on it without ``copy=True`` and the HVG annotation is written
        with ``inplace=True``, so the object passed in is modified
        (presumably in place - confirm against the ``log1p`` in scope).
    batch_key: `str` | default = None
        Column of ``adata.obs`` to be used as batch. If given, HVGs are
        calculated per batch.
    n_shared: `int` | default = 2
        Only used when ``batch_key`` is given. In addition to the genes
        already flagged in ``adata.var['highly_variable']``, every gene whose
        ``adata.var['highly_variable_nbatches']`` is at least
        ``number_of_batches / n_shared`` is flagged as highly variable.
        A higher value results in a less stringent selection, e.g. with 2 a
        gene needs to be highly variable in at least 50% of the batches.
        Must not be 0 (the threshold is computed by division).

    Returns
    -------
    AnnData object subset to the highly variable genes
    (``adata[:, mask]``; likely a view of the input - copy it if an
    independent object is needed).

    Notes
    -----
    Also prints a progress message, draws the HVG plot with ``show=True`` and
    ``save='.hvg.png'``, and writes two entries to the root logger.
    """

    start = time()
    # take log1p
    log1p(adata)
    print('log1p taken of adata')

    sc_highly_variable_genes(adata,
                             min_mean=0.0125,
                             max_mean=3,
                             min_disp=0.5,
                             inplace=True,
                             batch_key=batch_key)
    if batch_key is not None:
        # Relax the selection: also keep genes that are highly variable in
        # at least n_batches / n_shared of the batches.
        n_batches = len(set(adata.obs[batch_key]))
        shared_enough = (adata.var['highly_variable_nbatches'] >=
                         n_batches / n_shared)
        hvglist = adata.var['highly_variable'].copy()
        hvglist.loc[shared_enough] = True
        adata.var['highly_variable'] = hvglist

    pl_highly_variable_genes(adata, save='.hvg.png', show=True)

    # Element-wise comparison on a pandas column: yields the boolean mask.
    adata = adata[:, adata.var.highly_variable == True]  # noqa: E712

    # logging
    logging.info('After feature selection of highly variable genes: ' +
                 str(adata.shape[0]) + ' cells, ' + str(adata.shape[1]) +
                 ' genes')
    elapsed = round(time() - start, 3)
    logging.info('\tTime for feature selection: ' + str(elapsed) + 's')

    return adata
示例#5
0
def per_cell_normalize(adata, results_folder):
    """Normalize counts per cell, keep a log1p copy as ``adata.raw`` and export.

    Parameters
    ----------
    adata:
        AnnData object; normalized with ``counts_per_cell_after=1e4`` and
        returned after its ``raw`` attribute has been set.
    results_folder:
        Passed as ``basepath`` to ``export_cp10k``.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    t_norm = time()

    # Normalization is done BEFORE the "raw" snapshot is taken, as
    # recommended in the scanpy tutorial.
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    # The snapshot is a log1p-transformed copy; adata.X itself is not logged here.
    logged_copy = log1p(adata, copy=True)
    adata.raw = logged_copy
    print('log1p values saved into adata.raw')

    logging.info('Per cell normalization completed successfully.')
    norm_seconds = round(time() - t_norm, 3)
    logging.info("\tTime for per-cell normalization: " + str(norm_seconds) + 's')

    # Export is timed separately from the normalization step.
    t_export = time()
    export_cp10k(adata, basepath=results_folder)

    logging.info('cp10k values exported to file.')
    export_seconds = round(time() - t_export, 3)
    logging.info("\tTime for cp10k export: " + str(export_seconds) + 's')

    return adata
    def test_write_zarr(self, adata, adata_dist):
        """Round-trip a log1p-transformed distributed matrix through a zarr store.

        The metadata is written from the in-memory ``adata``; the ``X`` array
        of the store is then written from ``adata_dist.X``. The store is read
        back and compared with ``log1p`` applied to ``adata``.
        """
        import dask.array as da
        import zarr

        log1p(adata_dist)
        store = zarr.TempStore()

        chunk_spec = adata_dist.X.chunks
        # A tuple in the first slot means chunks are given as per-axis tuples;
        # collapse them to the first row-chunk size followed by the second
        # axis' chunk tuple.
        if isinstance(chunk_spec[0], tuple):
            chunk_spec = (chunk_spec[0][0], ) + chunk_spec[1]

        # write metadata using regular anndata
        adata.write_zarr(store, chunk_spec)

        x_path = store.dir_path("X")
        if not isinstance(adata_dist.X, da.Array):
            adata_dist.X.to_zarr(x_path, chunk_spec)
        else:
            adata_dist.X.to_zarr(x_path, overwrite=True)

        # read back as zarr directly and check it is the same as adata.X
        roundtrip = ad.read_zarr(store)
        log1p(adata)
        npt.assert_allclose(roundtrip.X, adata.X)
示例#7
0
    def preprocess_for_scrublet(adata):
        """Return a filtered, normalized copy of ``adata`` limited to highly variable genes.

        The input is copied first, so ``adata`` itself is not touched here.
        The copy's ``layers['raw']`` holds ``X`` as it was after filtering but
        before ``pp.normalize_total``.
        """
        filtered = adata.copy()
        pp.filter_genes(filtered, min_cells=3)
        pp.filter_cells(filtered, min_genes=3)

        # Snapshot X before normalization.
        filtered.layers['raw'] = filtered.X.copy()
        pp.normalize_total(filtered)

        # HVG detection runs on a separate log1p copy (copy=True), so
        # `filtered` is not the object being log-transformed.
        log_copy = pp.log1p(filtered, copy=True)
        pp.highly_variable_genes(log_copy)
        hvg_mask = log_copy.var['highly_variable']

        return filtered[:, hvg_mask]
示例#8
0
def recluster(adata,
              celltype,
              celltype_label = 'leiden',
              min_mean = 0.0125,
              max_mean = 4,
              min_disp = 0.5,
              resolution = 1.0,
              regress_out_key = None,
              random_seed = 0,
              show_plot_filter = False,
              method = 'leiden',
              batch_key = None):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset (via ``_subset_adata``; presumably initialized from the raw
    data contained in adata.raw - confirm against that helper). New highly
    variable genes are selected and a new clustering is performed. The
    function returns the adata subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the
    clusters that are to be selected for reclustering as strings or tuple of strings to the parameter
    celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more than one is to be selected please
        pass them as a tuple not as a list!
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    resolution: `float` | default = 1.0
        resolution passed on to the selected clustering function
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out before
        performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable gene
        detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset. Possible:louvain/leiden
    batch_key: `str` | default = None
        Specify a batch key if the HVG calculation should be done per batch

    Returns
    -------

    AnnData object containing the subcluster annotated with PCA, nearest neighbors, leiden or louvain
    clustering (depending on ``method``), and UMAP coordinates.

    Raises
    ------
    ValueError
        if ``method`` is neither 'leiden' nor 'louvain'.
    KeyError
        if ``celltype_label`` is not a column of ``adata.obs``.
    SystemExit
        if ``celltype`` is neither a string nor a tuple (raised through ``sys.exit``).

    Examples
    --------

    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")
    if not isinstance(celltype, (str, tuple)):
        # NOTE: kept as sys.exit (not an exception class change) so existing
        # callers observe the same SystemExit as before.
        sys.exit('specify cluster input as a string or tuple')

    labels = adata.obs.get(celltype_label)
    if labels is None:
        # Fail loudly instead of continuing with a scalar False mask.
        raise KeyError("column '" + str(celltype_label) + "' not found in adata.obs")

    if isinstance(celltype, str):
        selected_cells = labels == celltype
    else:
        # Membership test over all requested labels. This replaces an
        # OR-accumulation that was seeded with `labels == 'NONE'` and would
        # therefore also have selected cells literally labelled 'NONE'.
        selected_cells = labels.isin(celltype)
    cluster_subset = _subset_adata(adata, selected_cells)

    # keep the unfiltered subset accessible through .raw
    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    filter_result = sc_highly_variable_genes(cluster_subset, min_mean = min_mean,
                                             max_mean = max_mean, min_disp = min_disp,  inplace=False, batch_key=batch_key)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.highly_variable)), 'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset, filter_result.highly_variable, axis = 1, raw = False)

    #perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys = regress_out_key)
    sc_scale(cluster_subset)
    sc_pca(cluster_subset, random_state = random_seed, svd_solver='arpack') #using `svd_solver='arpack' ensures that the PCA leads to reproducible results
    neighbors(cluster_subset, n_neighbors=10, random_state = random_seed)
    umap(cluster_subset, random_state = random_seed)
    # method was validated above, so it is exactly one of the two options
    if method == 'louvain':
        louvain(cluster_subset, resolution = resolution, random_state=random_seed)
    else:
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset