示例#1
0
def random_subsample(adata, frac=.5):
    """Return a random subsample of the observations with moments recomputed.

    Each observation is kept independently with probability `frac`, so the
    subsample contains `frac * adata.n_obs` observations only in expectation.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix. As a side effect, the boolean selection mask
        is written to `adata.obs['subset']`.
    frac: `float` (default: 0.5)
        Probability with which each observation is kept.

    Returns
    -------
    A copy of `adata` restricted to the selected observations, on which
    `neighbors` and `moments` have been run.
    """
    # This has to stay a per-observation boolean mask (not a count): it is
    # used both as an `.obs` column and as the row selector below.
    subset = np.random.choice([True, False], size=adata.n_obs, p=[frac, 1 - frac])
    adata.obs['subset'] = subset

    adata_subset = adata[subset].copy()
    neighbors(adata_subset)
    moments(adata_subset)

    return adata_subset
示例#2
0
def moments(adata, n_neighbors=30, n_pcs=30, mode='connectivities', renormalize=False, copy=False):
    """Computes first order moments for velocity estimation.

    If `adata` has no neighbor graph yet, or the stored graph was built with
    fewer than `n_neighbors` neighbors, the graph is (re)computed on `X_pca`;
    the PCA itself is (re)computed when it is missing or has fewer than
    `n_pcs` components.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: 30)
        Number of principal components to use.
    mode: `'connectivities'` or `'distances'`  (default: `'connectivities'`)
        Which entry of `adata.uns['neighbors']` to use as the graph for the
        moment computation.
    renormalize: `bool` (default: `False`)
        Renormalize the moments by total counts per cell to its median.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns or updates `adata` with the attributes
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.

    Raises
    ------
    ValueError
        If `mode` is not a key of `adata.uns['neighbors']`.
    """
    # Honor the documented contract of `copy`: when it is set, all writes
    # below go to a copy and the caller's object is left untouched.
    adata = adata.copy() if copy else adata

    if 'neighbors' not in adata.uns.keys() or n_neighbors > adata.uns['neighbors']['params']['n_neighbors']:
        from scanpy.api.pp import neighbors, pca
        if 'X_pca' not in adata.obsm.keys() or n_pcs > adata.obsm['X_pca'].shape[1]:
            pca(adata, n_comps=n_pcs, svd_solver='arpack')
        neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca')

    if mode not in adata.uns['neighbors']:
        raise ValueError('mode can only be  \'connectivities\' or \'distances\'')

    logg.info('computing moments', r=True)
    normalize_layers(adata)

    connectivities = get_connectivities(adata, mode)

    # Moments are the graph-weighted sums of the count layers, stored dense.
    adata.layers['Ms'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['spliced'])).A
    adata.layers['Mu'] = csr_matrix.dot(connectivities, csr_matrix(adata.layers['unspliced'])).A
    if renormalize:
        normalize_layers(adata, layers={'Ms', 'Mu'})

    logg.info('    finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added to `.layers`\n'
        '    \'Ms\', moments of spliced abundances\n'
        '    \'Mu\', moments of unspliced abundances')
    return adata if copy else None
示例#3
0
def pca_neighbors_umap(adata,
                       results_folder,
                       nrpcs=50,
                       nrpcs_neigh=None,
                       nrneigh=10,
                       method='NULL'):
    '''Calculate PCA, nearest neighbors and UMAP, then export the coordinates.

    The PCA is computed with `svd_solver='arpack'` and its coordinates are
    multiplied by -1 to match Seurat. A figure with the cumulative explained
    variance and the PCA plot is saved to `<results_folder>/figures/PCA.png`.
    All steps use `random_state = 0`.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be processed (modified in place)
    results_folder: `str`
        path to the results folder
    nrpcs: `int` | default = 50
        number of principal components to calculate
    nrpcs_neigh: `int` or `None` | default = None
        number of principal components to use for nearest neighbor calculation.
        When set to None the number is chosen automatically. For .n_vars < 50,
        .X is used, otherwise `X_pca` is used with 50 components.
        Only used when `method` is not 'bbknn'.
    nrneigh: `int` | default = 10
        number of nearest neighbors to use. Only used when `method` is not
        'bbknn'.
    method: `str` | default = 'NULL'
        Method for nearest neighbor calculation. Can be set to 'NULL' or
        'bbknn'; 'bbknn' requires a column 'batch' in `adata.obs`.

    returns
    -------
    The same `adata` object, annotated with PCA, nearest neighbors and UMAP.

    raises
    ------
    ValueError
        if `method` is 'bbknn' but `adata.obs` has no column 'batch'.
    '''
    # Fail early instead of silently skipping the neighbor calculation, which
    # would make the UMAP step fail later or run on a stale neighbor graph.
    if method == 'bbknn' and 'batch' not in adata.obs.columns:
        raise ValueError(
            "method='bbknn' requires a column 'batch' in adata.obs")

    start = time()
    random_state = 0
    print('Using random_state = 0 for all the following calculations')

    #PCA
    sc_pca(adata,
           svd_solver='arpack',
           random_state=random_state,
           n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
    print(
        "PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output."
    )

    #generate plot of PCA
    fig, (ax1, ax2) = subplots(ncols=2, nrows=1)
    fig.set_figwidth(12)
    fig.set_figheight(6)
    fig.tight_layout(pad=4.5)

    cumulative_variance = cumsum(adata.uns['pca']['variance_ratio'])
    # Take the x axis from the data itself so both axes always match in length.
    x = list(range(len(cumulative_variance)))

    ax1.scatter(x=x, y=cumulative_variance)
    ax1.set_ylabel('cumulative explained variance')
    ax1.set_xlabel('PCA components')
    ax1.set_title('cumulative explained variance (as ratio)')

    sc_pl_pca(
        adata,
        ax=ax2,
    )
    fig.savefig(join(results_folder, 'figures', 'PCA.png'))

    #neighbors
    if method == 'bbknn':
        bbknn.bbknn(adata)
    else:
        neighbors(adata,
                  n_neighbors=nrneigh,
                  random_state=random_state,
                  n_pcs=nrpcs_neigh)
        print('Nearest neighbors calculated with n_neighbors = ' +
              str(nrneigh))
        if nrpcs_neigh == 0:
            print('Using .X to calculate nearest neighbors instead of PCs.')
            logging.info(
                'Neighborhood analysis performed with .X instead of PCs.')

    #umap
    sc_umap(adata, random_state=random_state)
    print('UMAP coordinates calculated.')

    logging.info('Neighborhood analysis completed, and UMAP generated.')
    logging.info(
        '\t Time for PCA, nearest neighbor calculation and UMAP generation: ' +
        str(round(time() - start, 3)) + 's')

    #export metadata
    start = time()
    export_metadata(adata,
                    basepath=results_folder,
                    n_pcs=3,
                    umap=True,
                    tsne=False)
    logging.info(
        'Metadata containing 3 PCAs and UMAP coordinates exported successfully to file.'
    )
    logging.info('Time for export: ' + str(round(time() - start, 3)) + 's')

    return adata
示例#4
0
def recluster(adata,
              celltype,
              celltype_label='leiden',
              min_mean=0.0125,
              max_mean=4,
              min_disp=0.5,
              resolution=1.0,
              regress_out_key=None,
              random_seed=0,
              show_plot_filter=False,
              method='leiden'):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset. This datasubset is initialized with the raw data contained in adata.raw. New highly
    variable genes are selected and a new clustering is performed. The function returns the adata
    subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the
    clusters that are to be selected for reclustering as strings or tuple of strings to the parameter
    celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more than one is to be selected please
        pass them as a tuple not as a list!
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    resolution: `float` | default = 1.0
        resolution passed on to the clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out before
        performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable gene
        detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset. Possible:louvain/leiden

    Returns
    -------

    AnnData object containing the subcluster annotated with PCA, nearest neighbors, leiden or louvain
    cluster (depending on `method`), and UMAP coordinates.

    Raises
    ------
    ValueError
        if `method` is neither 'leiden' nor 'louvain', or if `celltype_label` is not a column
        of adata.obs.
    SystemExit
        if `celltype` is neither a string nor a tuple.

    Examples
    --------

    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy.api as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")
    if celltype_label not in adata.obs.columns:
        raise ValueError("celltype_label '" + str(celltype_label) +
                         "' is not a column of adata.obs")

    labels = adata.obs[celltype_label]
    if isinstance(celltype, str):
        selected_cells = labels == celltype
    elif isinstance(celltype, tuple):
        # Select exactly the requested labels; no placeholder label is
        # needed to seed the mask.
        selected_cells = labels.isin(list(celltype))
    else:
        sys.exit('specify cluster input as a string or tuple')
    cluster_subset = _subset_adata(adata, selected_cells)

    # Keep the unfiltered subset accessible via .raw before genes are dropped.
    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    filter_result = filter_genes_dispersion(cluster_subset.X,
                                            min_mean=min_mean,
                                            max_mean=max_mean,
                                            min_disp=min_disp)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.gene_subset)),
          'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   filter_result.gene_subset,
                                   axis=1,
                                   raw=False)

    #perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    # using `svd_solver='arpack'` ensures that the PCA leads to reproducible results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    # `method` was validated above, so it is one of the two options here.
    if method == 'louvain':
        louvain(cluster_subset,
                resolution=resolution,
                random_state=random_seed)
    else:
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset