Example 1
def saveSeurat(adata, path, batch, hvgs=None):
    import re

    import anndata2ri
    import rpy2.robjects as ro
    from scipy import sparse

    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()

    if sparse.issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if sparse.issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata

    ro.r('sobj = as.Seurat(adata, counts="counts", data = "X")')

    # Fix error if levels are 0 and 1
    # ro.r(f'sobj$batch <- as.character(sobj${batch})')
    ro.r(f'Idents(sobj) = "{batch}"')
    ro.r(f'saveRDS(sobj, file="{path}")')
    if hvgs is not None:
        hvg_out = re.sub(r'\.RDS$', '', path) + '_hvg.RDS'
        ro.globalenv['hvgs'] = hvgs
        ro.r('hvgs <- unlist(hvgs)')  # flatten to a plain character vector before saving
        ro.r(f'saveRDS(hvgs, file="{hvg_out}")')

    anndata2ri.deactivate()
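
A minimal usage sketch for saveSeurat, assuming an AnnData object with a "counts" layer and a batch column in `adata.obs`; the file name and key names are illustrative:

import scanpy as sc

adata = sc.read_h5ad("input.h5ad")                 # hypothetical input file
adata.layers["counts"] = adata.X.copy()            # as.Seurat above reads the "counts" layer
saveSeurat(adata, path="adata.RDS", batch="batch",
           hvgs=list(adata.var_names[:2000]))      # optional HVG list, written to adata_hvg.RDS
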
Example 2
    def run(self):
        """
        Function to call scTransform from Python
        """

        ro.r('library(Seurat)')
        ro.r('library(scater)')
        anndata2ri.activate()

        sc.pp.filter_genes(self.data, min_cells=5)

        if issparse(self.data.X):
            if not self.data.X.has_sorted_indices:
                self.data.X.sort_indices()

        for key in self.data.layers:
            if issparse(self.data.layers[key]):
                if not self.data.layers[key].has_sorted_indices:
                    self.data.layers[key].sort_indices()

        ro.globalenv['adata'] = self.data

        ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')

        ro.r('res <- SCTransform(object=seurat_obj)')

        norm_x = ro.r('res@assays$SCT@data').T

        self.data.layers['normalized'] = norm_x

        self.dump_to_h5ad("scTransform")
Example 3
def kBET_single(matrix, batch, type_ = None, k0 = 10, knn=None, subsample=0.5, heuristic=True, verbose=False):
    """
    params:
        matrix: expression matrix (at the moment: a PCA matrix, so do.pca is set to FALSE
        batch: series or list of batch assignemnts
        subsample: fraction to be subsampled. No subsampling if `subsample=None`
    returns:
        kBET p-value
    """
        
    import anndata2ri
    import numpy as np
    import rpy2.rinterface_lib.embedded
    import rpy2.robjects as ro

    anndata2ri.activate()
    ro.r("library(kBET)")
    
    if verbose:
        print("importing expression matrix")
    ro.globalenv['data_mtrx'] = matrix
    ro.globalenv['batch'] = batch
    #print(matrix.shape)
    #print(len(batch))
    
    if verbose:
        print("kBET estimation")
    #k0 = len(batch) if len(batch) < 50 else 'NULL'
    
    ro.globalenv['knn_graph'] = knn
    ro.globalenv['k0'] = k0
    batch_estimate = ro.r(f"batch.estimate <- kBET(data_mtrx, batch, knn=knn_graph, k0=k0, plot=FALSE, do.pca=FALSE, heuristic=FALSE, adapt=FALSE, verbose={str(verbose).upper()})")
            
    anndata2ri.deactivate()
    try:
        return ro.r("batch.estimate$average.pval")[0]
    except rpy2.rinterface_lib.embedded.RRuntimeError:
        return np.nan
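
A sketch of how this wrapper might be called, assuming a PCA embedding in `adata.obsm["X_pca"]` and batch labels in `adata.obs["batch"]`; the key names and the file name are assumptions:

import numpy as np
import scanpy as sc

adata = sc.read_h5ad("integrated.h5ad")            # hypothetical integrated dataset
pca = np.asarray(adata.obsm["X_pca"])              # kBET runs on the embedding (do.pca=FALSE)
pval = kBET_single(pca, adata.obs["batch"].values, k0=10, knn=None, verbose=True)
print("kBET average p-value:", pval)
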
Example 4
def identify_empty_droplets(data, min_cells=3, **kw):
    """Detect empty droplets using DropletUtils

    """
    import anndata2ri
    import rpy2.robjects as robj
    from rpy2.robjects.packages import importr
    importr("DropletUtils")
    adata = data.copy()
    col_sum = adata.X.sum(0)
    if hasattr(col_sum, 'A'):
        col_sum = col_sum.A.squeeze()
        
    keep = col_sum > min_cells
    adata = adata[:,keep]
    #adata.X = adata.X.tocsc()
    anndata2ri.activate()
    robj.globalenv["X"] = adata
    res = robj.r('res <- emptyDrops(assay(X))')
    anndata2ri.deactivate()
    keep = res.loc[res.FDR<0.01,:]
    data = data[keep.index,:] 
    data.obs['empty_FDR'] = keep['FDR']
    
    return data
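
A usage sketch, assuming `raw` is an unfiltered AnnData read straight from the raw barcode matrix; the file name is illustrative:

import scanpy as sc

raw = sc.read_10x_h5("raw_feature_bc_matrix.h5")   # unfiltered droplet matrix
cells = identify_empty_droplets(raw, min_cells=3)
print(f"kept {cells.n_obs} of {raw.n_obs} barcodes")
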
Example 5
def pyScTransform(adata, output_file=None):
    """
    Function to call scTransform from Python
    """
    import anndata2ri
    import rpy2.robjects as ro
    import scanpy as sc
    from scipy.sparse import issparse

    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()

    sc.pp.filter_genes(adata, min_cells=5)
    
    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata

    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')

    ro.r('res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)')

    norm_x = ro.r('res@assays$SCT@scale.data').T

    adata.layers['normalized'] = norm_x

    if output_file:
        adata.write(output_file)
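
A usage sketch for the wrapper above; the file names are assumptions:

import scanpy as sc

adata = sc.read_h5ad("counts.h5ad")                # raw counts expected in adata.X
pyScTransform(adata, output_file="counts_sct.h5ad")
residuals = adata.layers["normalized"]             # SCTransform output stored by the wrapper
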
Example 6
def test_py2rpy_activate(check, shape, dataset):
    try:
        anndata2ri.activate()
        globalenv["adata"] = dataset()
    finally:
        anndata2ri.deactivate()
    ex = globalenv["adata"]
    assert tuple(baseenv["dim"](ex)[::-1]) == shape
    check(ex)
Example 7
def test_convert_activate(check, shape, dataset):
    try:
        anndata2ri.activate()
        ad = dataset()
    finally:
        anndata2ri.deactivate()
    assert isinstance(ad, AnnData)
    assert ad.shape == shape
    check(ad)
Example 8
def deviance(adata, n_genes=4000, rlib_loc=''):
    """
    Wrapper of the 'deviance' method of highly-variable gene selection, included in the 'scry' R package.  
    
    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    n_genes: `int`
        Number of highly-variable genes to return. A selection of 4000-5000 generally yields the best results. 
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages. 
  
    Returns
    -------
    returns an AnnData object reduced to the highly-variable genes. 
    """
    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "deviance requires rpy2. Install with pip install rpy2")
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    import anndata2ri
    from scipy.sparse import issparse

    anndata2ri.activate()

    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(scry))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')

    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()
    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata
    ro.globalenv['n'] = n_genes
    print('Reducing the data to', n_genes, 'variable genes.')
    ro.globalenv['rownam'] = adata.var.index
    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    ro.r('adata <- as.matrix(seurat_obj@assays$RNA@counts)')  # genes in rows, as scry expects
    # rank all genes by deviance once, then take the names of the top n
    ro.r('out <- devianceFeatureSelection(adata)')
    hvgs_r = ro.r('rownam[order(out, decreasing = TRUE)][1:n]')
    adata = adata[:, list(hvgs_r)]
    adata.var['highly_variable'] = True

    return adata
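
A usage sketch, assuming an AnnData object holding raw counts; the file name is illustrative:

import scanpy as sc

adata = sc.read_h5ad("counts.h5ad")
adata_hvg = deviance(adata, n_genes=4000)
print(adata_hvg.shape)                             # n_cells x 4000 after reduction to HVGs
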
Example 9
def save_adata(adata: AnnData, transpose: bool = False):
    anndata2ri.activate()

    if transpose:
        r.saveRDS(adata.X.T, file="adata_t.rds")
    else:
        r.saveRDS(adata.X, file="adata.rds")
    r.saveRDS(adata.obs_names.values, file="obs_names.rds")
    r.saveRDS(adata.var_names.values, file="var_names.rds")

    anndata2ri.deactivate()
Example 10
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced

    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as logs:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        assert len(logs) == 0, [m.message for m in logs]
    finally:
        anndata2ri.deactivate()
Example 11
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced

    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as logs:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        assert len(logs) == 1, [m.message for m in logs]
        assert logs[0].category is NotConvertedWarning
        assert "scipy.sparse.csr.csr_matrix" in str(logs[0].message)
    finally:
        anndata2ri.deactivate()
Example 12
def save_stemnet_cluster_pop(size: int, col: int):
    anndata2ri.activate()

    with open(DATA_DIR / "benchmarking" / "runtime_analysis" / "gpcca.pickle", "rb") as fin:
        data = pickle.load(fin)[size][str(col)]

    # old name: main_states
    cluster_annot = data["terminal_states"]
    clusters = cluster_annot.cat.categories

    df = pd.DataFrame(dict(zip(clusters, [cluster_annot.isin([c]) for c in clusters])))
    r.saveRDS(df, file="cluster_pop.rds")

    anndata2ri.deactivate()
Example 13
def pca_outliers(adata, min_genes_per_cell=5, verbose=True):
    """
    Function to filter outliers using scater PCA on quality measures
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    ro.r('library(scater)')

    pandas2ri.activate()
    anndata2ri.activate()

    print("Loading objects into R")
    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T
    ro.globalenv['variables'] = adata.var_names.copy()
    ro.globalenv['observations'] = adata.obs[['total_counts']]

    print('Calculate PCA outliers')

    ro.r('pd <- DataFrame(data = observations)')
    ro.r('colnames(rawMatrix) <- rownames(pd)')
    ro.r('rownames(rawMatrix) <- variables')
    ro.r(
        'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)'
    )
    ro.r('sce <- calculateQCMetrics(sce)')
    ro.r('sce <- runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE)')
    ro.r('cat("Nr of outliers detected:", sum(sce$outlier), sep=" ")')
    ro.r('outlier2 = sce@colData@rownames[sce$outlier]')
    ro.r(
        'plotReducedDim(sce, use_dimred="PCA", shape_by = "outlier", size_by = "total_counts", colour_by = "total_features_by_counts")'
    )

    outlier2 = ro.r('outlier2')
    adata = adata[np.invert(np.in1d(adata.obs_names, outlier2))].copy()
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)

    return adata
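
A usage sketch; it assumes the `total_counts` QC column used above is computed first, e.g. with scanpy:

import scanpy as sc

adata = sc.read_h5ad("counts.h5ad")                # hypothetical input file
sc.pp.calculate_qc_metrics(adata, inplace=True)    # provides adata.obs['total_counts']
adata_clean = pca_outliers(adata, min_genes_per_cell=5, verbose=False)
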
Example 14
def readSeurat(path):
    import anndata2ri
    import rpy2.robjects as ro

    anndata2ri.activate()
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    ro.r(f'sobj <- readRDS("{path}")')
    adata = ro.r('as.SingleCellExperiment(sobj)')
    anndata2ri.deactivate()

    # Move any 'X_EMB' embedding to the 'X_emb' key
    if 'X_EMB' in adata.obsm:
        if 'X_emb' in adata.obsm:
            print('overwriting existing `adata.obsm["X_emb"]` in the adata object')
        adata.obsm['X_emb'] = adata.obsm['X_EMB']
        del adata.obsm['X_EMB']

    return adata
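
A round-trip sketch combining this reader with the saveSeurat example above; the path and batch key are illustrative:

saveSeurat(adata, path="object.RDS", batch="batch")
adata_back = readSeurat("object.RDS")
print(adata_back)
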
Example 15
def normalize(adata, min_mean=0.1):

    checkAdata(adata)

    # massive speedup when working with sparse matrix
    if not sparse.issparse(
            adata.X):  # quick fix: HVG doesn't work on dense matrix
        adata.X = sparse.csr_matrix(adata.X)

    anndata2ri.activate()
    ro.r('library("scran")')

    # keep raw counts
    adata.layers["counts"] = adata.X.copy()

    # Preliminary clustering for differentiated normalisation
    adata_pp = adata.copy()
    sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
    sc.pp.log1p(adata_pp)
    sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
    sc.pp.neighbors(adata_pp)
    sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

    ro.globalenv['data_mat'] = adata.X.T
    ro.globalenv['input_groups'] = adata_pp.obs['groups']
    size_factors = ro.r(
        f'computeSumFactors(data_mat, clusters = input_groups, min.mean = {min_mean})'
    )
    del adata_pp

    # modify adata
    adata.obs['size_factors'] = size_factors
    adata.X /= adata.obs['size_factors'].values[:, None]
    sc.pp.log1p(adata)
    # convert to sparse, bc operation always converts to dense
    adata.X = sparse.csr_matrix(adata.X)
    adata.raw = adata  # Store the full data set in 'raw' as log-normalised data for statistical testing
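
A usage sketch, assuming `adata.X` holds raw counts and the scran package is installed in the active R library:

normalize(adata, min_mean=0.1)
print(adata.obs["size_factors"].head())            # per-cell size factors estimated by scran
print(adata.layers["counts"])                      # raw counts preserved before normalisation
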
Example 16
def normalize(adata, min_mean=0.1, log=True, precluster=True, sparsify=True):

    checkAdata(adata)

    # Check for 0 count cells
    if np.any(adata.X.sum(axis=1) == 0):
        raise ValueError('found 0 count cells in the AnnData object.'
                         ' Please filter these from your dataset.')

    # Check for 0 count genes
    if np.any(adata.X.sum(axis=0) == 0):
        raise ValueError('found 0 count genes in the AnnData object.'
                         ' Please filter these from your dataset.')

    if sparsify:
        # massive speedup when working with sparse matrix
        if not sparse.issparse(
                adata.X):  # quick fix: HVG doesn't work on dense matrix
            adata.X = sparse.csr_matrix(adata.X)

    anndata2ri.activate()
    ro.r('library("scran")')

    # keep raw counts
    adata.layers["counts"] = adata.X.copy()

    is_sparse = False
    X = adata.X.T
    # convert to CSC if possible. See https://github.com/MarioniLab/scran/issues/70
    if sparse.issparse(X):
        is_sparse = True

        if X.nnz > 2**31 - 1:
            X = X.tocoo()
        else:
            X = X.tocsc()

    ro.globalenv['data_mat'] = X

    if precluster:
        # Preliminary clustering for differentiated normalisation
        adata_pp = adata.copy()
        sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
        sc.pp.log1p(adata_pp)
        sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
        sc.pp.neighbors(adata_pp)
        sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

        ro.globalenv['input_groups'] = adata_pp.obs['groups']
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            'list(counts=data_mat)), clusters = input_groups,'
            f' min.mean = {min_mean}))')

        del adata_pp

    else:
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            f'list(counts=data_mat)), min.mean = {min_mean}))')

    # modify adata
    adata.obs['size_factors'] = size_factors
    adata.X /= adata.obs['size_factors'].values[:, None]
    if log:
        print("Note! Performing log1p-transformation after normalization.")
        sc.pp.log1p(adata)
    else:
        print("No log-transformation performed after normalization.")

    if is_sparse:
        # convert to sparse, bc operation always converts to dense
        adata.X = sparse.csr_matrix(adata.X)

    adata.raw = adata  # Store the full data set in 'raw' as log-normalised data for statistical testing

    # Free memory in R
    ro.r('rm(list=ls())')
    ro.r(
        'lapply(names(sessionInfo()$loadedOnly), require, character.only = TRUE)'
    )
    ro.r(
        'invisible(lapply(paste0("package:", names(sessionInfo()$otherPkgs)), '
        'detach, character.only=TRUE, unload=TRUE))')
    ro.r('gc()')
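
The extended variant exposes the preliminary clustering as an option; a sketch of the two modes (run only one on a given object):

# default: size factors are estimated within Louvain pre-clusters
normalize(adata, min_mean=0.1, precluster=True)

# faster alternative: one pooled estimate across all cells, skipping the log-transform
# normalize(adata, min_mean=0.1, precluster=False, log=False)
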
Example 17
def valOutlier(adata, nmads=3, rlib_loc=''):
    """
    Estimates and returns thresholds for gene/cell filtering, based on outliers detected from the median absolute deviation of the QC metrics. Wrapper around the 'isOutlier' function of the 'scater' R package.
    
    Parameters
    ----------
    adata: `AnnData`
        Unfiltered AnnData object of RNA counts.
    nmads: `int`
        Number of median absolute deviations to use as the threshold for outlier detection. Lenient values (3 to 5) generally yield the best results.
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages. 
  
    Returns
    -------
    The estimated parameters to set in the besca workflow considering the QC distribution. 
    """

    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "valOutlier requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    import anndata2ri
    from scipy.sparse import issparse

    anndata2ri.activate()

    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(scater))')
    ro.r('suppressPackageStartupMessages(library(Matrix))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')

    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['dat'] = adata
    ro.globalenv['sym'] = adata.var['SYMBOL']
    ro.r('seurat_obj = as.Seurat(dat, counts="X", data = NULL)')
    ro.r(
        'dat =  SingleCellExperiment(assays = list(counts=seurat_obj@assays$RNA@counts) )'
    )
    ro.r('rownames(dat) <- sym')
    ro.r('''
    valOutlier <- function(dat, nmads = 3){

      mito <- grep('MT-', rownames(dat))
      if (length(mito) == 0){
        mito <- NULL
      } else {
        mito <- list(Mito = mito)
      }

      stats_cells <- perCellQCMetrics(dat, subsets = mito )
      stats_genes <- perCellQCMetrics(t(counts(dat)))

      lower_detected <- as.numeric(attr(isOutlier(stats_cells$detected, nmads = nmads, type = 'lower'), 'thresholds')['lower'])
      if(lower_detected < 0) lower_detected <- 0
      rm_detected <- sum(isOutlier(stats_cells$detected, nmads = nmads, type = 'lower'))

      lower_expressed <- as.numeric(attr(isOutlier(stats_genes$detected, nmads = nmads, type = 'lower'), 'thresholds')['lower'])
      if(lower_expressed < 0) lower_expressed <- 0
      rm_expressed <- sum(isOutlier(stats_genes$detected, nmads = nmads, type = 'lower'))

      lower_sum <- as.numeric(attr(isOutlier(stats_cells$sum, nmads = nmads, type = 'lower'), 'thresholds')['lower'])
      if(lower_sum < 0) lower_sum <- 0
      rm_sum <- sum(isOutlier(stats_cells$sum, nmads = nmads, type = 'lower'))

      higher_detected <- as.numeric(attr(isOutlier(stats_cells$detected, nmads = nmads, type = 'higher'), 'thresholds')['higher'])
      rm_high_detected <- sum(isOutlier(stats_cells$detected, nmads = nmads, type = 'higher'))

      if(!length(mito) == 0){
        max_mito <- as.numeric(attr(isOutlier(stats_cells$subsets_Mito_percent , nmads = nmads, type = 'higher'), 'thresholds')['higher'])/100
        if(max_mito>1) max_mito <- 1
        rm_mito <- sum(isOutlier(stats_cells$subsets_Mito_percent, nmads = nmads, type = 'higher'))
      }

      higher_sum <- as.numeric(attr(isOutlier(stats_cells$sum, nmads = nmads, type = 'higher'), 'thresholds')['higher'])
      if(is.na(higher_sum)) higher_sum <- as.numeric(attr(isOutlier(stats_cells$sum, nmads = nmads, type = 'higher'), 'thresholds')['higher', 1])
      rm_high_sum <- sum(isOutlier(stats_cells$sum, nmads = nmads, type = 'higher'))

      message('Advised parameters based on outliers with ',nmads, ' NMADS:')
      message('standard_min_genes: ', round(lower_detected,2), ', removing ', rm_detected, ' cells')
      message('standard_min_cells: ', round(lower_expressed, 2), ', removing ', rm_expressed, ' genes')
      message('standard_min_counts: ', round(lower_sum,2), ', removing ', rm_sum, ' cells')
      message('standard_n_genes: ', round(higher_detected, 2), ', removing ', rm_high_detected, ' cells')
      if(!length(mito) == 0) {
          message('standard_percent_mito: ', round(max_mito, 2), ', removing ', rm_mito, ' cells')
      } else {
          message('No mitochondrial gene detected.')
          max_mito <- 1
      }
      message('standard_max_counts: ', round(higher_sum, 2), ', removing ', rm_high_sum, ' cells')
      
      return(c(round(lower_detected,2), 
            round(lower_expressed, 2), 
            round(lower_sum,2), 
            round(higher_detected, 2), 
            round(max_mito, 2), 
            round(higher_sum, 2)))
    }

     ''')
    ro.globalenv['nmads'] = nmads
    return ro.r('valOutlier(dat, nmads = nmads)')
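
A usage sketch; it assumes gene symbols are stored in `adata.var['SYMBOL']`, as the function above requires:

adata.var["SYMBOL"] = adata.var_names              # only if symbols are not already stored there
thresholds = valOutlier(adata, nmads=3)
# the returned vector follows the order of the messages printed by the R helper:
# min_genes, min_cells, min_counts, n_genes, percent_mito, max_counts
print(thresholds)
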
Example 18
def scTransform(adata, hvg=False, n_genes=4000, rlib_loc=''):
    """
    Function to call scTransform normalization or HVG selection from Python. Modified from https://github.com/normjam/benchmark/blob/master/normbench/methods/ad2seurat.py. 
    
    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    hvg: `boolean`
        Whether to run HVG selection (returning a reduced AnnData object) or normalization (returning a normalized AnnData).
    n_genes: `int`
        Number of hvgs to return if the hvg method is selected. A selection of 4000-5000 generally yields the best results. 
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages. 
  
    Returns
    -------
    returns an AnnData object reduced to the highly-variable genes. 
    """

    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "scTransform requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri
    import anndata2ri
    from scipy.sparse import issparse

    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')
    ro.r('suppressPackageStartupMessages(library(scater))')
    anndata2ri.activate()

    sc.pp.filter_genes(adata, min_cells=5)

    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata

    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    if hvg:
        numpy2ri.activate()
        ro.globalenv['n_genes'] = n_genes
        print('Reducing the data to', n_genes, 'variable genes.')
        ro.r(
            'res <- SCTransform(object=seurat_obj, return.only.var.genes = TRUE, do.correct.umi = FALSE, variable.features.n = n_genes)'
        )
        hvgs_r = ro.r('res@assays$SCT@var.features')
        adata = adata[:, list(hvgs_r)]
        adata.var['highly_variable'] = True
        return adata
    else:
        ro.r(
            'res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)'
        )

        norm_x = ro.r('res@assays$SCT@scale.data').T

        adata.layers['counts'] = norm_x
        adata.raw = adata
        return adata
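
Usage sketches for the two modes of the wrapper above, assuming `adata` holds raw counts:

# HVG selection: returns an AnnData reduced to the top n_genes variable genes
adata_hvg = scTransform(adata, hvg=True, n_genes=4000)

# normalisation: returns an AnnData whose 'counts' layer holds the SCTransform output
adata_norm = scTransform(adata, hvg=False)
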
Example 19
def sctransform(adata,
                genes=2000,
                min_genes_per_cell=5,
                method='poisson',
                latent=None,
                batch=None,
                cores=1,
                memory=10,
                verbose=True):
    """
    Function to use scTransform. It requires at least adata.obs['total_counts'] (the number of UMIs per cell) to be present in the data.
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    ro.r('library(scater)')
    ro.r('library(sctransform)')
    ro.r('library(future)')
    pandas2ri.activate()
    anndata2ri.activate()

    print('Filtering genes')
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)

    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T

    latent_var = []

    if latent is None:
        ro.r('cells_info = as.data.frame( colSums(rawMatrix) )')
        ro.globalenv['cellnames'] = np.asarray(adata.obs_names)
        ro.r('rownames(cells_info) = cellnames')
    else:
        latent_var = latent
        ro.globalenv['cells_info'] = adata.obs[latent_var]
        latent_var = ['"data.' + i + '"' for i in latent_var]
    ro.globalenv['genes_name'] = adata.var_names

    ro.r('cell_df <- DataFrame(data = cells_info)')
    #ro.r('print(head(cell_df))')
    #ro.r('print(rownames(cell_df)[1:10])')
    #ro.r('rawMatrix=as.data.frame(rawMatrix)')
    ro.r('colnames(rawMatrix) <- rownames(cell_df)')
    ro.r('rownames(rawMatrix) <- genes_name')
    print('Configure future multithreading')
    ro.globalenv['cores'] = cores
    ro.globalenv['memory'] = memory
    ro.r('future::plan(strategy = \'multicore\', workers = cores)')
    ro.r('options(future.globals.maxSize = memory * 1024 ^ 3)')
    print('Run scTransform')
    ro.globalenv['genes'] = int(genes)
    ro.globalenv['min_genes_per_cell'] = int(min_genes_per_cell)
    ro.globalenv['method'] = method
    stringCommand = 'vst_out=vst( as.matrix(rawMatrix), cell_attr=cell_df, n_genes=genes, method=method, show_progress=TRUE, min_cells=min_genes_per_cell, return_corrected_umi=TRUE'
    #latent_var = ['"data.'+i+'"' for i in latent_var]
    if batch is not None:
        batch = '"data.' + batch + '"'
        stringCommand = stringCommand + ', batch_var=' + batch
        if latent is not None:
            latent_var.remove(batch)
    if ((len(latent_var) > 1) and
        (batch is not None)) | ((len(latent_var) >= 1) and (batch is None)):
        #print(latent_var)
        stringCommand = stringCommand + ', latent_var=c(' + ','.join(
            latent_var) + ')'
    stringCommand += ')'
    print("Running the command:", stringCommand)
    ro.r(stringCommand)
    print('Extract results')
    new_matrix = ro.r('vst_out$y')
    sct_genes = ro.r('rownames(vst_out$model_pars)')
    all_genes = ro.r('rownames(vst_out$y)')
    umi_corrected = ro.r('vst_out$umi_corrected')

    adata = adata[:, all_genes].copy()
    adata.var['highly_variable'] = [i in sct_genes for i in adata.var_names]
    adata.layers['norm_sct'] = np.transpose(new_matrix)
    adata.layers['umi_corr'] = umi_corrected.T.copy()

    return adata
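
A usage sketch; `total_counts` is assumed to be present in `adata.obs`, and the latent column name is illustrative:

import scanpy as sc

sc.pp.calculate_qc_metrics(adata, inplace=True)    # provides adata.obs['total_counts']
adata = sctransform(adata, genes=2000, latent=['total_counts'], cores=4, memory=16)
adata.X = adata.layers['norm_sct'].copy()          # optionally promote the normalised layer
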
Example 20
import scanpy as sc
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import anndata2ri
import numpy as np
import pandas as pd
import os,sys
try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle

#from rpy2.robjects import r

# Activate the anndata2ri conversion between SingleCellExperiment and AnnData
anndata2ri.activate()

#Loading the rpy2 extension enables cell magic to be used
#This runs R code in jupyter notebook cells
%load_ext rpy2.ipython


folder = 'de10;1'
folder = ['de10;1','de10;2','de10;3', 'de10;4', 'de10;5',
          'db10_1', 'db10;2','db10;3', 'db10;4', 'db10;5',
          'dm10;1', 'dm10;2', 'dm10;3','dm10;4', 'dm10;5',
          'dp10;1', 'dp10;2', 'dp10;3', 'dp10;4','dp10;5',
          'db10_1','db10;2', 'db10;3', 'db10;4','db10;5']
folder = 'de10;1'
# folder = ['de10;1','de10;2','de10;3', 'de10;4', 'de10;5']
## single file
Example 21
def pca_covariates(adata, covariates=['total_counts'], verbose=False):
    """
    Function to estimate the variance (R^2) in expression explained by each covariate, using scater's getVarianceExplained, and plot its distribution across genes
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    import seaborn as sns
    import matplotlib.pyplot as plt

    ro.r('library(scater)')

    pandas2ri.activate()
    anndata2ri.activate()

    print("Loading objects into R")
    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = np.log1p(adata.X.T.todense())
    else:
        ro.globalenv['rawMatrix'] = np.log1p(adata.X.T)
    ro.globalenv['observations'] = adata.obs[covariates]

    print('Calculate PCA covariates')

    ro.r('pd <- DataFrame(data = observations)')
    #ro.r('print(pd[1:5,])')
    ro.r('colnames(rawMatrix) <- rownames(pd)')
    ro.r(
        'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)'
    )
    commandString = 'getVarianceExplained(sce, exprs_values = "counts", variables = c('
    variables = ['"data.' + i + '"' for i in covariates]
    commandString = commandString + ','.join(variables) + ') )'
    print("using the R command")
    print(commandString)
    vals = ro.r(commandString)
    medians = np.argsort(-np.median(vals, 0))
    medianVals = -np.sort(-np.median(vals, 0))
    vals = pd.DataFrame(vals[:, medians])
    #print(covariates)
    #print(medians)
    vals.columns = np.asarray(covariates)[medians]
    plt.rcParams['figure.figsize'] = (8, 8)
    f, ax = plt.subplots(1)
    for nn, mm in zip(vals.columns, medianVals):
        sns.kdeplot(vals[nn], ax=ax, label=nn, clip=(mm, 97), gridsize=100)
    ax.set_xscale("symlog")
    #plt.xlim(0,100)
    ax.legend(title="Covariates", loc='best')

    adata.uns['pca_covariates'] = vals

    return adata
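
A usage sketch; the covariate column names are assumptions about what is present in `adata.obs`:

import scanpy as sc

sc.pp.calculate_qc_metrics(adata, inplace=True)
adata = pca_covariates(adata, covariates=['total_counts', 'n_genes_by_counts'])
print(adata.uns['pca_covariates'].head())          # variance explained per covariate, sorted
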