Example #1
def svd(data_norm, scale=True, ncomp=75, only_sdev=False):
    """Principal component analysis via singular value decomposition

    Parameters
    ----------
    data_norm : :class:`pandas.DataFrame`
        A pandas data frame containing normalized gene expression
        data. Preferably this should be a subset of the normalized
        gene expression matrix containing highly variable genes.
    scale : `bool`
        Scales input data prior to PCA. Default: True
    ncomp : `int`
        Number of components to return. Default: 75
    only_sdev : `bool`
        Only return the standard deviation of the components. Default:
        False

    References
    ----------
    .. [1] https://tinyurl.com/yyt6df5x

    Returns
    -------
    `pd.DataFrame`
        A :py:class:`pandas.DataFrame` containing the components
        (columns). Only returned if only_sdev is False.
    `pd.DataFrame`
        A :py:class:`pandas.DataFrame` containing the contributions of
        every gene (rows). Only returned if only_sdev is False.
    `numpy.ndarray`
        An array containing the standard deviations of the
        components. Only returned if only_sdev is True.
    """
    """
    inp = data_norm
    idx = inp.index
    cols = inp.columns
    inp = inp.transpose()
    if scale:
        inp = sklearn_scale(
            inp,  # cells as rows and genes as columns
            # over genes, i.e. features (columns)
            axis=0,
            with_mean=True,  # subtracting the column means
            with_std=True)  # scale the data to unit variance
        inp = pd.DataFrame(inp, columns=idx, index=cols)
    # rows of inp are cells (observations) after the transpose above
    nobs = inp.shape[0]
    compute_uv = not only_sdev
    if only_sdev:
        s = scipy.linalg.svd(inp, compute_uv=compute_uv)
        sdev = s / np.sqrt(nobs - 1)
        return sdev
    # cells should be rows and genes as columns
    U, s, Vh = scipy.linalg.svd(inp, compute_uv=compute_uv)
    Vh = Vh.transpose()
    retx = inp.dot(Vh)
    retx = retx.iloc[:, 0:ncomp]
    comp = retx
    # gene loadings
    contr = pd.DataFrame(Vh[:, 0:ncomp], index=inp.columns)
    return comp, contr
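
A hedged usage sketch on a small random matrix (genes as rows, cells as
columns, matching the transpose performed inside the function). The gene and
cell names below are purely illustrative, and `sklearn_scale` is assumed to be
an alias for `sklearn.preprocessing.scale`.

import numpy as np
import pandas as pd

# toy normalized expression matrix: 300 genes x 50 cells
rng = np.random.default_rng(0)
data_norm = pd.DataFrame(rng.normal(size=(300, 50)),
                         index=['gene%d' % i for i in range(300)],
                         columns=['cell%d' % j for j in range(50)])

comp, contr = svd(data_norm, scale=True, ncomp=20)
print(comp.shape)   # (50, 20): component scores per cell
print(contr.shape)  # (300, 20): loadings per gene

sdev = svd(data_norm, only_sdev=True)  # array of component standard deviations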
Example #2
    def pca(self):

        # remove WHERE when table cleaned up to remove header rows
        statement = (
            """SELECT transcript_id, TPM, sample_id FROM %s
        where transcript_id != 'Transcript' """
            % self.table
        )

        # fetch data
        df = self.getDataFrame(statement)

        # pivot so rows = transcripts, columns = samples, values = TPM
        pivot_df = df.pivot(index="transcript_id", columns="sample_id", values="TPM")

        # filter dataframe to get rid of genes where TPM == 0 across samples
        filtered_df = pivot_df[pivot_df.sum(axis=1) > 0]

        # add a small pseudocount (0.1) and log transform the data
        logdf = np.log(filtered_df + 0.1)

        # Scale dataframe so variance =1 across rows
        logscaled = sklearn_scale(logdf, axis=1)

        # turn array back to df and add transcript id back to index
        logscaled_df = pd.DataFrame(logscaled)
        logscaled_df.index = list(logdf.index)

        # Now do the PCA - can change n_components
        sklearn_pca = sklearnPCA(n_components=self.n_components)
        sklearn_pca.fit(logscaled_df)

        index = logdf.columns

        return sklearn_pca, index
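
The method above depends on `self.getDataFrame`, `self.table` and
`self.n_components`, so it is not runnable in isolation. Below is a minimal,
self-contained sketch of the same pivot / log / scale / PCA pipeline on a toy
TPM table; all names and values are illustrative, and `sklearn_scale` /
`sklearnPCA` are assumed to be the usual scikit-learn imports.

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale as sklearn_scale
from sklearn.decomposition import PCA as sklearnPCA

# toy long-format table: one row per (transcript, sample) TPM value
df = pd.DataFrame({
    'transcript_id': ['t1', 't1', 't1', 't2', 't2', 't2', 't3', 't3', 't3'],
    'sample_id':     ['s1', 's2', 's3', 's1', 's2', 's3', 's1', 's2', 's3'],
    'TPM':           [5.0, 7.0, 6.0, 0.0, 0.0, 0.0, 3.0, 1.0, 2.0],
})

# rows = transcripts, columns = samples, values = TPM
pivot_df = df.pivot(index='transcript_id', columns='sample_id', values='TPM')
filtered_df = pivot_df[pivot_df.sum(axis=1) > 0]  # drop all-zero transcripts
logdf = np.log(filtered_df + 0.1)                 # pseudocount + log transform
logscaled = sklearn_scale(logdf, axis=1)          # unit variance per transcript

sklearn_pca = sklearnPCA(n_components=2)
sklearn_pca.fit(pd.DataFrame(logscaled, index=logdf.index))
print(sklearn_pca.explained_variance_ratio_)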
Example #3
def impute(obj,
           filtered=True,
           res=0.5,
           drop_thre=0.5,
           nworkers='auto',
           verbose=True):
    """Impute dropouts using the method described in Li (2018) Nature
    Communications

    Notes
    -----
    Dropouts are artifacts in scRNA-seq data. One method to alleviate
    the problem with dropouts is to perform imputation (i.e. replacing
    missing data points with predicted values).

    The present method uses a different procedure for subpopulation
    identification as compared with the original paper.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    filtered : `bool`
        If data have been filtered using
        :func:`adobo.preproc.simple_filter`, run imputation on
        filtered data; otherwise runs on the entire raw read count
        matrix.  Default: True
    res : `float`
        Resolution parameter for the Leiden clustering, change to
        modify cluster resolution. Default: 0.5
    drop_thre : `float`
        Drop threshold. Default: 0.5
    nworkers : `int` or `{'auto'}`
        If a string, then the only accepted value is 'auto', and the
        number of worker processes will be the total number of
        detected physical cores. If an integer then it specifies the
        number of worker processes. Default: 'auto'
    verbose : `bool`
        Be verbose or not. Default: True

    References
    ----------
    .. [1] Li & Li (2018)
           An accurate and robust imputation method scImpute for single-cell
           RNA-seq data https://www.nature.com/articles/s41467-018-03405-7
    .. [2] https://github.com/Vivianstats/scImpute

    Returns
    -------
    Modifies the passed object.
    """
    ncores = psutil.cpu_count(logical=False)
    if type(nworkers) == str:
        if nworkers == 'auto':
            nworkers = ncores
        else:
            raise Exception('Invalid value for parameter "nworkers".')
    elif type(nworkers) == int:
        if nworkers > ncores:
            warning('"nworkers" is set to a number higher than the available \
number of physical cores on this machine (n=%s).' % ncores)
    if verbose:
        print('%s worker processes will be used' % nworkers)
    # load the compiled extension containing normal and gamma probability
    # density functions implemented in C (a bit faster than using scipy.stats)
    time_start = time.time()
    ext = None
    for p in sys.path:
        pp = glob.glob('%s/pdf.*.so' % p)
        if len(pp) == 1:
            ext = ctypes.cdll.LoadLibrary(pp[0])
            break
    if ext is None:
        raise Exception('Could not find the compiled extension pdf.*.so on sys.path.')
    ext.dgamma.argtypes = [
        npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS'),
        ctypes.c_int, ctypes.c_double, ctypes.c_double,
        npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS')
    ]
    ext.dnorm.argtypes = [
        npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS'),
        ctypes.c_int, ctypes.c_double, ctypes.c_double,
        npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS')
    ]
    # normalize
    raw = obj.count_data.copy()
    if filtered:
        # Remove low quality cells
        remove = obj.meta_cells.status[obj.meta_cells.status != 'OK']
        raw = raw.drop(remove.index, axis=1)
        # Remove uninformative genes (e.g. lowly expressed and ERCC)
        remove = obj.meta_genes.status[obj.meta_genes.status != 'OK']
        raw = raw.drop(remove.index, axis=0)
        if verbose:
            print('Running on the quality filtered data (dimensions %sx%s)' %
                  raw.shape)
    col_sums = np.array([np.sum(i[1]) for i in raw.transpose().iterrows()])
    raw = raw * (10**6 / col_sums)
    lnorm = np.log10(raw + 1.01)
    # work on a copy so that the original normalized values stay intact while
    # clusters are imputed one at a time
    lnorm_imp = lnorm.copy()
    # estimate subpopulations
    hvg = seurat(lnorm, ngenes=1000)  # get hvg
    lnorm_hvg = lnorm[lnorm.index.isin(hvg)]
    d_scaled = sklearn_scale(
        lnorm_hvg.transpose(),  # cells as rows and genes as columns
        # over genes, i.e. features (columns)
        axis=0,
        with_mean=True,  # subtracting the column means
        with_std=True)  # scale the data to unit variance
    d_scaled = pd.DataFrame(d_scaled.transpose(), index=lnorm_hvg.index)
    comp, _ = irlb(d_scaled)
    # estimating subpopulations
    nn_idx = knn(comp)
    snn_graph = snn(nn_idx)
    cl = np.array(leiden(snn_graph, res))
    nclust = len(np.unique(cl))
    if verbose:
        print('going to work on %s clusters' % nclust)

    def weight(x, params):
        inp = x
        g_out = np.zeros(len(inp))
        n_out = np.zeros(len(inp))
        # takes scale as input (rate=1/scale)
        ext.dgamma(np.array(inp), len(inp), params[1], 1 / params[2], g_out)
        # SLOW (scipy.stats): dgamma.pdf(x, a=params[1], scale=1, loc=0)
        pz1 = params[0] * g_out
        ext.dnorm(np.array(inp), len(inp), params[3], params[4], n_out)
        # SLOW (scipy.stats): norm.pdf(x, params[3], params[4])
        pz2 = (1 - params[0]) * n_out
        pz = pz1 / (pz1 + pz2)
        pz[pz1 == 0] = 0
        return np.array([pz, 1 - pz])

    def update_gmm_pars(x, wt):
        tp_s = np.sum(wt)
        tp_t = np.sum(wt * x)
        tp_u = np.sum(wt * np.log(x))
        tp_v = -tp_u / tp_s - np.log(tp_s / tp_t)
        if tp_v <= 0:
            alpha = 20
        else:
            alpha0 = (3 - tp_v +
                      np.sqrt((tp_v - 3)**2 + 24 * tp_v)) / 12 / tp_v
            if alpha0 >= 20:
                alpha = 20
            else:
                alpha = root(lambda x: np.log(x) - digamma(x) - tp_v,
                             0.9 * alpha0).x[0]
        beta = tp_s / tp_t * alpha
        return alpha, beta

    def dmix(x, pars):
        inp = x
        g_out = np.zeros(len(inp))
        n_out = np.zeros(len(inp))
        ext.dgamma(np.array(inp), len(inp), pars[1], 1 / pars[2], g_out)
        #dg = dgamma(a=pars[1], scale=1/pars[2], loc=0)
        # dg.pdf(x)
        #dn = norm(pars[3], pars[4])
        # dn.pdf(x)
        ext.dnorm(np.array(inp), len(inp), pars[3], pars[4], n_out)
        return pars[0] * g_out * 2 + (1 - pars[0]) * n_out

    def para_est(x):
        params = [0, 0.5, 1, 0, 0]
        params[0] = np.sum(x == np.log10(1.01)) / len(x)
        if params[0] == 0:
            params[0] = 0.01
        x_rm = x[x > np.log10(1.01)]
        params[3] = np.mean(x_rm)
        params[4] = np.std(x_rm)
        eps, iter_, loglik_old = 10, 0, 0
        while eps > 0.5:
            wt = weight(x, params)
            params[0] = np.sum(wt[0]) / len(wt[0])
            params[3] = np.sum(wt[1] * x) / np.sum(wt[1])
            params[4] = np.sqrt(
                np.sum(wt[1] * (x - params[3])**2) / np.sum(wt[1]))
            params[1:3] = update_gmm_pars(x, wt[0])
            loglik = np.sum(np.log10(dmix(x, params)))
            eps = (loglik - loglik_old)**2
            loglik_old = loglik
            iter_ = iter_ + 1
            if iter_ > 100:
                break
        return params

    def get_par(mat, verbose):
        null_genes = np.abs(mat.sum(axis=1) -
                            np.log10(1.01) * mat.shape[1]) < 1e-10
        null_genes = null_genes[null_genes].index
        paramlist = []
        i = 0
        for g, k in mat.iterrows():
            if verbose:
                if (i % 100) == 0:
                    v = ('{:,}'.format(i), '{:,}'.format(mat.shape[0]))
                    s = 'estimating model parameters. finished with %s/%s genes' % v
                    print(s, end='\r')
            if g in null_genes:
                paramlist.append([np.nan] * 5)
            else:
                paramlist.append(para_est(k.values))
            i += 1
        if verbose:
            print('\nmodel parameter estimation has finished')
        return np.array(paramlist)

    def find_va_genes(mat, parlist):
        point = np.log10(1.01)
        is_na = [not np.any(i) for i in np.isnan(np.array(parlist))]
        valid_genes = np.logical_and(
            mat.sum(axis=1) > point * mat.shape[1], is_na)
        return valid_genes
        #mu = parlist[:, 3]
        #sgene1 = valid_genes.index[mu<=np.log10(1+1.01)]
        #dcheck1 = dgamma.pdf(mu+1, a=parlist[:,1], scale=1/parlist[:,2], loc=0)
        #dcheck2 = norm.pdf(mu+1, parlist[:, 3], parlist[:, 4])
        #sgene3 = valid_genes.index[np.logical_and(dcheck1 >= dcheck2, mu <= 1)]
        # return valid_genes[np.logical_not(np.logical_or(sgene1,sgene3))].index

    for cc in np.arange(0, nclust):
        if verbose:
            print('estimating dropout probability for cluster %s' % cc)
        lnorm_cc = lnorm.iloc[:, cl == cc]
        # estimate model parameters
        parlist = get_par(lnorm_cc, verbose)
        if verbose:
            print('searching for valid genes for cluster %s' % cc)
        valid_genes = find_va_genes(lnorm_cc, parlist)
        if verbose:
            print('%s genes are valid' % '{:,}'.format(int(valid_genes.sum())))
        subcount = lnorm_cc.loc[valid_genes, :]
        subcount = subcount.reindex(valid_genes[valid_genes].index)
        Ic = subcount.shape[0]
        Jc = subcount.shape[1]
        if Jc == 1:
            continue
        parlist = parlist[valid_genes]
        idx = 0
        droprate = []
        for g, k in subcount.iterrows():
            wt = weight(k, parlist[idx])[0]
            idx += 1
            droprate.append(wt)
        droprate = np.array(droprate)
        mu = parlist[:, 3]
        mucheck = subcount.apply(lambda x: x > mu, axis=0)
        droprate[np.logical_and(mucheck, droprate > drop_thre)] = 0
        # dropouts
        if verbose:
            print('running imputation for cluster %s' % cc)
        imputed = []
        pool = Pool(nworkers)

        def update_result(yimpute):
            imputed.append(yimpute)

        time_s = time.time()
        ids = np.arange(0, subcount.shape[1])
        if len(ids) < nworkers or len(ids) < 50:
            batch_size = len(ids)
        else:
            batch_size = round(len(ids) / nworkers)
        batch = 1
        while len(ids) > 0:
            ids_b = ids[0:batch_size]
            args = (ids_b, subcount, droprate, cc, Ic, Jc, drop_thre, verbose,
                    batch)
            r = pool.apply_async(_imputation_worker,
                                 args=args,
                                 callback=update_result)
            ids = ids[batch_size:]
            batch += 1
        pool.close()
        pool.join()
        if len(imputed) == 0:
            continue
        # sorting b/c cells are not returned from subprocesses in the
        # original order
        cellids = []
        d = []
        for item in imputed:
            cellids.append(item[0])
            d.append(item[1])
        cellids = np.concatenate(cellids)
        imputed = np.concatenate(d)
        time_e = time.time()
        if verbose:
            v = (cc, (time_e - time_s) / 60)
            print('imputation for cluster %s finished in %.2f minutes' % v)
        imputed = imputed.transpose()
        imputed = pd.DataFrame(imputed)
        imputed.columns = cellids
        imputed.index = valid_genes[valid_genes].index
        imputed = imputed.sort_index(axis=1)
        lnorm_imp.loc[valid_genes, lnorm_cc.columns] = imputed.to_numpy()
    # reverse normalisation
    lnorm_imp = 10**lnorm_imp - 1.01
    lnorm_imp = lnorm_imp * (col_sums / 10**6)
    lnorm_imp = round(lnorm_imp, 2)
    obj.imp_count_data = lnorm_imp
    time_end = time.time()
    if verbose:
        t = (time_end - time_start) / 60
        print('imputation finished in %.2f minutes. imputed data are present \
in the "imp_count_data" attribute.' % t)
    obj.set_assay(sys._getframe().f_code.co_name)
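
A hedged usage sketch, assuming an adobo dataset object that has already been
loaded and quality filtered. The file path is illustrative, the exact
`simple_filter` arguments are an assumption, and the `imp_count_data`
attribute is the one described in the docstring above.

import adobo

# load a raw count matrix into a dataset object (path is illustrative)
obj = adobo.IO.load_from_file('raw_counts.mat', desc='example dataset')

# quality filtering so that impute(..., filtered=True) has cell/gene status
# flags to work with (default arguments assumed)
adobo.preproc.simple_filter(obj)

impute(obj, filtered=True, res=0.5, drop_thre=0.5, nworkers='auto', verbose=True)

# imputed counts end up on the object
print(obj.imp_count_data.shape)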
Example #4
def cell_cycle_predict(obj, clf, tr_features, name=(), verbose=False):
    """Predicts cell cycle phase

    Notes
    -----
    The classifier is trained on mouse data, so it should _only_ be
    used on mouse data unless it is trained on something else. Gene
    identifiers must use ensembl identifiers (prefixed with
    'ENSMUSG'); pure gene symbols are not enough. Results are returned
    as a column in the data frame `meta_cells` of the passed
    object. Does not return probability scores.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    clf : `sklearn.linear_model.SGDClassifier`
        The classifier.
    tr_features : `list`
        Training features.
    name : `tuple`
        A tuple of normalizations to use. If it has length zero,
        then all available normalizations will be used.
    verbose : `bool`
        Be verbose. Default: False

    Returns
    -------
    Modifies the passed object.
    """
    targets = {}
    if len(name) == 0 or name == '':
        targets = obj.norm_data
    else:
        targets[name] = obj.norm_data[name]
    for i, k in enumerate(targets):
        if verbose:
            print('Running cell cycle prediction on %s' % k)
        item = targets[k]
        X = item['data']
        cols = X.columns
        if X.index[0].rfind('ENSMUSG') < 0:
            raise Exception('Gene identifiers must use ENSMUSG format. Are \
you sure this is mouse data?')
        # second token of the training feature names (the ENSMUSG part)
        symb = [i[1] for i in tr_features.str.split('_')]
        symb = pd.Series(symb)
        # strip version suffixes and symbol prefixes from the input identifiers
        X_g = X.index
        if re.search(r'ENSMUSG\d+\.\d+', X_g[0]):
            X_g = X_g.str.extract(r'^(.*)\.[0-9]+$', expand=False)
        if re.search('_ENSMUSG', X_g[0]):
            X_g = X_g.str.extract(r'^\S+?_(\S+)$', expand=False)
        # keep only the genes that were part of the training features
        X_found = X[X_g.isin(symb)]
        if len(X_found) == 0:
            raise Exception('No genes found.')
        X_g = X_found.index
        if re.search(r'ENSMUSG\d+\.\d+', X_g[0]):
            X_g = X_g.str.extract(r'^(.*)\.[0-9]+$', expand=False)
        if re.search('_ENSMUSG', X_g[0]):
            X_g = X_g.str.extract(r'^\S+?_(\S+)$', expand=False)
        X_found.index = X_g
        missing = symb[np.logical_not(symb.isin(X_g))]
        X_empty = pd.DataFrame(np.zeros((len(missing), X_found.shape[1])))
        X_empty.index = missing
        X_empty.columns = X_found.columns
        X = pd.concat([X_found, X_empty])
        X = X.reindex(symb)
        # scale
        X = X.transpose()  # cells as rows and genes as columns
        X = sklearn_scale(
            X,
            # over genes, i.e. features (columns)
            axis=0,
            with_mean=True,  # subtracting the column means
            with_std=True)  # scale the data to unit variance
        pred = clf.predict(X)
        srs = pd.Series(pred, dtype='category', index=cols)
        obj.add_meta_data(axis='cells',
                          key='cell_cycle',
                          data=srs,
                          type_='cat')
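
A hedged sketch of how this function might be called together with
`cell_cycle_train` (next example). The dataset object `obj` is assumed to be
an already normalized adobo dataset with ENSMUSG gene identifiers, and the
'cell_cycle' column name follows the `add_meta_data` call above.

# `obj` is an adobo.data.dataset prepared as in the earlier examples
clf, tr_features = cell_cycle_train(verbose=True)
cell_cycle_predict(obj, clf, tr_features, name=(), verbose=True)

# predictions are stored as a categorical cell annotation named 'cell_cycle'
print(obj.meta_cells['cell_cycle'].value_counts())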
Example #5
def cell_cycle_train(verbose=False):
    """Trains a cell cycle classifier using Stochastic Gradient
    Descent with data from Buettner et al.

    Notes
    -----
    Genes are selected from GO:0007049.

    The classifier only needs to be trained once; subsequent calls
    load the serialized classifier from disk.

    Parameters
    ----------
    verbose : `bool`
        Be verbose or not. Default: False

    References
    ----------
    .. [1] Buettner et al. (2015) Computational analysis of
           cell-to-cell heterogeneity in single-cell RNA-sequencing
           data reveals hidden subpopulations of cells. Nat Biotech.

    Returns
    -------
    `sklearn.linear_model.SGDClassifier`
        A trained classifier.
    `list`
        Containing training features.
    """
    path_pkg = re.sub('/_log.py', '', adobo._log.__file__)
    path_data = path_pkg + '/data/Buettner_2015.mat'
    path_gene_lengths = path_pkg + '/data/Buettner_2015.mat.lengths'
    path_cc_genes = path_pkg + '/data/GO_0007049.txt'  # cell cycle genes
    path_clf = path_pkg + '/data/cc_classifier.joblib'
    if os.path.exists(path_clf):
        clf, features = joblib.load(path_clf)
        if verbose:
            print('A trained classifier was found. \
Loading it from %s' % path_clf)
    else:
        desc = 'Buettner et al. (2015) doi:10.1038/nbt.3102'
        B = adobo.IO.load_from_file(path_data, desc=desc)
        adobo.preproc.detect_ercc_spikes(B, ercc_pattern='NA_ERCC-[0-9]+')
        adobo.normalize.norm(B, method='rpkm', gene_lengths=path_gene_lengths)
        cc_genes = pd.read_csv(path_cc_genes, sep='\t', header=None)
        symb = pd.Series([i[0] for i in B.norm.index.str.split('_')])
        norm_cc_mat = B.norm[symb.isin(cc_genes[1]).values]
        X = norm_cc_mat.transpose()  # cells as rows and genes as columns
        X = sklearn_scale(
            X,
            # over genes, i.e. features (columns)
            axis=0,
            with_mean=True,  # subtracting the column means
            with_std=True)  # scale the data to unit variance
        Y = [i[0] for i in norm_cc_mat.columns.str.split('_')]

        clf = SGDClassifier(loss='hinge',
                            penalty='l2',
                            max_iter=5,
                            shuffle=True,
                            verbose=verbose)
        clf.fit(X, Y)
        features = norm_cc_mat.index
        joblib.dump([clf, features], path_clf)
    # np.sum(clf.predict(X) != Y)
    return clf, features
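
A minimal usage sketch, assuming the bundled Buettner et al. (2015) training
data ships with the installed package as the paths above suggest.

clf, tr_features = cell_cycle_train(verbose=True)

# the feature index defines the gene order expected by cell_cycle_predict
print(len(tr_features))
print(clf.classes_)  # cell cycle phases seen during training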
Example #6
def cell_type_predict(obj,
                      name=(),
                      clustering=(),
                      min_cluster_size=10,
                      cell_type_markers=None,
                      verbose=False):
    """Predicts cell types using the expression of marker genes

    Notes
    -----
    Gene identifiers should be in symbol form, not Ensembl
    identifiers, etc.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    name : `tuple`
        A tuple of normalizations to use. If it has length zero,
        then all available normalizations will be used.
    clustering : `tuple`, optional
        Specifies the clustering outcomes to work on.
    min_cluster_size : `int`
        Minimum number of cells per cluster; clusters smaller than
        this are ignored.  Default: 10
    cell_type_markers : `pandas.DataFrame`
        Source of gene markers used to define cell types. This is set
        to None as default, indicating that PanglaoDB markers will be
        used. To use custom markers, set this to a pandas data frame
        where the first column is a gene and the second column is the
        name of the cell type (every cell type will have multiple
        rows). Default: None
    verbose : `bool`
        Be verbose or not. Default: False

    Returns
    -------
    Modifies the passed object.
    """
    targets = {}
    if len(name) == 0 or name == '':
        targets = obj.norm_data
    else:
        targets[name] = obj.norm_data[name]

    if isinstance(cell_type_markers, pd.DataFrame):
        # custom cell type markers were provided
        ma_ss = cell_type_markers
        ma_ss.columns = ['official gene symbol', 'cell type']
    else:
        ma = pd.read_csv('%s/data/markers.tsv' %
                         os.path.dirname(adobo.IO.__file__),
                         sep='\t')
        # restrict to mouse
        ma = ma[ma.species.str.match('Mm')]
        markers = ma
        ui = ma.iloc[:, ma.columns == 'ubiquitousness index']
        ma = ma[np.array(ui).flatten() < 0.05]
        ma_ss = ma.iloc[:,
                        ma.columns.isin(['official gene symbol', 'cell type'])]

    marker_freq = ma_ss[ma_ss.columns[0]].value_counts()
    markers = ma_ss
    # reference symbols
    fn = '%s/data/mouse_gene_symbols.txt' % os.path.dirname(adobo.IO.__file__)
    mgs = pd.read_csv(fn, header=None)
    mgs = mgs[0].str.upper()
    markers = markers[markers[markers.columns[0]].isin(mgs)]
    dd = defaultdict(list)
    for item in markers.groupby('cell type'):
        dd[item[0]] = set(item[1][item[1].columns[0]])
    # down-weighting overlapping genes improves gene set analysis
    # Tarca AL, Draghici S, Bhatti G, Romero R; BMC Bioinformatics 2012 13:136
    s = mgs.unique()
    s_freqs = marker_freq[marker_freq.index.isin(s)]
    weights = 1 + np.sqrt(
        ((max(marker_freq) - s_freqs) / (max(marker_freq) - min(marker_freq))))

    def _guess_cell_type(x):
        rr = median_expr.loc[:, median_expr.columns == x.name].values.flatten()
        # genes expressed in this cell cluster
        genes_exp = set(x.index[rr > 0])
        # genes _not_ expressed in this cell cluster
        genes_not_exp = set(x.index[rr == 0])
        res = list()
        for ct in dd:
            s = dd[ct]
            x_ss = x[x.index.isin(s)]
            if len(x_ss) == 0:
                continue
            gene_weights = weights[weights.index.isin(x_ss.index)]
            gene_weights = pd.Series(gene_weights, x_ss.index)
            activity_score = sum(x_ss * gene_weights) / len(x_ss)**0.3
            # how many expressed genes are found in the geneset?
            ct_exp = len(genes_exp & s)
            # how many _non_ expressed genes are found in the geneset?
            ct_non_exp = len(genes_not_exp & s)
            # how many expressed genes are NOT found in the geneset?
            ct_exp_not_found = len(genes_exp - s)
            # how many _non_ expressed genes are NOT found in the geneset?
            not_exp_not_found_in_geneset = len(genes_not_exp - s)
            # one sided fisher
            contingency_tbl = [[ct_exp, ct_non_exp],
                               [ct_exp_not_found, not_exp_not_found_in_geneset]]
            odds_ratio, pval = fisher_exact(contingency_tbl,
                                            alternative='greater')
            markers_found = ','.join(list(genes_exp & s))
            if markers_found == '':
                markers_found = 'NA'
            res.append({
                'activity_score': activity_score,
                'ct': ct,
                'pvalue': pval,
                'markers': markers_found
            })
        res = sorted(res, key=lambda k: k['activity_score'], reverse=True)
        return res

    for i, k in enumerate(targets):
        if verbose:
            print('Running cell type prediction on %s' % k)
        item = targets[k]
        X = item['data']
        clusters = item['clusters']
        if len(clusters) == 0:
            raise Exception(
                'No clusters found, run adobo.clustering.generate(...) first.')
        for algo in clusters:
            if len(clustering) == 0 or algo in clustering:
                if verbose:
                    print('Running on the %s clustering' % algo)
                cl = clusters[algo]['membership']
                ret = X.sparse.to_dense().groupby(cl.values,
                                                  axis=1).aggregate(np.median)
                q = pd.Series(cl).value_counts()
                cl_remove = q[q < min_cluster_size].index
                ret = ret.iloc[:, np.logical_not(ret.columns.isin(cl_remove))]
                median_expr = ret
                obj.norm_data[k]['clusters'][algo]['median_expr'] = median_expr
                median_expr.index = median_expr.index.str.upper()
                s = np.sum(median_expr.index.str.match('^(.+)_.+'))
                if median_expr.shape[0] == s:
                    input_symbols = median_expr.index.str.extract(
                        '^(.+)_.+')[0]
                    input_symbols = input_symbols.str.upper()
                    median_expr.index = input_symbols
                # (1) centering is done by subtracting the column means
                # (2) scaling is done by dividing the (centered) by their standard
                # deviations
                scaled = sklearn_scale(median_expr, with_mean=True, axis=0)
                median_expr_Z = pd.DataFrame(scaled)
                median_expr_Z.index = median_expr.index
                median_expr_Z.columns = median_expr.columns
                ret = median_expr_Z.apply(func=_guess_cell_type, axis=0)
                # restructure
                bucket = []
                for i, kk in enumerate(ret):
                    _df = pd.DataFrame(kk)
                    _df['cluster'] = [i] * len(kk)
                    cols = _df.columns.tolist()
                    _df = _df[cols[-1:] + cols[:-1]]
                    bucket.append(_df)
                final_tbl = pd.concat(bucket)
                if final_tbl.shape[0] == 0:
                    raise Exception('Final table is empty. Check gene symbols \
of input data.')
                padj = p_adjust_bh(final_tbl['pvalue'])
                final_tbl['padj_BH'] = padj
                final_tbl.columns = [
                    'cluster', 'activity score', 'cell type', 'p-value',
                    'markers', 'adjusted p-value BH'
                ]
                # save the best scoring for each cluster
                res_pred = final_tbl.groupby('cluster').nth(0)
                _a = res_pred['adjusted p-value BH'] > 0.10
                res_pred.loc[_a, 'cell type'] = 'Unknown'
                key = 'cell_type_prediction'
                obj.norm_data[k]['clusters'][algo][key] = res_pred
                key = 'cell_type_prediction_full'
                obj.norm_data[k]['clusters'][algo][key] = final_tbl
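
A hedged usage sketch, assuming normalization and clustering have already been
run on the object; the nested result keys follow the assignments at the end of
the function above.

# `obj` is an adobo.data.dataset with clustering results
cell_type_predict(obj, min_cluster_size=10, verbose=True)

# best-scoring cell type per cluster for one normalization / clustering
norm = list(obj.norm_data.keys())[0]
algo = list(obj.norm_data[norm]['clusters'].keys())[0]
print(obj.norm_data[norm]['clusters'][algo]['cell_type_prediction'])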
Example #7
def jackstraw(obj,
              normalization=None,
              permutations=500,
              ncomp=None,
              subset_frac_genes=0.05,
              score_thr=1e-03,
              fdr=0.01,
              retx=True,
              verbose=False):
    """Determine the number of relevant PCA components.

    Notes
    -----
    Permutes a subset of the data matrix and compares PCA scores with
    the original. The final output is a p-value for each component
    generated using a Chi-sq test.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A dataset class object.
    normalization : `str`
        The name of the normalization to operate on. If this is empty
        or None then the function will be applied on all
        normalizations available.
    permutations : `int`
        Number of permutations to run. Default: 500
    ncomp : `int`
        Number of principal components to calculate significance
        for. If None, significance is calculated for all components
        previously saved from :py:func:`adobo.dr.pca`. Default: None
    subset_frac_genes : `float`
        Proportion of genes to use. Default: 0.05
    score_thr : `float`
        Threshold for significance. Default: 1e-03
    fdr : `float`
        Acceptable false discovery rate. Default: 0.01
    retx : `bool`
        In addition to modifying the object, also return the
        results. Default: True
    verbose : `bool`
        Be verbose. Default: False

    References
    ----------
    .. [1] Chung & Storey (2015) Statistical significance of variables
            driving systematic variation in high-dimensional data,
            Bioinformatics
            https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4325543/

    Returns
    -------
    pandas.DataFrame
        A genes-by-components data frame containing empirical
        p-values for the significance of every gene in each PC.
    pandas.DataFrame
        A data frame containing a single p-value for every PC,
        generated from a Chi^2 test. Can be used to select the number
        of components to include by examining the p-values.
    """
    start_time = time.time()
    if normalization is None or normalization == '':
        norm = list(obj.norm_data.keys())[-1]
    else:
        norm = normalization
    item = obj.norm_data[norm]
    try:
        loadings = np.abs(item['dr']['pca']['contr'])
    except KeyError:
        raise Exception('Run `adobo.dr.pca(...)` first.')
    X = item['data']
    if not ncomp:
        ncomp = loadings.shape[1]
    elif ncomp > loadings.shape[1]:
        raise Exception('"ncomp" is higher than the number of available \
components computed by adobo.dr.pca(...)')
    if verbose:
        print('computing for ncomp=%s' % ncomp)
    try:
        hvg = item['hvg']['genes']
    except KeyError:
        raise Exception('Run adobo.dr.find_hvg() first.')
    X = X[X.index.isin(hvg)]
    X_scaled = sklearn_scale(X.transpose(),
                             axis=0,
                             with_mean=True,
                             with_std=True).transpose()
    X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)

    perm_loadings = []
    for perm in np.arange(0, permutations):
        if verbose:
            print('random set %s ' % perm)
        rand_genes = sample(list(X.index),
                            round(X.shape[0] * subset_frac_genes))
        X_cp = X_scaled.copy()
        data_perm = X_cp.loc[rand_genes, :]
        # permute every row
        data_perm = [
            np.random.permutation(_col) for g, _col in data_perm.iterrows()
        ]
        data_perm = pd.DataFrame(np.array(data_perm),
                                 index=rand_genes,
                                 columns=X_cp.columns)
        # put permutated data back into the original data
        X_cp.loc[rand_genes, :] = data_perm
        comp, contr = irlb(X_cp, scale=False, ncomp=ncomp)
        pl = contr[contr.index.isin(rand_genes)].iloc[:, 0:ncomp]
        pl = np.abs(pl)
        perm_loadings.append(pl)
    perm_loadings = pd.concat(perm_loadings, axis=0, ignore_index=True)
    res = []
    for i, pc in perm_loadings.iloc[:, 0:ncomp].transpose().iterrows():
        real = loadings[i]
        emp_p = [np.sum(pc > val) / len(pc) for g, val in real.items()]
        res.append(pd.Series(emp_p, name=i))
    res = pd.concat(res, axis=1, ignore_index=True)
    n = [
        q1 + q2 for q1, q2 in zip(['PC'] *
                                  res.shape[1], res.columns.values.astype(str))
    ]
    res.columns = n
    # generate one p-value per component
    final = []
    for i, pc in res.transpose().iterrows():
        nsign_found = np.sum(pc < score_thr)
        # expecting a uniform distribution
        nsign_expected = np.floor(len(pc) * score_thr)
        ct = [[nsign_found, nsign_expected],
              [len(pc) - nsign_found,
               len(pc) - nsign_expected]]
        try:
            pv = chi2_contingency(np.array(ct))[1]
        except ValueError:
            pv = 1
        final.append([i, pv])
    final = pd.DataFrame(final)
    final['p.adj'] = p_adjust_bh(final[1])
    final.columns = ['PC', 'chi2_p', 'chi2_p_adj']
    final['significant'] = final.chi2_p_adj < fdr
    end_time = time.time()
    if verbose:
        print('Analysis took %.2f minutes' % ((end_time - start_time) / 60))
    obj.norm_data[norm]['dr']['jackstraw'] = {
        'score_mat': res,
        'results_by_comp': final
    }
    if retx:
        return res, final
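
A hedged usage sketch, assuming highly variable genes and PCA results are
already present on the object (the prerequisites named in the exceptions
above).

# `obj` is an adobo.data.dataset on which find_hvg() and pca() have been run;
# fewer permutations than the default keep the sketch fast
score_mat, by_comp = jackstraw(obj, permutations=100, verbose=True)

# empirical p-values per gene and component
print(score_mat.shape)

# number of components passing the FDR threshold
print(by_comp[by_comp.significant].shape[0])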
Example #8
def irlb(data_norm, scale=True, ncomp=75, var_weigh=True, seed=None):
    """Truncated SVD by implicitly restarted Lanczos bidiagonalization

    Notes
    -----
    The augmented implicitly restarted Lanczos bidiagonalization
    algorithm (IRLBA) finds a few approximate largest singular values
    and corresponding singular vectors using a method of Baglama and
    Reichel.

    The input is transposed internally so that cells become rows and
    genes become columns before the decomposition.

    Parameters
    ----------
    data_norm : :py:class:`pandas.DataFrame`
        A pandas data frame containing normalized gene expression data.
    scale : `bool`
        Scales input data prior to PCA. Default: True
    ncomp : `int`
        Number of components to return. Default: 75
    var_weigh : `bool`
        Whether to weight each component by its variance. Default: True
    seed : `int`
        For reproducibility. Default: None

    References
    ----------
    .. [1] Baglama et al (2005) Augmented Implicitly Restarted Lanczos
           Bidiagonalization Methods SIAM Journal on Scientific
           Computing
    .. [2] https://github.com/bwlewis/irlbpy

    Returns
    -------
    `pd.DataFrame`
        A :py:class:`pandas.DataFrame` containing the components
        (columns).
    `pd.DataFrame`
        A :py:class:`pandas.DataFrame` containing the contributions of
        every gene (rows).
    """
    inp = data_norm
    idx = inp.index
    cols = inp.columns
    inp = inp.transpose()
    if scale:
        inp = sklearn_scale(
            inp.sparse.to_dense(),  # cells as rows and genes as columns
            # over genes, i.e. features (columns)
            axis=0,
            with_mean=True,  # subtracting the column means
            with_std=True)  # scale the data to unit variance
        inp = pd.DataFrame(inp, columns=idx, index=cols)
    # cells should be rows and genes as columns
    lanc = irlbpy.lanczos(inp, nval=ncomp, maxit=1000, seed=seed)
    if var_weigh:
        # weighing by variance
        comp = np.dot(lanc.U, np.diag(lanc.s))
    else:
        comp = lanc.U
    comp = pd.DataFrame(comp, index=inp.index)
    # gene loadings
    contr = pd.DataFrame(lanc.V, index=inp.columns)
    return comp, contr
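
A hedged usage sketch mirroring how `jackstraw` above calls this function: a
dense, pre-scaled data frame with `scale=False`, which sidesteps the
`inp.sparse.to_dense()` call. The `irlbpy` package is assumed to be installed,
and the toy matrix is purely illustrative.

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale as sklearn_scale

# toy normalized matrix: 300 genes x 40 cells
rng = np.random.default_rng(1)
data = pd.DataFrame(rng.normal(size=(300, 40)),
                    index=['gene%d' % i for i in range(300)],
                    columns=['cell%d' % j for j in range(40)])

# scale genes (columns after transposing) by hand, then skip the sparse-aware
# scaling branch inside irlb
scaled = sklearn_scale(data.transpose(), axis=0).transpose()
scaled = pd.DataFrame(scaled, index=data.index, columns=data.columns)

comp, contr = irlb(scaled, scale=False, ncomp=10, seed=1)
print(comp.shape)   # (40, 10): component scores per cell
print(contr.shape)  # (300, 10): loadings per gene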