def svd(data_norm, scale=True, ncomp=75, only_sdev=False):
    """Principal component analysis via singular value decomposition

    Parameters
    ----------
    data_norm : :class:`pandas.DataFrame`
        A pandas data frame containing normalized gene expression data.
        Preferably this should be a subset of the normalized gene expression
        matrix containing highly variable genes.
    scale : `bool`
        Scales input data prior to PCA. Default: True
    ncomp : `int`
        Number of components to return. Default: 75
    only_sdev : `bool`
        Only return the standard deviation of the components. Default: False

    References
    ----------
    .. [1] https://tinyurl.com/yyt6df5x

    Returns
    -------
    `pandas.DataFrame`
        A :py:class:`pandas.DataFrame` containing the components (columns).
        Only if only_sdev=False.
    `pandas.DataFrame`
        A :py:class:`pandas.DataFrame` containing the contributions of every
        gene (rows). Only if only_sdev=False.
    `pandas.DataFrame`
        A :py:class:`pandas.DataFrame` containing standard deviations of
        components. Only if only_sdev is set to True.
    """
    inp = data_norm
    idx = inp.index
    cols = inp.columns
    inp = inp.transpose()
    if scale:
        inp = sklearn_scale(inp,             # cells as rows and genes as columns
                            axis=0,          # over genes, i.e. features (columns)
                            with_mean=True,  # subtracting the column means
                            with_std=True)   # scale the data to unit variance
        inp = pd.DataFrame(inp, columns=idx, index=cols)
    nfeatures = inp.shape[0]
    compute_uv = not only_sdev
    if only_sdev:
        s = scipy.linalg.svd(inp, compute_uv=compute_uv)
        sdev = s / np.sqrt(nfeatures - 1)
        return sdev
    # cells should be rows and genes as columns
    U, s, Vh = scipy.linalg.svd(inp, compute_uv=compute_uv)
    Vh = Vh.transpose()
    retx = inp.dot(Vh)
    retx = retx.iloc[:, 0:ncomp]
    comp = retx
    # gene loadings
    contr = pd.DataFrame(Vh[:, 0:ncomp], index=inp.columns)
    return comp, contr
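# Usage sketch (not part of the original module): a minimal, hypothetical
# example of calling svd() above on a toy normalized expression matrix
# (genes x cells). It assumes the module-level imports svd() itself relies on
# are available, e.g. numpy as np, pandas as pd, scipy.linalg, and
# sklearn.preprocessing.scale imported as sklearn_scale. The gene/cell labels
# and matrix values are made up for illustration.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
toy = pd.DataFrame(rng.lognormal(size=(200, 30)),
                   index=['gene%d' % i for i in range(200)],
                   columns=['cell%d' % i for i in range(30)])
comp, contr = svd(toy, scale=True, ncomp=10)
print(comp.shape)    # (30, 10): cells by components
print(contr.shape)   # (200, 10): gene loadings per component
sdev = svd(toy, only_sdev=True)   # standard deviations of all components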
def pca(self):
    # remove WHERE when table cleaned up to remove header rows
    statement = ("""SELECT transcript_id, TPM, sample_id FROM %s
                 where transcript_id != 'Transcript' """ % self.table)
    # fetch data
    df = self.getDataFrame(statement)
    # pivot dataframe so rows = genes, cols = samples, cells contain TPM
    pivot_df = df.pivot("transcript_id", "sample_id")["TPM"]
    # filter dataframe to get rid of genes where TPM == 0 across all samples
    filtered_df = pivot_df[pivot_df.sum(axis=1) > 0]
    # add a small pseudocount (0.1) and log transform the data
    logdf = np.log(filtered_df + 0.1)
    # scale dataframe so variance = 1 across rows
    logscaled = sklearn_scale(logdf, axis=1)
    # turn array back into a dataframe and add transcript ids back to the index
    logscaled_df = pd.DataFrame(logscaled)
    logscaled_df.index = list(logdf.index)
    # now do the PCA - can change n_components
    sklearn_pca = sklearnPCA(n_components=self.n_components)
    sklearn_pca.fit(logscaled_df)
    index = logdf.columns
    return sklearn_pca, index
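# Usage sketch (not part of the original tracker class): a minimal,
# standalone version of the same transform chain used by pca() above, applied
# to a toy long-format table instead of the SQL query result. The toy_df
# values are made up, and the keyword form of pivot() is used here for
# clarity; the method itself uses the positional
# pivot("transcript_id", "sample_id") call.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import scale as sklearn_scale

toy_df = pd.DataFrame({
    'transcript_id': ['t1', 't1', 't2', 't2', 't3', 't3'],
    'sample_id':     ['s1', 's2', 's1', 's2', 's1', 's2'],
    'TPM':           [5.0, 7.0, 0.0, 0.0, 2.0, 1.0],
})
pivot_df = toy_df.pivot(index='transcript_id', columns='sample_id',
                        values='TPM')
filtered_df = pivot_df[pivot_df.sum(axis=1) > 0]   # drop all-zero transcripts
logdf = np.log(filtered_df + 0.1)                  # pseudocount + log
logscaled = sklearn_scale(logdf, axis=1)           # unit variance per row
logscaled_df = pd.DataFrame(logscaled, index=logdf.index)
sklearn_pca = sklearnPCA(n_components=2).fit(logscaled_df)
print(sklearn_pca.explained_variance_ratio_)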
def impute(obj, filtered=True, res=0.5, drop_thre=0.5, nworkers='auto',
           verbose=True):
    """Impute dropouts using the method described in Li & Li (2018)
    Nature Communications

    Notes
    -----
    Dropouts are artifacts in scRNA-seq data. One method to alleviate the
    problem with dropouts is to perform imputation (i.e. replacing missing
    data points with predicted values). The present method uses a different
    procedure for subpopulation identification than the original paper.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    filtered : `bool`
        If data have been filtered using
        :func:`adobo.preproc.simple_filter`, run imputation on filtered
        data; otherwise run on the entire raw read count matrix.
        Default: True
    res : `float`
        Resolution parameter for the Leiden clustering; change to modify
        cluster resolution. Default: 0.5
    drop_thre : `float`
        Drop threshold. Default: 0.5
    nworkers : `int` or `{'auto'}`
        If a string, then the only accepted value is 'auto', and the number
        of worker processes will be the total number of detected physical
        cores. If an integer, it specifies the number of worker processes.
        Default: 'auto'
    verbose : `bool`
        Be verbose or not. Default: True

    References
    ----------
    .. [1] Li & Li (2018) An accurate and robust imputation method scImpute
           for single-cell RNA-seq data
           https://www.nature.com/articles/s41467-018-03405-7
    .. [2] https://github.com/Vivianstats/scImpute

    Returns
    -------
    Modifies the passed object.
    """
    ncores = psutil.cpu_count(logical=False)
    if type(nworkers) == str:
        if nworkers == 'auto':
            nworkers = ncores
        else:
            raise Exception('Invalid value for parameter "nworkers".')
    elif type(nworkers) == int:
        if nworkers > ncores:
            warning('"nworkers" is set to a number higher than the available \
number of physical cores on this machine (n=%s).' % ncores)
    if verbose:
        print('%s worker processes will be used' % nworkers)
    # contains normal and gamma probability density functions implemented in
    # C (a bit faster than using scipy.stats)
    time_start = time.time()
    for p in sys.path:
        pp = glob.glob('%s/pdf.*.so' % p)
        if len(pp) == 1:
            ext = ctypes.cdll.LoadLibrary(pp[0])
            ext.dgamma.argtypes = [
                npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS'),
                ctypes.c_int,
                ctypes.c_double,
                ctypes.c_double,
                npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS')
            ]
            ext.dnorm.argtypes = [
                npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS'),
                ctypes.c_int,
                ctypes.c_double,
                ctypes.c_double,
                npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS')
            ]
    # normalize
    raw = obj.count_data.copy()
    if filtered:
        # remove low quality cells
        remove = obj.meta_cells.status[obj.meta_cells.status != 'OK']
        raw = raw.drop(remove.index, axis=1)
        # remove uninformative genes (e.g. lowly expressed and ERCC)
        remove = obj.meta_genes.status[obj.meta_genes.status != 'OK']
        raw = raw.drop(remove.index, axis=0)
        if verbose:
            print('Running on the quality filtered data (dimensions %sx%s)'
                  % raw.shape)
    col_sums = np.array([np.sum(i[1]) for i in raw.transpose().iterrows()])
    raw = raw * (10**6 / col_sums)
    lnorm = np.log10(raw + 1.01)
    lnorm_imp = lnorm
    # estimate subpopulations
    hvg = seurat(lnorm, ngenes=1000)  # get hvg
    lnorm_hvg = lnorm[lnorm.index.isin(hvg)]
    d_scaled = sklearn_scale(lnorm_hvg.transpose(),  # cells as rows, genes as columns
                             axis=0,          # over genes, i.e. features (columns)
                             with_mean=True,  # subtracting the column means
                             with_std=True)   # scale the data to unit variance
    d_scaled = pd.DataFrame(d_scaled.transpose(), index=lnorm_hvg.index)
    comp, _ = irlb(d_scaled)
    # estimating subpopulations
    nn_idx = knn(comp)
    snn_graph = snn(nn_idx)
    cl = np.array(leiden(snn_graph, res))
    nclust = len(np.unique(cl))
    if verbose:
        print('going to work on %s clusters' % nclust)

    def weight(x, params):
        inp = x
        g_out = np.zeros(len(inp))
        n_out = np.zeros(len(inp))
        # takes scale as input (rate=1/scale)
        ext.dgamma(np.array(inp), len(inp), params[1], 1 / params[2], g_out)
        # SLOW (scipy.stats): dgamma.pdf(x, a=params[1], scale=1, loc=0)
        pz1 = params[0] * g_out
        ext.dnorm(np.array(inp), len(inp), params[3], params[4], n_out)
        # SLOW (scipy.stats): norm.pdf(x, params[3], params[4])
        pz2 = (1 - params[0]) * n_out
        pz = pz1 / (pz1 + pz2)
        pz[pz1 == 0] = 0
        return np.array([pz, 1 - pz])

    def update_gmm_pars(x, wt):
        tp_s = np.sum(wt)
        tp_t = np.sum(wt * x)
        tp_u = np.sum(wt * np.log(x))
        tp_v = -tp_u / tp_s - np.log(tp_s / tp_t)
        if tp_v <= 0:
            alpha = 20
        else:
            alpha0 = (3 - tp_v + np.sqrt((tp_v - 3)**2 + 24 * tp_v)) / 12 / tp_v
            if alpha0 >= 20:
                alpha = 20
            else:
                alpha = root(lambda x: np.log(x) - digamma(x) - tp_v,
                             0.9 * alpha0).x[0]
        beta = tp_s / tp_t * alpha
        return alpha, beta

    def dmix(x, pars):
        inp = x
        g_out = np.zeros(len(inp))
        n_out = np.zeros(len(inp))
        ext.dgamma(np.array(inp), len(inp), pars[1], 1 / pars[2], g_out)
        # dg = dgamma(a=pars[1], scale=1/pars[2], loc=0)  # dg.pdf(x)
        # dn = norm(pars[3], pars[4])  # dn.pdf(x)
        ext.dnorm(np.array(inp), len(inp), pars[3], pars[4], n_out)
        return pars[0] * g_out * 2 + (1 - pars[0]) * n_out

    def para_est(x):
        params = [0, 0.5, 1, 0, 0]
        params[0] = np.sum(x == np.log10(1.01)) / len(x)
        if params[0] == 0:
            params[0] = 0.01
        x_rm = x[x > np.log10(1.01)]
        params[3] = np.mean(x_rm)
        params[4] = np.std(x_rm)
        eps, iter_, loglik_old = 10, 0, 0
        while eps > 0.5:
            wt = weight(x, params)
            params[0] = np.sum(wt[0]) / len(wt[0])
            params[3] = np.sum(wt[1] * x) / np.sum(wt[1])
            params[4] = np.sqrt(np.sum(wt[1] * (x - params[3])**2) /
                                np.sum(wt[1]))
            params[1:3] = update_gmm_pars(x, wt[0])
            loglik = np.sum(np.log10(dmix(x, params)))
            eps = (loglik - loglik_old)**2
            loglik_old = loglik
            iter_ = iter_ + 1
            if iter_ > 100:
                break
        return params

    def get_par(mat, verbose):
        null_genes = np.abs(mat.sum(axis=1) -
                            np.log10(1.01) * mat.shape[1]) < 1e-10
        null_genes = null_genes[null_genes].index
        paramlist = []
        i = 0
        for g, k in mat.iterrows():
            if verbose:
                if (i % 100) == 0:
                    v = ('{:,}'.format(i), '{:,}'.format(mat.shape[0]))
                    s = 'estimating model parameters. finished with %s/%s genes' % v
                    print(s, end='\r')
            if g in null_genes:
                paramlist.append([np.nan] * 5)
            else:
                paramlist.append(para_est(k.values))
            i += 1
        if verbose:
            print('\nmodel parameter estimation has finished')
        return np.array(paramlist)

    def find_va_genes(mat, parlist):
        point = np.log10(1.01)
        is_na = [not np.any(i) for i in np.isnan(np.array(parlist))]
        valid_genes = np.logical_and(mat.sum(axis=1) > point * mat.shape[1],
                                     is_na)
        return valid_genes
        # mu = parlist[:, 3]
        # sgene1 = valid_genes.index[mu <= np.log10(1 + 1.01)]
        # dcheck1 = dgamma.pdf(mu+1, a=parlist[:, 1], scale=1/parlist[:, 2], loc=0)
        # dcheck2 = norm.pdf(mu+1, parlist[:, 3], parlist[:, 4])
        # sgene3 = valid_genes.index[np.logical_and(dcheck1 >= dcheck2, mu <= 1)]
        # return valid_genes[np.logical_not(np.logical_or(sgene1, sgene3))].index

    for cc in np.arange(0, nclust):
        if verbose:
            print('estimating dropout probability for cluster %s' % cc)
        lnorm_cc = lnorm.iloc[:, cl == cc]
        # estimate model parameters
        parlist = get_par(lnorm_cc, verbose)
        if verbose:
            print('searching for valid genes for cluster %s' % cc)
        valid_genes = find_va_genes(lnorm_cc, parlist)
        if verbose:
            print('%s genes are valid' % '{:,}'.format(len(valid_genes)))
        subcount = lnorm_cc.loc[valid_genes, :]
        subcount = subcount.reindex(valid_genes[valid_genes].index)
        Ic = subcount.shape[0]
        Jc = subcount.shape[1]
        if Jc == 1:
            continue
        parlist = parlist[valid_genes]
        idx = 0
        droprate = []
        for g, k in subcount.iterrows():
            wt = weight(k, parlist[idx])[0]
            idx += 1
            droprate.append(wt)
        droprate = np.array(droprate)
        mu = parlist[:, 3]
        mucheck = subcount.apply(lambda x: x > mu, axis=0)
        droprate[np.logical_and(mucheck, droprate > drop_thre)] = 0  # dropouts
        if verbose:
            print('running imputation for cluster %s' % cc)
        imputed = []
        pool = Pool(nworkers)

        def update_result(yimpute):
            imputed.append(yimpute)

        time_s = time.time()
        ids = np.arange(0, subcount.shape[1])
        if len(ids) < nworkers or len(ids) < 50:
            batch_size = len(ids)
        else:
            batch_size = round(len(ids) / nworkers)
        batch = 1
        while len(ids) > 0:
            ids_b = ids[0:batch_size]
            args = (ids_b, subcount, droprate, cc, Ic, Jc, drop_thre,
                    verbose, batch)
            r = pool.apply_async(_imputation_worker, args=args,
                                 callback=update_result)
            ids = ids[batch_size:]
            batch += 1
        pool.close()
        pool.join()
        if len(imputed) == 0:
            continue
        # sorting b/c cells are not returned from subprocesses in the
        # original order
        cellids = []
        d = []
        for item in imputed:
            cellids.append(item[0])
            d.append(item[1])
        cellids = np.concatenate(cellids)
        imputed = np.concatenate(d)
        time_e = time.time()
        if verbose:
            v = (cc, (time_e - time_s) / 60)
            print('imputation for cluster %s finished in %.2f minutes' % v)
        imputed = imputed.transpose()
        imputed = pd.DataFrame(imputed)
        imputed.columns = cellids
        imputed.index = valid_genes[valid_genes].index
        imputed = imputed.sort_index(axis=1)
        lnorm_imp.loc[valid_genes, lnorm_cc.columns] = imputed.to_numpy()
    # reverse normalisation
    lnorm_imp = 10**lnorm_imp - 1.01
    lnorm_imp = lnorm_imp * (col_sums / 10**6)
    lnorm_imp = round(lnorm_imp, 2)
    obj.imp_count_data = lnorm_imp
    time_end = time.time()
    if verbose:
        t = (time_end - time_start) / 60
        print('imputation finished in %.2f minutes. imputed data are present \
in the "imp_count_data" attribute.' % t)
    obj.set_assay(sys._getframe().f_code.co_name)
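# Usage sketch (not part of the original module): a hypothetical invocation
# of impute() above. It assumes an adobo.data.dataset object that has already
# been loaded and quality filtered; the file name below is made up for
# illustration, and the compiled pdf.*.so extension shipped with the package
# must be reachable from sys.path, since impute() loads it via ctypes.
import adobo

d = adobo.IO.load_from_file('toy_counts.mat', desc='toy dataset')
adobo.preproc.simple_filter(d)   # hypothetical call; see its own docs for thresholds
impute(d, filtered=True, res=0.5, drop_thre=0.5, nworkers=2, verbose=True)
print(d.imp_count_data.shape)    # imputed count matrix (genes x cells)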
def cell_cycle_predict(obj, clf, tr_features, name=(), verbose=False):
    """Predicts cell cycle phase

    Notes
    -----
    The classifier is trained on mouse data, so it should _only_ be used on
    mouse data unless it has been trained on something else. Gene identifiers
    must use Ensembl identifiers (prefixed with 'ENSMUSG'); pure gene symbols
    are not enough. Results are returned as a column in the data frame
    `meta_cells` of the passed object. Does not return probability scores.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    clf : `sklearn.linear_model.SGDClassifier`
        The classifier.
    tr_features : `list`
        Training features.
    name : `tuple`
        A tuple of normalizations to use. If it has length zero, all
        available normalizations will be used.
    verbose : `bool`
        Be verbose. Default: False

    Returns
    -------
    Modifies the passed object.
    """
    targets = {}
    if len(name) == 0 or name == '':
        targets = obj.norm_data
    else:
        targets[name] = obj.norm_data[name]
    # gene symbols of the training features
    symb = [i[1] for i in tr_features.str.split('_')]
    symb = pd.Series(symb)
    for i, k in enumerate(targets):
        if verbose:
            print('Running cell cycle prediction on %s' % k)
        item = targets[k]
        X = item['data']
        cols = X.columns
        if X.index[0].rfind('ENSMUSG') < 0:
            raise Exception('Gene identifiers must use ENSMUSG format. Are \
you sure this is mouse data?')
        X_g = X.index
        if re.search(r'ENSMUSG\d+\.\d+', X_g[0]):
            X_g = X_g.str.extract(r'^(.*)\.[0-9]+$', expand=False)
        if re.search('_ENSMUSG', X_g[0]):
            X_g = X_g.str.extract(r'^\S+?_(\S+)$', expand=False)
        X_found = X[X_g.isin(symb)]
        X_g = X_found.index
        if re.search(r'ENSMUSG\d+\.\d+', X_g[0]):
            X_g = X_g.str.extract(r'^(.*)\.[0-9]+$', expand=False)
        if re.search('_ENSMUSG', X_g[0]):
            X_g = X_g.str.extract(r'^\S+?_(\S+)$', expand=False)
        if len(X_found) == 0:
            raise Exception('No genes found.')
        X_found.index = X_g
        missing = symb[np.logical_not(symb.isin(X_g))]
        X_empty = pd.DataFrame(np.zeros((len(missing), X_found.shape[1])))
        X_empty.index = missing
        X_empty.columns = X_found.columns
        X = pd.concat([X_found, X_empty])
        X = X.reindex(symb)
        # scale
        X = X.transpose()                  # cells as rows and genes as columns
        X = sklearn_scale(X,
                          axis=0,          # over genes, i.e. features (columns)
                          with_mean=True,  # subtracting the column means
                          with_std=True)   # scale the data to unit variance
        pred = clf.predict(X)
        srs = pd.Series(pred, dtype='category', index=cols)
        obj.add_meta_data(axis='cells', key='cell_cycle', data=srs,
                          type_='cat')
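# Usage sketch (not part of the original module): a hypothetical call chain
# for cell_cycle_predict() above. It assumes `d` is a normalized
# adobo.data.dataset whose gene identifiers contain ENSMUSG accessions (as
# the function requires), and that a classifier plus its training features
# have been obtained, e.g. from cell_cycle_train() below.
clf, tr_features = cell_cycle_train(verbose=True)
cell_cycle_predict(d, clf, tr_features, verbose=True)
# per the docstring, predictions land in the meta_cells data frame
print(d.meta_cells['cell_cycle'].value_counts())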
def cell_cycle_train(verbose=False):
    """Trains a cell cycle classifier using Stochastic Gradient Descent with
    data from Buettner et al.

    Notes
    -----
    Genes are selected from GO:0007049.

    The classifier only needs to be trained once; on subsequent calls it is
    deserialized from disk.

    Parameters
    ----------
    verbose : `bool`
        Be verbose or not. Default: False

    References
    ----------
    .. [1] Buettner et al. (2015) Computational analysis of cell-to-cell
           heterogeneity in single-cell RNA-sequencing data reveals hidden
           subpopulations of cells. Nat Biotech.

    Returns
    -------
    `sklearn.linear_model.SGDClassifier`
        A trained classifier.
    `list`
        Containing training features.
    """
    path_pkg = re.sub('/_log.py', '', adobo._log.__file__)
    path_data = path_pkg + '/data/Buettner_2015.mat'
    path_gene_lengths = path_pkg + '/data/Buettner_2015.mat.lengths'
    path_cc_genes = path_pkg + '/data/GO_0007049.txt'  # cell cycle genes
    path_clf = path_pkg + '/data/cc_classifier.joblib'
    if os.path.exists(path_clf):
        clf, features = joblib.load(path_clf)
        if verbose:
            print('A trained classifier was found. \
Loading it from %s' % path_clf)
    else:
        desc = 'Buettner et al. (2015) doi:10.1038/nbt.3102'
        B = adobo.IO.load_from_file(path_data, desc=desc)
        adobo.preproc.detect_ercc_spikes(B, ercc_pattern='NA_ERCC-[0-9]+')
        adobo.normalize.norm(B, method='rpkm', gene_lengths=path_gene_lengths)
        cc_genes = pd.read_csv(path_cc_genes, sep='\t', header=None)
        symb = pd.Series([i[0] for i in B.norm.index.str.split('_')])
        norm_cc_mat = B.norm[symb.isin(cc_genes[1]).values]
        X = norm_cc_mat.transpose()        # cells as rows and genes as columns
        X = sklearn_scale(X,
                          axis=0,          # over genes, i.e. features (columns)
                          with_mean=True,  # subtracting the column means
                          with_std=True)   # scale the data to unit variance
        Y = [i[0] for i in norm_cc_mat.columns.str.split('_')]
        clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=5,
                            shuffle=True, verbose=verbose)
        clf.fit(X, Y)
        features = norm_cc_mat.index
        joblib.dump([clf, features], path_clf)
        # np.sum(clf.predict(X) != Y)
    return clf, features
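# Usage sketch (not part of the original module): a hypothetical illustration
# of the caching behaviour of cell_cycle_train() above. The first call fits
# the SGD classifier on the bundled Buettner et al. data and serializes it
# with joblib; later calls simply load the saved model from the package data
# directory.
clf, features = cell_cycle_train(verbose=True)   # trains, or loads if cached
clf2, _ = cell_cycle_train(verbose=True)         # now loaded from disk
print(len(features))                             # number of training genes
print(clf.classes_)                              # cell cycle phase labels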
def cell_type_predict(obj, name=(), clustering=(), min_cluster_size=10,
                      cell_type_markers=None, verbose=False):
    """Predicts cell types using the expression of marker genes

    Notes
    -----
    Gene identifiers should be in symbol form, not Ensembl identifiers, etc.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A data class object.
    name : `tuple`
        A tuple of normalizations to use. If it has length zero, all
        available normalizations will be used.
    clustering : `tuple`, optional
        Specifies the clustering outcomes to work on.
    min_cluster_size : `int`
        Minimum number of cells per cluster; clusters smaller than this are
        ignored. Default: 10
    cell_type_markers : `pandas.DataFrame`
        Source of gene markers used to define cell types. The default, None,
        means that PanglaoDB markers will be used. To use custom markers, set
        this to a pandas data frame where the first column is a gene and the
        second column is the name of the cell type (every cell type will have
        multiple rows). Default: None
    verbose : `bool`
        Be verbose or not. Default: False

    Returns
    -------
    Modifies the passed object.
    """
    targets = {}
    if len(name) == 0 or name == '':
        targets = obj.norm_data
    else:
        targets[name] = obj.norm_data[name]
    if isinstance(cell_type_markers, pd.DataFrame):
        # custom cell type markers were provided
        ma_ss = cell_type_markers
        ma_ss.columns = ['official gene symbol', 'cell type']
    else:
        ma = pd.read_csv('%s/data/markers.tsv' %
                         os.path.dirname(adobo.IO.__file__), sep='\t')
        # restrict to mouse
        ma = ma[ma.species.str.match('Mm')]
        markers = ma
        ui = ma.iloc[:, ma.columns == 'ubiquitousness index']
        ma = ma[np.array(ui).flatten() < 0.05]
        ma_ss = ma.iloc[:, ma.columns.isin(['official gene symbol',
                                            'cell type'])]
    marker_freq = ma_ss[ma_ss.columns[0]].value_counts()
    markers = ma_ss
    # reference symbols
    fn = '%s/data/mouse_gene_symbols.txt' % os.path.dirname(adobo.IO.__file__)
    mgs = pd.read_csv(fn, header=None)
    mgs = mgs[0].str.upper()
    markers = markers[markers[markers.columns[0]].isin(mgs)]
    dd = defaultdict(list)
    for item in markers.groupby('cell type'):
        dd[item[0]] = set(item[1][item[1].columns[0]])
    # down-weighting overlapping genes improves gene set analysis
    # Tarca AL, Draghici S, Bhatti G, Romero R; BMC Bioinformatics 2012 13:136
    s = mgs.unique()
    s_freqs = marker_freq[marker_freq.index.isin(s)]
    weights = 1 + np.sqrt(((max(marker_freq) - s_freqs) /
                           (max(marker_freq) - min(marker_freq))))

    def _guess_cell_type(x):
        rr = median_expr.loc[:, median_expr.columns == x.name].values.flatten()
        # genes expressed in this cell cluster
        genes_exp = set(x.index[rr > 0])
        # genes _not_ expressed in this cell cluster
        genes_not_exp = set(x.index[rr == 0])
        res = list()
        for ct in dd:
            s = dd[ct]
            x_ss = x[x.index.isin(s)]
            if len(x_ss) == 0:
                continue
            gene_weights = weights[weights.index.isin(x_ss.index)]
            gene_weights = pd.Series(gene_weights, x_ss.index)
            activity_score = sum(x_ss * gene_weights) / len(x_ss)**0.3
            # how many expressed genes are found in the geneset?
            ct_exp = len(genes_exp & s)
            # how many _non_ expressed genes are found in the geneset?
            ct_non_exp = len(genes_not_exp & s)
            # how many expressed genes are NOT found in the geneset?
            ct_exp_not_found = len(genes_exp - s)
            # how many _non_ expressed genes are NOT found in the geneset?
            not_exp_not_found_in_geneset = len(genes_not_exp - s)
            # one-sided Fisher's exact test
            contingency_tbl = [[ct_exp, ct_non_exp],
                               [ct_exp_not_found, not_exp_not_found_in_geneset]]
            odds_ratio, pval = fisher_exact(contingency_tbl,
                                            alternative='greater')
            markers_found = ','.join(list(genes_exp & s))
            if markers_found == '':
                markers_found = 'NA'
            res.append({'activity_score': activity_score,
                        'ct': ct,
                        'pvalue': pval,
                        'markers': markers_found})
        res = sorted(res, key=lambda k: k['activity_score'], reverse=True)
        return res

    for i, k in enumerate(targets):
        if verbose:
            print('Running cell type prediction on %s' % k)
        item = targets[k]
        X = item['data']
        clusters = item['clusters']
        if len(clusters) == 0:
            raise Exception('No clusters found, run \
adobo.clustering.generate(...) first.')
        for algo in clusters:
            if len(clustering) == 0 or algo in clustering:
                if verbose:
                    print('Running on the %s clustering' % algo)
                cl = clusters[algo]['membership']
                ret = X.sparse.to_dense().groupby(cl.values,
                                                  axis=1).aggregate(np.median)
                q = pd.Series(cl).value_counts()
                cl_remove = q[q < min_cluster_size].index
                ret = ret.iloc[:, np.logical_not(ret.columns.isin(cl_remove))]
                median_expr = ret
                obj.norm_data[k]['clusters'][algo]['median_expr'] = median_expr
                median_expr.index = median_expr.index.str.upper()
                s = np.sum(median_expr.index.str.match('^(.+)_.+'))
                if median_expr.shape[0] == s:
                    input_symbols = median_expr.index.str.extract('^(.+)_.+')[0]
                    input_symbols = input_symbols.str.upper()
                    median_expr.index = input_symbols
                # (1) centering is done by subtracting the column means
                # (2) scaling is done by dividing the (centered) values by
                #     their standard deviations
                scaled = sklearn_scale(median_expr, with_mean=True, axis=0)
                median_expr_Z = pd.DataFrame(scaled)
                median_expr_Z.index = median_expr.index
                median_expr_Z.columns = median_expr.columns
                ret = median_expr_Z.apply(func=_guess_cell_type, axis=0)
                # restructure
                bucket = []
                for i, kk in enumerate(ret):
                    _df = pd.DataFrame(kk)
                    _df['cluster'] = [i] * len(kk)
                    cols = _df.columns.tolist()
                    _df = _df[cols[-1:] + cols[:-1]]
                    bucket.append(_df)
                final_tbl = pd.concat(bucket)
                if final_tbl.shape[0] == 0:
                    raise Exception('Final table is empty. Check gene symbols \
of input data.')
                padj = p_adjust_bh(final_tbl['pvalue'])
                final_tbl['padj_BH'] = padj
                final_tbl.columns = ['cluster', 'activity score', 'cell type',
                                     'p-value', 'markers',
                                     'adjusted p-value BH']
                # save the best scoring cell type for each cluster
                res_pred = final_tbl.groupby('cluster').nth(0)
                _a = res_pred['adjusted p-value BH'] > 0.10
                res_pred.loc[_a, 'cell type'] = 'Unknown'
                key = 'cell_type_prediction'
                obj.norm_data[k]['clusters'][algo][key] = res_pred
                key = 'cell_type_prediction_full'
                obj.norm_data[k]['clusters'][algo][key] = final_tbl
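# Usage sketch (not part of the original module): a hypothetical call to
# cell_type_predict() above. It assumes `d` is an adobo.data.dataset that has
# been normalized and clustered with adobo.clustering.generate(...), which
# the function itself checks for. The loop below simply prints the
# per-cluster predictions the function stores under 'cell_type_prediction'.
cell_type_predict(d, min_cluster_size=10, verbose=True)
for norm_name, item in d.norm_data.items():
    for algo, clust in item['clusters'].items():
        pred = clust.get('cell_type_prediction')
        if pred is not None:
            print(norm_name, algo)
            print(pred[['cell type', 'adjusted p-value BH']])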
def jackstraw(obj, normalization=None, permutations=500, ncomp=None,
              subset_frac_genes=0.05, score_thr=1e-03, fdr=0.01, retx=True,
              verbose=False):
    """Determine the number of relevant PCA components.

    Notes
    -----
    Permutes a subset of the data matrix and compares PCA scores with the
    original. The final output is a p-value for each component, generated
    using a chi-squared test.

    Parameters
    ----------
    obj : :class:`adobo.data.dataset`
        A dataset class object.
    normalization : `str`
        The name of the normalization to operate on. If this is empty or
        None, the function will be applied on all available normalizations.
    permutations : `int`
        Number of permutations to run. Default: 500
    ncomp : `int`
        Number of principal components to calculate significance for. If
        None, significance is calculated for all components previously saved
        by :py:func:`adobo.dr.pca`. Default: None
    subset_frac_genes : `float`
        Proportion of genes to use. Default: 0.05
    score_thr : `float`
        Threshold for significance. Default: 1e-03
    fdr : `float`
        Acceptable false discovery rate. Default: 0.01
    retx : `bool`
        In addition to modifying the object, also return the results.
        Default: True
    verbose : `bool`
        Be verbose. Default: False

    References
    ----------
    .. [1] Chung & Storey (2015) Statistical significance of variables
           driving systematic variation in high-dimensional data,
           Bioinformatics
           https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4325543/

    Returns
    -------
    pandas.DataFrame
        A genes by principal components data frame containing empirical
        p-values for the significance of every gene of the PC.
    pandas.DataFrame
        A data frame containing a single p-value for every PC generated from
        a chi-squared test. Can be used to select the number of components to
        include by examining p-values.
    """
    start_time = time.time()
    if normalization is None or normalization == '':
        norm = list(obj.norm_data.keys())[-1]
    else:
        norm = normalization
    item = obj.norm_data[norm]
    try:
        loadings = np.abs(item['dr']['pca']['contr'])
    except KeyError:
        raise Exception('Run `adobo.dr.pca(...)` first.')
    X = item['data']
    if not ncomp:
        ncomp = loadings.shape[1]
    elif ncomp > loadings.shape[1]:
        raise Exception('"ncomp" is higher than the number of available \
components computed by adobo.dr.pca(...)')
    if verbose:
        print('computing for ncomp=%s' % ncomp)
    try:
        hvg = item['hvg']['genes']
    except KeyError:
        raise Exception('Run adobo.dr.find_hvg() first.')
    X = X[X.index.isin(hvg)]
    X_scaled = sklearn_scale(X.transpose(), axis=0, with_mean=True,
                             with_std=True).transpose()
    X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)
    perm_loadings = []
    for perm in np.arange(0, permutations):
        if verbose:
            print('random set %s ' % perm)
        rand_genes = sample(list(X.index),
                            round(X.shape[0] * subset_frac_genes))
        X_cp = X_scaled.copy()
        data_perm = X_cp.loc[rand_genes, :]
        # permute every row
        data_perm = [np.random.permutation(_col)
                     for g, _col in data_perm.iterrows()]
        data_perm = pd.DataFrame(np.array(data_perm), index=rand_genes,
                                 columns=X_cp.columns)
        # put the permuted data back into the original data
        X_cp.loc[rand_genes, :] = data_perm
        comp, contr = irlb(X_cp, scale=False, ncomp=ncomp)
        pl = contr[contr.index.isin(rand_genes)].iloc[:, 0:ncomp]
        pl = np.abs(pl)
        perm_loadings.append(pl)
    perm_loadings = pd.concat(perm_loadings, axis=0, ignore_index=True)
    res = []
    for i, pc in perm_loadings.iloc[:, 0:ncomp].transpose().iterrows():
        real = loadings[i]
        emp_p = [np.sum(pc > val) / len(pc) for g, val in real.iteritems()]
        res.append(pd.Series(emp_p, name=i))
    res = pd.concat(res, axis=1, ignore_index=True)
    n = [q1 + q2 for q1, q2 in zip(['PC'] * res.shape[1],
                                   res.columns.values.astype(str))]
    res.columns = n
    # generate one p-value per component
    final = []
    for i, pc in res.transpose().iterrows():
        nsign_found = np.sum(pc < score_thr)
        # expecting a uniform distribution
        nsign_expected = np.floor(len(pc) * score_thr)
        ct = [[nsign_found, nsign_expected],
              [len(pc) - nsign_found, len(pc) - nsign_expected]]
        try:
            pv = chi2_contingency(np.array(ct))[1]
        except ValueError:
            pv = 1
        final.append([i, pv])
    final = pd.DataFrame(final)
    final['p.adj'] = p_adjust_bh(final[1])
    final.columns = ['PC', 'chi2_p', 'chi2_p_adj']
    final['significant'] = final.chi2_p_adj < fdr
    end_time = time.time()
    if verbose:
        print('Analysis took %.2f minutes' % ((end_time - start_time) / 60))
    obj.norm_data[norm]['dr']['jackstraw'] = {'score_mat': res,
                                              'results_by_comp': final}
    if retx:
        return res, final
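# Usage sketch (not part of the original module): a hypothetical call to
# jackstraw() above. It assumes a dataset `d` on which adobo.dr.find_hvg()
# and adobo.dr.pca() have already been run, since the function raises an
# exception otherwise. A reduced number of permutations is used purely to
# keep the example quick; more permutations give more stable empirical
# p-values.
score_mat, by_comp = jackstraw(d, permutations=100, ncomp=20, retx=True,
                               verbose=False)
# number of components passing the FDR threshold
print(by_comp[by_comp.significant].shape[0])
print(by_comp.head())    # per-component chi-squared p-values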
def irlb(data_norm, scale=True, ncomp=75, var_weigh=True, seed=None):
    """Truncated SVD by implicitly restarted Lanczos bidiagonalization

    Notes
    -----
    The augmented implicitly restarted Lanczos bidiagonalization algorithm
    (IRLBA) finds a few approximate largest singular values and corresponding
    singular vectors using a method of Baglama and Reichel. Cells should be
    rows and genes columns.

    Parameters
    ----------
    data_norm : :py:class:`pandas.DataFrame`
        A pandas data frame containing normalized gene expression data.
    scale : `bool`
        Scales input data prior to PCA. Default: True
    ncomp : `int`
        Number of components to return. Default: 75
    var_weigh : `bool`
        Weigh by the variance of each component. Default: True
    seed : `int`
        For reproducibility. Default: None

    References
    ----------
    .. [1] Baglama et al (2005) Augmented Implicitly Restarted Lanczos
           Bidiagonalization Methods, SIAM Journal on Scientific Computing
    .. [2] https://github.com/bwlewis/irlbpy

    Returns
    -------
    `pandas.DataFrame`
        A :py:class:`pandas.DataFrame` containing the components (columns).
    `pandas.DataFrame`
        A :py:class:`pandas.DataFrame` containing the contributions of every
        gene (rows).
    """
    inp = data_norm
    idx = inp.index
    cols = inp.columns
    inp = inp.transpose()
    if scale:
        inp = sklearn_scale(inp.sparse.to_dense(),  # cells as rows, genes as columns
                            axis=0,          # over genes, i.e. features (columns)
                            with_mean=True,  # subtracting the column means
                            with_std=True)   # scale the data to unit variance
        inp = pd.DataFrame(inp, columns=idx, index=cols)
    # cells should be rows and genes as columns
    lanc = irlbpy.lanczos(inp, nval=ncomp, maxit=1000, seed=seed)
    if var_weigh:
        # weighing by variance
        comp = np.dot(lanc.U, np.diag(lanc.s))
    else:
        comp = lanc.U
    comp = pd.DataFrame(comp, index=inp.index)
    # gene loadings
    contr = pd.DataFrame(lanc.V, index=inp.columns)
    return comp, contr
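# Usage sketch (not part of the original module): a hypothetical comparison
# of irlb() above with the exact svd() further up, on the same toy matrix.
# irlbpy (the fork the module imports) and the other module-level imports are
# assumed to be available. scale=False is used here because irlb() with
# scale=True expects a sparse-backed DataFrame (it calls
# inp.sparse.to_dense()); the toy data below are dense and pre-scaled instead.
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale as sklearn_scale

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.lognormal(size=(500, 60)),
                   index=['gene%d' % i for i in range(500)],
                   columns=['cell%d' % i for i in range(60)])
# scale genes manually, mirroring what irlb()/svd() do internally
toy_scaled = pd.DataFrame(sklearn_scale(toy.transpose(), axis=0),
                          columns=toy.index, index=toy.columns).transpose()
comp, contr = irlb(toy_scaled, scale=False, ncomp=15, seed=1)
print(comp.shape)    # (60, 15): cells by components
print(contr.shape)   # (500, 15): gene loadings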