def diffexpressed(self, _subset, _factor, qval_limit, verbose=True):
    """Returns an array of probes that are differentially expressed
    according to the following method:
    1) Perform an independent t-test on the probe values for the specified
       subset against the probe values for the non-subset samples.
    2) To correct for multiple testing errors, calculate the
       Benjamini-Hochberg FDR q-value for each p-value.
    3) Filter out probes where the q-value is above the cutoff.

    Arguments:
    _subset: the subset to test for expressed genes
    _factor: the factor the subset belongs to
    qval_limit: the FDR q-value representing the upper limit for results
    """
    if not self.filtered():
        print("Warning: Finding differentially expressed genes on an "
              "unfiltered matrix may fail. Run dataset.filter().")
    matrix = self.matrix
    probes = self.probes
    samples = self.factors[_factor][_subset]
    inA = numpy.array([x in samples for x in self.header[2:]])
    A = numpy.transpose(matrix[:, inA])
    B = numpy.transpose(matrix[:, numpy.invert(inA)])
    t, pvals = stats.ttest_ind(A, B)
    rejected, qvals = multitest.fdrcorrection(pvals, alpha=qval_limit)
    # probe values are [probe_name, entrez_id] form (hence x[0])
    diffexp = [x[0] for i, x in enumerate(probes) if qvals[i] < qval_limit]
    if verbose:
        print("%d samples, %d differentially expressed genes in %s: %s"
              % (len([x for x in inA if x]), len(diffexp), _factor, _subset))
    return diffexp

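A minimal, self-contained sketch of the same t-test-plus-Benjamini-Hochberg pattern used by diffexpressed above; the synthetic matrix, group split, and 0.05 cutoff are invented for illustration:

import numpy as np
from scipy import stats
from statsmodels.stats import multitest

rng = np.random.default_rng(0)
matrix = rng.normal(size=(100, 20))   # 100 probes x 20 samples
matrix[:10, :10] += 2.0               # shift probes 0-9 in group A
inA = np.array([True] * 10 + [False] * 10)

A = matrix[:, inA].T                  # group A: samples x probes
B = matrix[:, ~inA].T                 # group B: samples x probes
t, pvals = stats.ttest_ind(A, B)      # one t-test per probe
rejected, qvals = multitest.fdrcorrection(pvals, alpha=0.05)
print("differentially expressed probes:", np.flatnonzero(qvals < 0.05))
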
def compute_regression(input_cancer_type):
    if input_cancer_type == "CCRCC":
        cancer = cptac.Ccrcc()
    elif input_cancer_type == "Endometrial":
        cancer = cptac.Endometrial()
    elif input_cancer_type == "LUAD":
        cancer = cptac.Luad()
    elif input_cancer_type == "HNSCC":
        cancer = cptac.Hnscc()
    elif input_cancer_type == "LSCC":
        cancer = cptac.Lscc()
    elif input_cancer_type == "PDAC":
        cancer = cptac.Pdac()
    df = dc.get_prot_trans_df(cancer)
    results = df.groupby('Gene').apply(regression)
    reg_df = pd.DataFrame(list(results))
    reg_df.index = results.index
    reg_df.reset_index(inplace=True)
    reg_df = reg_df.dropna()
    reg_df['interaction_FDR'] = ssm.fdrcorrection(reg_df['interaction_pval'])[1]
    reg_df['condition_FDR'] = ssm.fdrcorrection(reg_df['condition_pval'])[1]
    reg_df['intercept_FDR'] = ssm.fdrcorrection(reg_df['intercept_pval'])[1]
    reg_df['Cancer'] = [input_cancer_type] * len(reg_df)
    file_name = input_cancer_type + '_regressions.csv'
    reg_df.to_csv(file_name, index=False)

def test_multi_pvalcorrection():
    # test against R package multtest mt.rawp2adjp
    # because of sort this doesn't check correct sequence - TODO: rewrite DONE
    rmethods = {
        'rawp': (0, 'pval'),
        'Bonferroni': (1, 'b'),
        'Holm': (2, 'h'),
        'Hochberg': (3, 'sh'),
        'SidakSS': (4, 's'),
        'SidakSD': (5, 'hs'),
        'BH': (6, 'fdr_i'),
        'BY': (7, 'fdr_n')
    }
    for k, v in rmethods.items():
        if v[1] in ['b', 's', 'sh', 'hs', 'h', 'fdr_i', 'fdr_n']:
            #pvalscorr = np.sort(multipletests(pval0, alpha=0.1, method=v[1])[1])
            r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]
            pvalscorr = multipletests(pval0, alpha=0.1, method=v[1])[1][r_sortindex]
            assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15)

    pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
    pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)

def test_multi_pvalcorrection(self):
    # test against R package multtest mt.rawp2adjp
    res_multtest = self.res2
    pval0 = res_multtest[:, 0]

    pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
    pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)

def make_bed(qc_data, positive_dir, negative_dir, bed):
    bed = pd.read_table(bed)
    target_loc = pd.read_table(qc_data)
    target_loc = target_loc.loc[target_loc["Average_Total_Reads"] >= 30]
    target_loc = target_loc.loc[target_loc["Average_Number_Peaks"] >= 1.5]
    positive_samples = glob("{0}/*_MSIscore.xls".format(positive_dir))
    negative_samples = glob("{0}/*_MSIscore.xls".format(negative_dir))
    positive_s = []
    for s in positive_samples:
        data = pd.read_table(s)
        data = data.loc[data["MSID"].isin(target_loc["MSID"].tolist())]
        data = data[["MSID", "Normalized_Number_of_Peaks"]]
        data.columns = ["MSID", s.split("/")[-1].split("_")[0] + "_positive"]
        target_loc = pd.merge(target_loc, data, on="MSID", how="inner")
        positive_s.append(s.split("/")[-1].split("_")[0] + "_positive")
    negative_s = []
    for s in negative_samples:
        data = pd.read_table(s)
        data = data.loc[data["MSID"].isin(target_loc["MSID"].tolist())]
        data = data[["MSID", "Normalized_Number_of_Peaks"]]
        data.columns = ["MSID", s.split("/")[-1].split("_")[0] + "_negative"]
        target_loc = pd.merge(target_loc, data, on="MSID", how="inner")
        negative_s.append(s.split("/")[-1].split("_")[0] + "_negative")
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
    target_loc["pval"] = [ranksums(i, ii).pvalue
                          for i, ii in zip(target_loc[positive_s].values,
                                           target_loc[negative_s].values)]
    fdr = target_loc["pval"]
    reject, pvals_corrected = mul.fdrcorrection(fdr)
    target_loc['FDR_bh'] = pvals_corrected
    target_loc = target_loc.loc[target_loc["pval"] <= 0.01]
    target_loc.to_csv("peaks.txt", sep="\t", index=False)
    bed = bed.loc[bed["MSID"].isin(target_loc["MSID"].tolist())]
    bed.to_csv("bed.txt", sep="\t", index=False)

def do_FDR_correction(df):
    """
    Do FDR correction and add results to dataframe
    # code from the enrich module of the goenrich package by Jan Rudolph (jdrudolph)
    # https://github.com/jdrudolph/goenrich/blob/master/goenrich/enrich.py

    :param df <pd.DataFrame>: GO expression data
    :return <pd.DataFrame>: GO expression data w/ FDR results
    """
    _p = np.array(df['pval'])  # create array of len corresponding to p
    padj = _p.copy()
    rej = _p.copy()
    # boolean mask, True where not nan
    mask = ~np.isnan(_p)
    # remove nan entries
    p = _p[mask]
    _rej, _padj = fdrcorrection(p)
    # only change values that are not nan
    rej[mask] = _rej
    padj[mask] = _padj
    df['padj'] = padj
    df['rejected'] = rej
    return df

def FDR(p_values, fdr, total=None):
    """
    Runs false discovery rate correction for a table of statistics

    Parameters
    ----------
    p_values : ~pandas.DataFrame
        DataFrame with a 'pvalue' column
    fdr : float
        False discovery rate
    total : int
        Total number of tests (for multi-enrichment)

    Returns
    -------
    ~pandas.DataFrame
        Table containing entries that passed multiple hypothesis correction
    """
    if total is not None:
        pvals = p_values.pvalue.values.tolist() + [1] * (total - len(p_values))
    else:
        pvals = p_values.pvalue.values
    keep, qvals = fdrcorrection(pvals, alpha=fdr)
    result = p_values.copy()
    result["qvalue"] = qvals[:len(p_values)]
    result = result[keep[:len(p_values)]]
    return result.sort_values("qvalue")

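The total argument above pads the p-value list with 1s so the BH denominator counts every test performed, not just the surviving rows; a toy illustration (the three p-values and the total of 10 are made up):

import pandas as pd
from statsmodels.stats.multitest import fdrcorrection

p_values = pd.DataFrame({"pvalue": [0.001, 0.01, 0.04]})
# as if 10 tests were run and only these 3 rows survived earlier filtering
padded = p_values.pvalue.tolist() + [1] * (10 - len(p_values))
keep, qvals = fdrcorrection(padded, alpha=0.05)
print(qvals[:len(p_values)])  # more conservative than correcting the 3 p-values alone
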
def get_list_enrichment(gene_list: pd.Series, alpha: float = 0.05,
                        hide_rejected: bool = False) -> pd.DataFrame:
    print("{} genes in gene list {} are not part of the background".format(
        gene_list[~gene_list.isin(annotated.index)].shape[0], gene_list.name),
        file=sys.stderr)
    list_cluster_dedup = annotated[annotated.index.isin(gene_list)].drop_duplicates('match_id')
    list_cluster_size = list_cluster_dedup.groupby('#pattern name').size()

    def cluster_fisher(row):
        return fisher_exact(
            [[row[0], row[1] - row[0]],
             [list_cluster_dedup.shape[0] - row[0],
              ann_dedup.shape[0] - list_cluster_dedup.shape[0] - row[1] + row[0]]],
            alternative='greater')[1]

    p_values = pd.concat([list_cluster_size, cluster_size],
                         axis=1).fillna(0).apply(cluster_fisher, axis=1).sort_values()
    reject, adj_p = fdrcorrection(p_values, alpha=alpha, is_sorted=True)
    if hide_rejected:
        p_values = p_values[reject]
        adj_p = adj_p[reject]
    adj_p = pd.Series(adj_p, index=p_values.index)
    return pd.concat([p_values, adj_p], axis=1).rename(columns={0: 'p', 1: 'adj_p'})

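The 2x2 table built in cluster_fisher is (pattern hits inside the list, pattern hits outside it) over (non-hit genes inside the list, non-hit genes outside it); a standalone check with made-up counts:

from scipy.stats import fisher_exact

list_hits, list_size = 12, 100   # pattern hits inside the gene list
bg_hits, bg_size = 40, 1000      # pattern hits in the deduplicated background
table = [[list_hits, bg_hits - list_hits],
         [list_size - list_hits, bg_size - list_size - bg_hits + list_hits]]
odds, p = fisher_exact(table, alternative='greater')
print(p)  # one-sided p-value for over-representation
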
def pairwise_comp(data, cty_prop, prop_list, params, sig_level=0.05):
    """ Pairwise comparison of parameters between cell-types """
    diff_param_list = []
    p_val_list = []
    for param in params:
        for comb in combinations(prop_list, 2):
            cty_x, cty_y = comb
            paramx = data.loc[data[cty_prop] == cty_x, param].values
            paramy = data.loc[data[cty_prop] == cty_y, param].values
            _, p_val_x = mannwhitneyu(paramx, paramy, alternative='less')
            _, p_val_y = mannwhitneyu(paramy, paramx, alternative='less')
            comp_type = ('%s<%s' % (cty_x, cty_y) if p_val_x < p_val_y
                         else '%s<%s' % (cty_y, cty_x))
            p_val = min(p_val_x, p_val_y)
            sig_dict = {'Comp_type': comp_type, 'param': param}
            diff_param_list.append(sig_dict)
            p_val_list.append(p_val)
    # FDR correction for multiple comparison
    _, p_val_corrected = fdrcorrection(p_val_list)
    diff_param_df = pd.DataFrame(diff_param_list)
    diff_param_df['p_val'] = p_val_corrected
    diff_param_df['sig_level'] = diff_param_df['p_val'].apply(
        lambda x: man_utils.pval_to_sig(x))
    return diff_param_df

def dist2atlas_reg(bfp_path, ref_atlas, sub_files, reg_var, len_time=235):
    """ Perform regression stats based on square distance to atlas """
    print('dist2atlas_reg, assume that the data is normalized')
    num_vert = ref_atlas.shape[1]
    num_sub = len(sub_files)

    # Take absolute value of difference from the mean for the IQ measure
    reg_var = sp.absolute(reg_var - sp.mean(reg_var))

    diff = sp.zeros((num_vert, num_sub))

    # Compute distance to atlas for each subject
    for ind in tqdm(range(num_sub)):
        sub_data = spio.loadmat(sub_files[ind])['dtseries'].T
        sub_data, _, _ = normalizeData(sub_data[:len_time, :])
        Y2, _ = brainSync(X=ref_atlas, Y=sub_data)
        diff[:, ind] = sp.sum((Y2 - ref_atlas)**2, axis=0)

    corr_pval = sp.zeros(num_vert)
    for vrt in tqdm(range(num_vert)):
        _, corr_pval[vrt] = sp.stats.pearsonr(diff[vrt, :], reg_var)
    corr_pval[sp.isnan(corr_pval)] = .5

    lab = spio.loadmat(bfp_path + '/supp_data/USCBrain_grayord_labels.mat')
    labs = lab['labels'].squeeze()

    corr_pval_fdr = sp.zeros(num_vert)
    _, pv = fdrcorrection(corr_pval[labs > 0])
    corr_pval_fdr[labs > 0] = pv

    return corr_pval, corr_pval_fdr

def multitest_correction(dataset, ontology, annotation_files):
    annotation_years = (json.load(open(f)) for f in annotation_files)
    factor = 'disease state'
    db = get_connection(100)
    for annotations in annotation_years:
        year = annotations['meta']['year']
        # If we didn't shuffle the annotations, the shuffle level is 0
        shuffled = annotations['meta'].get('shuffled', 0.0)
        for subset in dataset.factors[factor]:
            # Python 3 prints; the trailing-comma Python 2 form becomes end=' '
            print("[%s]-[%s]-[%s]-[%s]-[%f]:"
                  % (dataset.id, year, ontology, subset, shuffled), end=' ')
            with closing(db.cursor()) as c:
                _id = (dataset.id, subset, year, ontology, shuffled)
                print("selecting pvals... ", end='')
                c.execute(select_pvals_sql, _id)
                # list of tuples [(_subid, pval), ...]
                results = list(c.fetchall())
                pvals = [x[1] for x in results]
                subids = [_id + (x[0],) for x in results]
                print("calculating FDR... ", end='')
                rejected, qvals = multitest.fdrcorrection(pvals)
                results = [(qvals[i],) + subids[i] for i, v in enumerate(qvals)]
            with closing(db.cursor()) as c:
                print("inserting %d qvals... " % len(results), end='')
                c.executemany(insert_qval_sql, results)
                db.commit()
            print("done.")
    db.close()

def compute_latency(self, visual_hfb, image_id, visual_channels):
    """Compute latency response of visual channels."""
    A_postim = self.crop_stim_hfb(visual_hfb, image_id, tmin=0, tmax=1.5)
    A_prestim = self.crop_stim_hfb(visual_hfb, image_id, tmin=-0.4, tmax=0)
    A_baseline = np.mean(A_prestim, axis=-1)  #No
    pval = [0] * A_postim.shape[2]
    tstat = [0] * A_postim.shape[2]
    latency_response = [0] * len(visual_channels)
    for i in range(0, len(visual_channels)):
        for t in range(0, np.size(A_postim, 2)):
            tstat[t] = spstats.wilcoxon(A_postim[:, i, t], A_baseline[:, i],
                                        zero_method=self.zero_method)
            pval[t] = tstat[t][1]
        # correct for multiple hypotheses
        reject, pval_correct = fdrcorrection(pval, alpha=self.alpha)
        # latency = first time point followed by 50 consecutive rejections
        for t in range(0, np.size(A_postim, 2)):
            if np.all(reject[t:t + 50]):
                latency_response[i] = t / 500 * 1e3
                break
    return latency_response

def pval_series(self, visual_hfb, image_id, visual_channels):
    """Return p-values of poststimulus visual responsivity along observations."""
    nchan = len(visual_channels)
    A_postim = self.crop_stim_hfb(visual_hfb, image_id, tmin=0, tmax=1.5)
    A_prestim = self.crop_stim_hfb(visual_hfb, image_id, tmin=-0.4, tmax=-0.1)
    A_baseline = np.mean(A_prestim, axis=-1)
    nobs = A_postim.shape[2]

    pval = [0] * nobs
    tstat = [0] * nobs
    reject = np.zeros((nchan, nobs))
    pval_correct = np.zeros((nchan, nobs))

    for i in range(0, nchan):
        for t in range(0, nobs):
            tstat[t] = spstats.wilcoxon(A_postim[:, i, t], A_baseline[:, i],
                                        zero_method=self.zero_method)
            pval[t] = tstat[t][1]
        # correct for multiple hypotheses
        reject[i, :], pval_correct[i, :] = fdrcorrection(pval, alpha=self.alpha)
    return reject, pval_correct

def multiple_wilcoxon_test(self, A_postim, A_prestim, alternative='two-sided'):
    """
    Wilcoxon test of the hypothesis of no difference between prestimulus
    and poststimulus amplitude, corrected for multiple hypothesis testing.
    ----------
    Parameters
    ----------
    A_postim: (..., times) array
        Poststimulus amplitude
    A_prestim: (..., times) array
        Prestimulus amplitude
    alpha: float
        significance threshold to reject the null
    From scipy.stats.wilcoxon:
    alternative: {"two-sided", "greater", "less"}, optional
    zero_method: {"pratt", "wilcox", "zsplit"}, optional
    """
    A_postim = np.mean(A_postim, axis=-1)
    A_prestim = np.mean(A_prestim, axis=-1)
    # Initialise uncorrected ("inflated") p-values
    nchans = A_postim.shape[1]
    pval = [0] * nchans
    tstat = [0] * nchans
    # Compute per-channel stats; Wilcoxon because of the non-normal distribution
    for i in range(0, nchans):
        tstat[i], pval[i] = spstats.wilcoxon(A_postim[:, i], A_prestim[:, i],
                                             zero_method=self.zero_method,
                                             alternative=self.alternative)
    # Correct for multiple testing
    reject, pval_correct = fdrcorrection(pval, alpha=self.alpha)
    w_test = reject, pval_correct, tstat
    return w_test

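The per-channel Wilcoxon plus a single shared BH pass is easy to reproduce on random data; a sketch with invented trial/channel counts, using zero_method='zsplit' as a stand-in for the instance attribute:

import numpy as np
from scipy import stats as spstats
from statsmodels.stats.multitest import fdrcorrection

rng = np.random.default_rng(1)
n_trials, n_chans = 40, 8
pre = rng.normal(size=(n_trials, n_chans))
post = rng.normal(size=(n_trials, n_chans))
post[:, :3] += 1.0                   # first 3 channels respond

pvals = [spstats.wilcoxon(post[:, i], pre[:, i], zero_method='zsplit')[1]
         for i in range(n_chans)]
reject, p_corr = fdrcorrection(pvals, alpha=0.05)
print(np.flatnonzero(reject))        # typically channels 0-2
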
def corr_perm_test(X_pairs, Y_pairs, reg_var, num_sub, nperm=1000):
    # X: nsub x vertices
    # Y: cognitive scores nsub x 1
    X, _, _ = normalizeData(X_pairs)
    num_pairs = X.shape[0]
    Y_pairs, _, _ = normalizeData(Y_pairs[:, None])
    rho_orig = np.sum(X * Y_pairs, axis=0)
    max_null = np.zeros(nperm)
    n_count = np.zeros(X.shape[1])
    print('Permutation testing')
    for ind in tqdm(range(nperm)):
        pairs, _ = gen_rand_pairs(num_sub=num_sub, num_pairs=num_pairs)
        pairs = np.array(pairs)
        Y = sp.square(reg_var[pairs[:, 0]] - reg_var[pairs[:, 1]])
        Y, _, _ = normalizeData(Y[:, None])
        rho_perm = np.sum(X * Y, axis=0)
        max_null[ind] = np.amax(rho_perm)
        n_count += np.float32(rho_perm >= rho_orig)
    pval_max = np.sum(rho_orig[:, None] <= max_null[None, :], axis=1) / nperm
    pval_perm = n_count / nperm
    _, pval_perm_fdr = fdrcorrection(pval_perm)
    return pval_max, pval_perm_fdr, pval_perm

def LinReg_corr(subTest_diff, subTest_varmain, subTest_varc1, subTest_varc2):
    print('regressing out 1st covariate')
    diff_resid1 = sp.zeros(subTest_diff.shape)
    numV = subTest_diff.shape[0]
    for nv in tqdm(range(numV)):
        diff_resid1[nv, :] = LinReg_resid(subTest_varc1, subTest_diff[nv, :])

    print('regressing out 2nd covariate')
    diff_resid2 = sp.zeros(subTest_diff.shape)
    for nv in tqdm(range(numV)):
        diff_resid2[nv, :] = LinReg_resid(subTest_varc2, diff_resid1[nv, :])

    print('computing correlation against main variable')
    rval = sp.zeros(numV)
    pval = sp.zeros(numV)
    for nv in tqdm(range(numV)):
        _, _, rval[nv], pval[nv], _ = sp.stats.linregress(
            subTest_varmain, diff_resid2[nv, :])

    a = spio.loadmat('supp_data/USCBrain_grayordinate_labels.mat')
    labs = a['labels'].squeeze()
    labs[sp.isnan(labs)] = 0
    pval_fdr = sp.zeros(numV)
    _, pv = fdrcorrection(pval[labs > 0])
    pval_fdr[labs > 0] = pv

    return rval, pval, pval_fdr

def fdrcorrection_matrix(arr, include_diagonal=True):
    """Apply FDR correction to matrix elements, optionally including the diagonal.

    Args:
        arr (np.array): Matrix containing p-values.
        include_diagonal (bool, optional): Whether diagonal elements should
            also be corrected. Defaults to True.

    Returns:
        Matrix containing corrected p-values.
    """
    n = arr.shape[0]
    k = 0 if include_diagonal else 1
    # Vectorize the upper triangle
    v_triu = arr[np.triu_indices(n, k=k)]
    # Restore the 2D matrix, mirroring the corrected values
    new = np.zeros((n, n))
    new[np.triu_indices(n, k=k)] = fdrcorrection(v_triu)[1]
    new = new + np.tril(new.T, k=-1)
    return new

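Since fdrcorrection expects a 1-D array, the helper above corrects the upper triangle once and mirrors it back; a quick check on a symmetric toy matrix of invented p-values:

import numpy as np

pmat = np.array([[0.001, 0.03, 0.20],
                 [0.03,  0.002, 0.04],
                 [0.20,  0.04,  0.005]])
print(fdrcorrection_matrix(pmat))  # symmetric matrix of BH-adjusted p-values
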
def calculatePvalue(df):
    # note: the df argument is immediately replaced by the module-level
    # `features` sequence
    df = pd.Series(data=features)
    df = df.value_counts().rename_axis('feature').reset_index(name='TestStatistic')
    df['pvalues'] = [st.binom_test(x, 20, 1/20, alternative='greater')
                     for x in df.TestStatistic]
    fdrs = correct.fdrcorrection(df.pvalues, method='negcorr')[1]
    df['FDR'] = fdrs
    return df

def test_enrichment(temp_K0_res, rel_kos_df, rel_p):
    ko_enrichment_res = pd.DataFrame(
        index=rel_p, columns=['pval', 'natural_pval', 'fdr', 'in_fraction_of_KOs'])
    for pathway in rel_p:
        _, p0 = mannwhitneyu(
            temp_K0_res.loc[rel_kos_df[rel_kos_df[pathway] == 1].index]['rank'],
            temp_K0_res.loc[rel_kos_df[rel_kos_df[pathway] == 0].index]['rank'],
            alternative='less')
        _, p1 = mannwhitneyu(
            temp_K0_res.loc[rel_kos_df[rel_kos_df[pathway] == 1].index]['rank'],
            temp_K0_res.loc[rel_kos_df[rel_kos_df[pathway] == 0].index]['rank'],
            alternative='greater')
        if p0 < p1:
            ko_enrichment_res.loc[pathway, 'natural_pval'] = p0
            ko_enrichment_res.loc[pathway, 'pval'] = -(np.log10(p0))
        else:
            ko_enrichment_res.loc[pathway, 'natural_pval'] = p1
            ko_enrichment_res.loc[pathway, 'pval'] = np.log10(p1)
        ko_enrichment_res.loc[pathway, 'in_fraction_of_KOs'] = \
            rel_kos_df[pathway].sum() / rel_kos_df.shape[0]
    ko_enrichment_res['fdr'] = fdrcorrection(ko_enrichment_res['natural_pval'])[1]
    ko_enrichment_res = ko_enrichment_res.sort_values('fdr')
    return ko_enrichment_res

def ANOVA_test(meta_data, df_otu_counts, Group_factor, Group_list, P_cut):
    '''Conduct a one-way ANOVA test to find significantly enriched OTUs in each group.
    dict_enriched_OTU_group: enriched OTUs in each group
    df_ANOVA_results_group: F, p, and p_adj for each enriched OTU within each group
    '''
    meta_data_filter = meta_data.loc[df_otu_counts.columns]
    dict_group = dict(zip(Group_list,
                          [list(meta_data_filter[meta_data_filter[Group_factor] == x].index)
                           for x in Group_list]))
    ANOVA_results = {}
    for group in Group_list:
        Others = list(set(df_otu_counts.columns) - set(dict_group[group]))
        ANOVA_results[group] = [f_oneway(list(df_otu_counts[dict_group[group]].loc[x]),
                                         list(df_otu_counts[Others].loc[x]))[0:2]
                                for x in df_otu_counts.index]
    df_ANOVA_results_group = {}
    for group in Group_list:
        df_ANOVA_results_group[group] = pd.DataFrame(ANOVA_results[group],
                                                     index=df_otu_counts.index,
                                                     columns=['F_ratio', 'p_value'])
        df_ANOVA_results_group[group]['P_adj'] = list(
            fdrcorrection(df_ANOVA_results_group[group]['p_value'])[1])
        df_ANOVA_results_group[group] = df_ANOVA_results_group[group].sort_values(
            by='F_ratio', ascending=False)
    # Target the enriched OTUs for each group of interest
    Enrich_ratio = pd.DataFrame(index=df_otu_counts.index)
    for group in Group_list:
        Other_group = list(set(df_otu_counts.columns) - set(dict_group[group]))
        Enrich_ratio[group] = (df_otu_counts[dict_group[group]].mean(axis=1)
                               - df_otu_counts[Other_group].mean(axis=1))
    dict_enriched_OTU_group = {}
    for group in Group_list:
        df_temp = df_ANOVA_results_group[group]
        pos_list = set(df_temp.index).intersection(
            Enrich_ratio[Enrich_ratio[group] > 0][group].index)
        # .loc needs a list; newer pandas rejects set indexers
        dict_enriched_OTU_group[group] = \
            df_temp.loc[list(pos_list)][df_temp.P_adj < P_cut].index
    return dict_enriched_OTU_group, df_ANOVA_results_group

def _transform(self, result):
    p = result.maps['p']
    _, p_corr = mc.fdrcorrection(p, alpha=self.q, method=self.method,
                                 is_sorted=False)
    corr_maps = {'p': p_corr}
    self._generate_secondary_maps(result, corr_maps)
    return corr_maps

def MW_U(data):
    """
    Mann-Whitney test corrected with fdrcorrection

    Arguments:
    ----------
    data: neuro_data + clinical_data

    Returns:
    -------
    pandas dataframe with the information related to the Mann-Whitney test.
    """
    patients_fa, controls_fa, feats = stats_data(data)
    rows = []
    for attr in feats:
        stat, p = mannwhitneyu(patients_fa[attr], controls_fa[attr])
        rows.append({'ROI': attr, 'U': stat, 'pvalue': p})
    # build the frame in one go; DataFrame.append was removed in pandas 2.0
    MannWhitney_tests = pd.DataFrame(rows, columns=['ROI', 'U', 'pvalue'])
    test, p_corr = fdrcorrection(MannWhitney_tests["pvalue"], alpha=0.05,
                                 method="indep", is_sorted=False)
    MannWhitney_tests["Rejected"] = test
    MannWhitney_tests["p_corr"] = p_corr
    return MannWhitney_tests

def stretchFinder(profile, l, m=10**4):
    """
    implementation of stretchFinder as described in:
    "Synonymous site conservation in the HIV-1 genome"
    :param profile: a vector of entropy values
    :param l: the window size
    :param m: number of permutations
    :return:
    """
    start_index = []
    p_values = []
    # create a per-profile distribution of averages, then sample
    avgs = np.array([])
    for j in range(m):
        new_profile = profile
        cur_avg = np.mean(new_profile[np.random.choice(len(new_profile),
                                                       size=l, replace=False)])
        avgs = np.insert(avgs, avgs.searchsorted(cur_avg), cur_avg)
    for i in tqdm(range(0, len(profile) - l)):
        # get the current window and its average value
        w = profile[i:i + l]
        avg = np.mean(w)
        # locate the average in the sorted null to get the p-value
        idx = np.searchsorted(avgs, avg)
        p_value = idx / m
        p_values.append(p_value)
        start_index.append(i)
    data = pd.DataFrame({'start': start_index, 'p_value': p_values, 'l': l})
    # correct for multiple tests
    data['corrected_pvalue'] = multi.fdrcorrection(data['p_value'])[1]
    return data

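The empirical p-value in stretchFinder is just the rank of the window average within the sorted permutation distribution, divided by m; in miniature (null distribution and window average invented):

import numpy as np

rng = np.random.default_rng(2)
avgs = np.sort(rng.normal(size=10_000))  # null distribution of window averages
window_avg = -1.5                        # a low (conserved) window average
p_value = np.searchsorted(avgs, window_avg) / len(avgs)
print(p_value)  # fraction of permuted averages below the observed one
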
def detect_visual(A_pr, A_po, HFB_db):
    M1 = np.mean(A_pr, axis=2)
    M2 = np.mean(A_po, axis=2)
    # Get rid of infinity
    M1[M1 == -inf] = 0
    M2[M2 == -inf] = 0
    # Compute uncorrected ("inflated") p-values
    pval = [0] * len(HFB_db.info['ch_names'])
    degf = [0] * len(HFB_db.info['ch_names'])
    tstat = [0] * len(HFB_db.info['ch_names'])
    for i in range(0, len(HFB_db.info['ch_names'])):
        # Wilcoxon signed-rank test given the non-normal distribution
        tstat[i], pval[i] = spstats.wilcoxon(M1[:, i], M2[:, i],
                                             zero_method='zsplit')
    # Correct for multiple testing
    reject, pval_correct = fdrcorrection(pval, alpha=0.05)
    # Compute effect size: Cohen's d
    MC1 = np.mean(M1, axis=0)
    MC2 = np.mean(M2, axis=0)
    std1 = np.std(M1, axis=0)
    std2 = np.std(M2, axis=0)
    n1 = M1.shape[1]
    n2 = M2.shape[1]
    std = np.sqrt(np.divide((n1 - 1) * std1**2 + (n2 - 1) * std2**2,
                            (n1 + n2 - 2)))
    cohen = np.divide(MC1 - MC2, std)
    # Return visual channels
    idx = np.where(reject == True)
    idx = idx[0]
    visual_chan = []
    visual_cohen = []
    for i in list(idx):
        visual_chan.append(HFB_db.info['ch_names'][i])
        visual_cohen.append(np.abs(cohen[i]))
    return reject, pval_correct, visual_chan

def multiple_testing_correction(ps, alpha=0.05, method='benjamini-hochberg',
                                **kwargs):
    """ correct pvalues for multiple testing and add corrected `q` value

    :param ps: list of pvalues
    :param alpha: significance level default : 0.05
    :param method: multiple testing correction method [bonferroni|benjamini-hochberg]
    :returns (q, rej): two lists of q-values and rejected nodes
    """
    _p = np.array(ps)
    q = _p.copy()
    rej = _p.copy()
    mask = ~np.isnan(_p)
    p = _p[mask]
    if method == 'bonferroni':
        # Bonferroni multiplies p by the number of tests (the original
        # divided, which is incorrect)
        q[mask] = p * len(p)
        rej[mask] = q[mask] < alpha
    elif method == 'benjamini-hochberg':
        _rej, _q = fdrcorrection(p, alpha)
        rej[mask] = _rej
        q[mask] = _q
    else:
        raise ValueError(method)
    return q, rej

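With NaNs in the input, only finite entries are corrected while NaN positions pass through untouched; a quick exercise of both branches on toy p-values (assumes the imports used by the snippet above):

import numpy as np
from statsmodels.stats.multitest import fdrcorrection

ps = [0.001, np.nan, 0.02, 0.30]
q_bh, rej_bh = multiple_testing_correction(ps, method='benjamini-hochberg')
q_bf, rej_bf = multiple_testing_correction(ps, method='bonferroni')
print(q_bh, q_bf)  # the NaN stays NaN in both outputs
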
def pg_ttest(data, group_col, group1, group2, fdr=0.05, value_col='MS signal [Log2]'):
    '''
    data: long data format with ProteinID as index, one column of protein
    levels, other columns of grouping.
    '''
    df = data.copy()
    proteins = data.index.unique()
    results = []
    for i in proteins:
        df_ttest = df.loc[i]
        x = df_ttest[df_ttest[group_col] == group1][value_col]
        y = df_ttest[df_ttest[group_col] == group2][value_col]
        difference = y.mean() - x.mean()
        result = pg.ttest(x=x, y=y)
        result['protein'] = i
        result['difference'] = difference
        results.append(result)
    # concat instead of the removed DataFrame.append (pandas >= 2.0)
    scores = pd.concat(results, ignore_index=True)
    scores = scores.assign(new_column=lambda x: -np.log10(scores['p-val']))
    scores = scores.rename({'new_column': '-Log pvalue'}, axis=1)
    # FDR correction
    reject, qvalue = multi.fdrcorrection(scores['p-val'], alpha=0.05,
                                         method='indep')
    scores['qvalue'] = qvalue
    scores['rejected'] = reject
    scores = scores.set_index('protein')
    return scores

def main(infile, val_col, sep, tail, outfile):
    """
    P values were estimated based on Z-transformed values using the standard
    normal distribution, and were further corrected for multiple testing using
    the Benjamini–Hochberg false discovery rate (FDR) method
    """
    if not sep:
        df = pd.read_csv(infile, sep='\t', dtype={val_col: float})
    else:
        df = pd.read_csv(infile, sep=sep, dtype={val_col: float})
    print(f'data loaded: {df.shape[0]} rows, {df.shape[1]} columns')
    df = df.dropna(subset=[val_col])
    print(f'data after dropna: {df.shape[0]} rows, {df.shape[1]} columns')
    mean = df[val_col].mean()
    std = df[val_col].std()
    print(f'mean: {mean}, std: {std}')
    df['Z-score'] = zscore(df[val_col].values)
    if tail == 'right':
        # Survival function (1 - cdf, but sf is sometimes more accurate).
        df['Pvalue'] = df['Z-score'].apply(lambda x: norm.sf(x))
    elif tail == 'left':
        # Cumulative distribution function.
        df['Pvalue'] = df['Z-score'].apply(lambda x: norm.cdf(x))
    # df['FDR'] = multicomp(df['Pvalue'].values, method='fdr_bh')[1]
    df['FDR'] = fdrcorrection(df['Pvalue'].values, alpha=0.05, method='indep',
                              is_sorted=False)[1]
    df.to_csv(outfile, sep='\t', index=False)

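The right-tail branch above amounts to norm.sf over the Z-scores followed by BH correction; a compact sketch on invented values:

import numpy as np
from scipy.stats import zscore, norm
from statsmodels.stats.multitest import fdrcorrection

vals = np.array([1.2, 3.5, -0.3, 2.8, 0.1])
z = zscore(vals)
pvals = norm.sf(z)                 # right tail: P(Z >= z)
fdr = fdrcorrection(pvals, alpha=0.05)[1]
print(np.round(fdr, 3))
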
def global_fdr(self, df, alpha_fdr):
    """Determine the global_padj values through FDR multiple testing
    correction over all gene - GO annotation pairs present in the output file.
    """
    global_stats = {
        'global_padj': [],
        'cilow_global_padj': [],
        'ciupp_global_padj': []
    }
    colloc = {
        'global_padj': 8,
        'cilow_global_padj': 4,
        'ciupp_global_padj': 4
    }
    ids = df[~df['pval_rep0'].isna()].index
    qvals = np.empty((len(ids), len(self.nvs)))
    qvals[:] = np.nan
    for i in range(len(self.nvs)):
        _, qvals[:, i] = fdrcorrection(df['pval_rep' + str(i)][ids],
                                       alpha=alpha_fdr, method='indep')
    for i in range(qvals.shape[0]):
        mean_padj, low_padj, upp_padj = self.log_stats(qvals[i, :])
        global_stats['global_padj'].append(mean_padj)
        global_stats['cilow_global_padj'].append(low_padj)
        global_stats['ciupp_global_padj'].append(upp_padj)
    for key in global_stats.keys():
        df.insert((len(df.columns) - colloc[key] - len(self.nvs)), key, np.nan)
        df.loc[ids, key] = global_stats[key]
    return df

def simper_mothur(fn, order, meta, tax, rng):
    with open(order, 'r') as f:  # 'rU' mode was removed in Python 3.11
        rows = []
        for r in csv.reader(f):
            rows.append(r)
    simp_order, cont = [], []
    for row in range(len(rows)):
        if row > 0 and row < 6:
            simp_order.append(rows[row][0])
            cont.append(float(rows[row][1]) * 100)
    with open(tax, 'r') as f:
        rows = []
        for row in csv.reader(f):
            rows.append(row)
    tax = []
    for a in range(len(simp_order)):
        for b in range(len(rows)):
            if simp_order[a] == rows[b][0]:
                phylo = [rows[b][2], rows[b][3], rows[b][4], rows[b][5], rows[b][6]]
                if phylo[4][-12:] != 'unclassified':
                    this_tax = r'$' + str(phylo[4]) + '$'
                else:
                    this_tax = phylo[4][:-13]
                tax.append(this_tax)
    print_otus = []
    for c in simp_order:
        totu = 'OTU'
        d = 0
        while d < len(c):
            if d > 2 and c[d] != '0':
                totu += c[d:]
                d = len(c)
            d += 1
        totu += '\n'
        print_otus.append(totu)
    with open(fn, 'r') as f:
        rows = []
        for row in csv.reader(f):
            rows.append(row)
    simp_rows = []
    for e in range(len(simp_order)):
        for f in range(len(rows)):
            if rows[f][0] == simp_order[e]:
                simp_rows.append(rows[f])
    krusk, krusk_p, treat_mean, treat_sd = [], [], [], []
    for g in simp_rows:
        krusk.append(float(g[-2]))
        krusk_p.append(float(g[-1]))
        this_mean, this_sd = [], []
        for h in range(rng):
            h += 1
            if h % 2 != 0:
                this_mean.append(float(g[h]) * 100)
            else:
                this_sd.append(float(g[h]) * 100)
        treat_mean.append(this_mean)
        treat_sd.append(this_sd)
    krusk_p = smm.fdrcorrection(krusk_p)[1]
    return krusk, krusk_p, treat_mean, treat_sd, cont, print_otus, tax

def full_fdr(p_val_n):
    s = p_val_n.shape
    #print(p_val_n.shape)
    temp = copy.deepcopy(p_val_n)
    # flatten, correct, then restore the original shape
    pval = np.ravel(temp)
    _, pval_fdr = mul.fdrcorrection(pval)
    pval_fdr_shape = pval_fdr.reshape(s)
    return pval_fdr_shape

def _fdrcorrect(pvals):
    """
    Perform FDR correction with nan's.
    """
    fdr = np.ones(pvals.shape[0])
    _, fdr[~np.isnan(pvals)] = fdrcorrection(pvals[~np.isnan(pvals)])
    return fdr

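Masking is needed because a NaN in the input propagates through the cumulative-minimum step of the BH procedure; with the helper above, NaN slots simply come back as 1.0 (toy input):

import numpy as np

pvals = np.array([0.01, np.nan, 0.04, 0.20])
print(_fdrcorrect(pvals))  # [0.03, 1.0, 0.06, 0.20]
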
def test_multi_pvalcorrection(self):
    # test against R package multtest mt.rawp2adjp
    res_multtest = self.res2
    pval0 = res_multtest[:, 0]

    for k, v in iteritems(rmethods):
        if v[1] in self.methods:
            reject, pvalscorr = multipletests(pval0, alpha=self.alpha,
                                              method=v[1])[:2]
            assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15)
            assert_equal(reject, pvalscorr <= self.alpha)

    pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
    pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)

def multiple_testing_correction(G, pvalues, alpha=0.05,
                                method='benjamini-hochberg', **kwargs):
    """ correct pvalues for multiple testing and add corrected `q` value

    :param alpha: significance level default : 0.05
    :param method: multiple testing correction method [bonferroni|benjamini-hochberg]
    """
    G.graph.update({'multiple-testing-correction': method, 'alpha': alpha})
    if method == 'bonferroni':
        n = len(pvalues.values())
        for term, p in pvalues.items():
            node = G.node[term]
            q = p * n
            node['q'] = q
            node['significant'] = q < 0.05
    elif method == 'benjamini-hochberg':
        terms, ps = zip(*pvalues.items())
        rejs, qs = fdrcorrection(ps, alpha)
        for term, q, rej in zip(terms, qs, rejs):
            node = G.node[term]
            node['q'] = q
            node['significant'] = rej
    else:
        raise ValueError(method)

def calculate_enrichment(self, genes, reference=None, evidence_codes=None,
                         aspect=None, use_fdr=True):
    """
    Parameters
    ----------
    genes : list
        list of genes
    reference : list
        reference list of species to calculate enrichment
    evidence_codes : list
        GO evidence codes
    use_fdr : bool
        Correct for multiple hypothesis testing

    Returns
    -------
    """
    # TODO check for alias for genes
    genes = set(genes)

    # TODO add aspects
    term_reference = self.go_to_gene.keys()
    aspect_dict = {
        'P': 'biological_process',
        'C': 'cellular_component',
        'F': 'molecular_function'
    }
    if aspect is None:
        term_reference = self.go_to_gene
        gene_reference = self.gene_to_go
    else:
        term_reference = dict()
        gene_reference = dict()

    if aspect is not None:
        for i in aspect:
            if i not in ['P', 'C', 'F']:
                print("Error: Aspects are only 'P', 'C', and 'F' \n")
                quit()
        for i in ['P', 'C', 'F']:
            if i in aspect:
                term_reference = None

    # TODO add reference
    if reference:
        # TODO check for reference alias
        reference = set(reference)
        reference.intersection_update(set(self.gene_to_go.keys()))
    else:
        reference = set(self.gene_to_go.keys())

    # TODO add evidence_codes
    terms = set()
    for i in genes:
        if i in self.gene_to_go:
            for t in self.gene_to_go[i]:
                terms.add(t)

    n_genes = len(genes)
    n_ref = float(len(reference))
    res = {}
    for term in terms:
        all_annotated_genes = set(self.go_to_gene[term])
        mapped_genes = genes.intersection(all_annotated_genes)
        n_mapped_genes = len(mapped_genes)
        if n_ref > len(all_annotated_genes):
            mapped_reference_genes = reference.intersection(all_annotated_genes)
        else:
            mapped_reference_genes = all_annotated_genes.intersection(reference)
        n_mapped_ref = len(mapped_reference_genes)
        prob = float(n_mapped_ref) / n_ref
        p_value = binom_test(n_mapped_genes, n_genes, prob, 'larger')
        res[term] = ([i for i in mapped_genes], p_value, n_mapped_ref)

    if use_fdr:
        res = sorted(res.items(), key=lambda x: x[1][1])
        fdr = fdrcorrection([p for _, (_, p, _) in res], is_sorted=True)
        values = fdr[1]
        res = dict([(index, (genes, p, ref))
                    for (index, (genes, _, ref)), p in zip(res, values)])
    return res

def enrichment(genes, popfile, fgname, generegulation=None, myfilter=[5, 2000],
               org='hsa', go=None, kegg=None, pvalue=0.1, anno=None, **kwargs):
    """Main program for enrichment analysis.

    Parameters
    ----------
    kwargs: other keyword arguments
    anno: annotation information for genes and terms
    pvalue: P-value threshold used in the HTML report
    kegg: KEGG folder
    go: GO file
    org: organism
    myfilter: filtering condition
    generegulation: up/down regulation status of the genes
    fgname: group name
    popfile: background database
    genes: list of differential genes
    """
    dbname = os.path.basename(popfile)[:-4]
    head = (
        'Term_ID\tTerm_description\tTerm_url\tListHit\tListTotal\tPopHit\tPopTotal'
        '\tFoldEnrichment\tGenes\tGeneSymbols\tP_value\t -log10(pvalue)'
    ).split('\t')
    # genes present in the background database
    allgenes = {n.split('\t')[0]: n.strip().split('\t')[1] for n in open(popfile)}
    # differential genes that are annotated in the database
    listgenes = {n: allgenes.get(n) for n in genes if allgenes.get(n)}
    if len(listgenes) == 0:
        logging.warn('No differential genes are annotated in the %s database' % dbname)
        try:
            dbid = kwargs['iddb']
            genes = [dbid.get(n) for n in genes if dbid.get(n)]
            listgenes = {n: allgenes.get(n) for n in genes if allgenes.get(n)}
            logging.info('%d genes converted successfully after ID conversion!'
                         % len(listgenes))
        except Exception as e:
            logging.warn('ID conversion failed, error: %s' % e)
            raise ValueError
    else:
        logging.info('%d differential genes are annotated in the %s database.'
                     % (len(listgenes), dbname))
    poptotal, listtotal = len(allgenes), len(listgenes)
    popterms, listterms = count(allgenes), count(listgenes)
    data = []
    for term in listterms:
        listhit, pophit = len(listterms[term]), len(popterms[term])
        if isinstance(myfilter, list):
            if pophit < min(myfilter) or pophit > max(myfilter):
                continue
        else:
            if pophit < myfilter:
                continue
        table = ([listhit, listtotal - listhit], [pophit, poptotal - pophit])
        oddsratio, p_value = fisher_exact(table, 'greater')
        gene = listterms[term]
        genesy = [anno.get(n, n) for n in gene]
        url = geturl(dbname, term, gene, generegulation)
        vv = -log10(p_value)
        line = (term, anno[term], url, listhit, listtotal, pophit, poptotal,
                oddsratio, ';'.join(gene), ';'.join(genesy), p_value, str(vv))
        data.append(line)
    if len(data) == 0:
        logging.debug('The P-value step produced no results!')
        raise ValueError('P-value calculation produced no results')
    df = pd.DataFrame(data, columns=head)
    df = df.sort_values(by='P_value')
    fdr = df['P_value']
    reject, pvals_corrected = mul.fdrcorrection(fdr)
    df['FDR_bh'] = pvals_corrected
    tar = kegg if "KEGG" in dbname else go
    df.to_csv(r'%s\%s\%s_%s.csv' % (tar, fgname, fgname, dbname), index=False)
    plot.plmyfig(df, dbname, fgname, tar, count=20)
    df = df[df['P_value'] <= pvalue]
    HTML.df2html(df, fgname, dbname, tar)