def test_multi_pvalcorrection():
    """Compare corrected p-values with the R ``multtest`` ``mt.rawp2adjp`` reference.

    ``multipletests`` is checked element by element (after re-indexing), while
    the two ``fdrcorrection0`` variants are only compared after sorting.
    """
    # R method name -> (column of res_multtest, multipletests method name)
    rmethods = {
        "rawp": (0, "pval"),
        "Bonferroni": (1, "b"),
        "Holm": (2, "h"),
        "Hochberg": (3, "sh"),
        "SidakSS": (4, "s"),
        "SidakSD": (5, "hs"),
        "BH": (6, "fdr_i"),
        "BY": (7, "fdr_n"),
    }
    checked_methods = ["b", "s", "sh", "hs", "h", "fdr_i", "fdr_n"]
    # Index applied to the corrected values before comparing them with the
    # reference rows (presumably the row order of the R output -- verify).
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]

    for column, method in rmethods.values():
        if method not in checked_methods:
            continue  # the raw p-value column has nothing to correct
        corrected = multipletests(pval0, alpha=0.1, method=method)[1]
        assert_almost_equal(corrected[r_sortindex], res_multtest[:, column], 15)

    for method, column in (("n", 7), ("i", 6)):
        pvalscorr = np.sort(fdrcorrection0(pval0, method=method)[1])
        assert_almost_equal(pvalscorr, res_multtest[:, column], 15)
def run_script(data_file, options):
    """Print FDR-corrected over/under results for every gene in *data_file*.

    Each line is split on whitespace and fed to the project's ``out_line``
    record.  Lines whose first field is ``---`` (header rows), blank lines, and
    lines whose first field contains ``|`` or ``;RP`` are skipped.  The last
    element of each record's ``over`` / ``under`` entry is used as the raw
    p-value for ``mpt.fdrcorrection0`` (alpha 0.05).

    One line is printed per gene and direction:
    ``gene obs 'over'|'under' rejected corrected_p first_element``.

    :param data_file: path of the text file to read
    :param options: unused; kept so existing callers keep working
    """
    k = 0  # number of accepted data lines so far
    genes, overs, unders, obs = [], [], [], {}
    pm_under, pm_over = [], []

    # The context manager closes the file even if parsing fails part-way.
    with open(data_file) as handle:
        for raw_line in handle:
            line = raw_line.split()
            if not line or line[0] == '---':
                continue
            if k == 0:
                # (Re)create the record until the first line is accepted.
                dex = out_line(line)
            if '|' in line[0] or ';RP' in line[0]:
                continue
            dex.add_line(line)
            genes.append(dex.gene)
            obs[dex.gene] = dex.OR
            overs.append(dex.over)
            unders.append(dex.under)
            pm_under.append(dex.under[-1])
            pm_over.append(dex.over[-1])
            k += 1

    over_fdr_bool, over_fdr_res = mpt.fdrcorrection0(pm_over, alpha=0.05)
    under_fdr_bool, under_fdr_res = mpt.fdrcorrection0(pm_under, alpha=0.05)

    reports = (
        ('over', over_fdr_bool, over_fdr_res, overs),
        ('under', under_fdr_bool, under_fdr_res, unders),
    )
    for direction, flags, corrected, records in reports:
        for g, b, f, d in zip(genes, flags, corrected, records):
            # Space-joined str() values reproduce the old print-statement
            # output on both Python 2 and Python 3.
            print(" ".join(str(v) for v in (g, obs[g], direction, b, f, d[0])))
def tabulate_week_to_week0_paired_stats(df, sample_type, metric,
                                        test_fn=scipy.stats.wilcoxon):
    """Tabulate per-week paired statistics against week 0 for the autism group.

    For each distinct week, the rows of week 0 and that week are grouped by
    ``SubjectID``, the within-subject difference of *metric* is taken, and
    *test_fn* is applied to the non-missing differences.

    :param df: sample metadata frame, filtered here with ``filter_sample_md``
    :param sample_type: value required in the ``SampleType`` column
    :param metric: name of the column to compare
    :param test_fn: callable returning ``(statistic, p_value)`` for a vector
        of paired differences; an alternative is
        ``partial(scipy.stats.ttest_1samp, popmean=0)``
    :return: DataFrame indexed by week with columns ``n``, *metric* (median
        difference), ``test-statistic``, ``p-value`` and ``q-value``
    """
    asd_data = filter_sample_md(df, [('SampleType', sample_type),
                                     ('Group', 'autism')])
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()

    rows = []
    for week in weeks:
        in_pair = np.logical_or(asd_data['week'] == 0,
                                asd_data['week'] == week)
        per_subject = asd_data[in_pair].groupby('SubjectID')
        paired_diffs = per_subject.diff()[metric].dropna()
        statistic, p_value = test_fn(paired_diffs)
        rows.append((len(paired_diffs), np.median(paired_diffs),
                     statistic, p_value))

    result = pd.DataFrame(rows,
                          index=pd.Index(weeks, name='week'),
                          columns=['n', metric, 'test-statistic', 'p-value'])

    # Only finite p-values are corrected; the others keep a NaN q-value.
    # Masking is motivated by (and taken from) this issue:
    # https://github.com/statsmodels/statsmodels/issues/2899
    pvals = result['p-value'].values
    finite = np.isfinite(pvals)
    qvals = np.full(len(pvals), np.nan)
    qvals[finite] = fdrcorrection0(pvals[finite])[1]
    result['q-value'] = qvals
    return result
def findBestKnockdownUsingMarginalLinearRegression(X, y, knockdownZeros):
    """Pick the feature whose knockdown gives the largest predicted gain.

    Each column of ``X`` is regressed on its own against ``y[:, 0]``.  The
    objective for a feature is ``slope * (knockdownZeros[feat] - mean(column))``.
    Features whose regression p-value is not rejected by
    ``mcp.fdrcorrection0`` at alpha 0.05 get an objective of 0.

    :param X: 2-D array, one column per feature
    :param y: 2-D array; only column 0 is used
    :param knockdownZeros: per-feature value indexed by feature number
    :return: ``(best_knockdown, best_objval, obj_vals)``; ``best_knockdown`` is
        ``None`` when no rejected feature has a positive objective
    """
    n_features = X.shape[1]
    obj_vals = []
    p_vals = []
    for feat in range(n_features):
        column = X[:, feat]
        slope, _intercept, _r_value, p_value, _std_err = stats.linregress(
            column, y[:, 0])
        obj_vals.append(slope * (knockdownZeros[feat] - np.mean(column)))
        p_vals.append(p_value)

    rejected, _p_vals_corrected = mcp.fdrcorrection0(p_vals, alpha=0.05)

    best_objval = 0.0
    best_knockdown = None
    for feat, is_rejected in enumerate(rejected):
        if not is_rejected:
            # not significant after correction: zero the objective
            obj_vals[feat] = 0
        elif obj_vals[feat] > best_objval:
            best_objval = obj_vals[feat]
            best_knockdown = feat

    if best_knockdown is None:
        print("warning (indep. marg. regr.): no good knockdown identified")
    return (best_knockdown, best_objval, obj_vals)
def adjust_r(r, n=3539, **fdr_params):
    """Convert correlation scores to FDR-adjusted p-values.

    ``r`` is turned into ``t**2 = r**2 * df / ((1 - r) * (1 + r))`` with
    ``df = n - 2``, and the p-value is the regularized incomplete beta
    function ``I_x(df / 2, 1 / 2)`` at ``x = df / (df + t**2)``.

    :param r: correlation value(s); scalar or NumPy array
    :param n: number of observations each correlation is based on
    :param fdr_params: extra keyword arguments forwarded to
        ``fdrcorrection0`` (e.g. ``alpha``, ``method``).  They used to be
        accepted but silently ignored.
    :return: the ``fdrcorrection0`` result, ``(rejected, corrected_pvalues)``
    """
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # function it wrapped.
    from scipy.special import betainc
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0

    df = n - 2
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return fdrcorrection0(prob, **fdr_params)
def p_adj_map_from_scores(r, n=3539):
    '''Creates a p map with adjusted p values from scores (correlations).

    ``r`` is converted to ``t**2 = r**2 * df / ((1 - r) * (1 + r))`` with
    ``df = n - 2``; the p-value is the regularized incomplete beta function
    ``I_x(df / 2, 1 / 2)`` at ``x = df / (df + t**2)``.

    :param r: correlation value(s); scalar or NumPy array
    :param n: number of observations each correlation is based on
    :return: the ``fdrcorrection0`` result, ``(rejected, corrected_pvalues)``
    '''
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # function it wrapped.
    from scipy.special import betainc

    df = n - 2
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return fdrcorrection0(prob)
def main(algo_sample=None, dataset_sample=None,
         tsv_file_name=os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr",
                                    "MAX", "emp_diff_{}_{}_md.tsv")):
    """Report the terms that pass BH correction by HG and by empirical p-value.

    Rows with ``5 < n_genes < 500`` are kept.  ``hg_pval`` values are mapped
    back to p-values with ``10 ** (-x)`` and padded with p = 1 entries up to
    7435 tests before correction.  Empirical p-values equal to 0 are replaced
    by ``1 / 1000``.

    :param algo_sample: second value substituted into *tsv_file_name*
    :param dataset_sample: first value substituted into *tsv_file_name*
    :param tsv_file_name: path template with two ``{}`` placeholders
    :return: ``(sig_hg_genes, sig_emp_genes)`` DataFrames; either may be empty
    """
    total_terms = 7435  # total number of tests assumed for the HG correction

    output_md = pd.read_csv(tsv_file_name.format(dataset_sample, algo_sample),
                            sep='\t', index_col=0).dropna()
    output_md = output_md.rename(columns={"filtered_pval": "hg_pval"})
    size_ok = np.logical_and.reduce([output_md["n_genes"].values > 5,
                                     output_md["n_genes"].values < 500])
    filtered_genes = output_md.loc[
        size_ok,
        ["GO name", "hg_pval", "emp_pval", "passed_oob_permutation_test"]]
    print("total n_genes with pval:{}/{}".format(
        np.size(filtered_genes["hg_pval"].values), total_terms))

    # Hypergeometric scores: largest score (smallest p-value) first.
    sorted_genes_hg = filtered_genes.sort_values(by=['hg_pval'],
                                                 ascending=False)
    hg_scores = sorted_genes_hg["hg_pval"].values
    padded_scores = np.append(hg_scores,
                              np.zeros(total_terms - np.size(hg_scores)))
    sig_genes_hg_pval = [10 ** (-x) for x in padded_scores]
    fdr_results = fdrcorrection0(sig_genes_hg_pval, alpha=0.05,
                                 method='indep', is_sorted=False)
    n_hg_true = len([cur for cur in fdr_results[0] if cur])
    sig_hg_genes = sorted_genes_hg.iloc[:n_hg_true, :]
    # The "else 0" fallback belongs on the cutoff, as in the empirical branch
    # below.  It used to replace the DataFrame itself, so the next ``.iloc``
    # crashed whenever nothing was significant.
    HG_CUTOFF = (10 ** (-sig_hg_genes.iloc[-1]["hg_pval"])
                 if n_hg_true > 0 else 0)
    print("HG cutoff: {}, n={}".format(HG_CUTOFF, len(sig_hg_genes.index)))

    # Empirical p-values: smallest first.
    sorted_genes_emp = filtered_genes.sort_values(by=['emp_pval'])
    sorted_genes_emp.loc[sorted_genes_emp['emp_pval'] == 0,
                         'emp_pval'] = 1.0 / 1000
    sig_genes_emp_pval = sorted_genes_emp["emp_pval"].values
    fdr_results = fdrcorrection0(sig_genes_emp_pval, alpha=0.05,
                                 method='indep', is_sorted=False)
    n_emp_true = len([cur for cur in fdr_results[0] if cur])
    sig_emp_genes = sorted_genes_emp.iloc[:n_emp_true, :]
    EMP_CUTOFF = sig_emp_genes.iloc[-1]["emp_pval"] if n_emp_true > 0 else 0
    print("EMP cutoff: {}, n={}".format(EMP_CUTOFF, len(sig_emp_genes.index)))

    return sig_hg_genes, sig_emp_genes
def FDR(pvalues):
    """
    Return the FDR-adjusted p-values ("q-values") for a list of p-values.

    pvalues - list of p-values
    returns a list of q-values (second item of the fdrcorrection0 result)
    """
    _rejected, qvalues = fdrcorrection0(pvalues)
    return qvalues
def calc_sliding_fdr_correct(feat_df, unannotated_only=False):
    """Attach p-values and FDR-corrected p-values for ``sliding_zscore``.

    Missing values in *feat_df* are replaced by 0.0 first.  P-values come
    from ``stats.norm.sf`` of the z-scores (no absolute value is taken).

    :param feat_df: DataFrame with a ``sliding_zscore`` column (and an
        ``annotated`` boolean column when *unannotated_only* is true)
    :param unannotated_only: keep only rows where ``annotated`` is false
    :return: DataFrame with columns ``sliding_pvalues`` and
        ``sliding_pvalues_fdrcor``
    """
    df = feat_df.fillna(0.0)
    if unannotated_only:
        df = df[~df.annotated]

    zscores = df['sliding_zscore'].values
    df['sliding_pvalues'] = stats.norm.sf(zscores)
    _rejected, corrected = fdrcorrection0(df['sliding_pvalues'])
    df['sliding_pvalues_fdrcor'] = corrected
    return df[['sliding_pvalues', 'sliding_pvalues_fdrcor']]
def anova(self, sub=None, bottom=0, top=None, step=1, verbose=True):
    """
    Perform ANOVA tests (omnibus test for difference in mean) for each epoch
    with mask as factor.

    Args:
        sub: list of integers, specifies a subsample of the masks
            (defaults to all ``self.masks`` masks)
        bottom: lower range for printing epochs (inclusive)
        top: upper range for printing epochs (exclusive; defaults to
            ``self.epochs``)
        step: step size for printing epochs
        verbose: when true, print a tab-separated table of the per-mask means,
            F, p and corrected p for the selected epochs

    Returns:
        F
        p-values (uncorrected)
        p-values (corrected)
        each restricted to ``[bottom:top:step]``
    """
    if not top:
        top = self.epochs
    bound_types = [type(x) for x in [bottom, top, step]]
    assert bound_types == [int, int, int], (
        'bottom, top, step must be of type int, got %s' % str(bound_types))
    if not sub:
        # a real list, so it works as a NumPy index on Python 3 as well
        sub = list(range(self.masks))

    # One [F, p] pair per epoch.
    res = []
    for i in range(self.epochs):
        # split the selected masks into one 1-D sample per mask
        args = np.split(self.data[sub, :, i], len(sub), axis=0)
        args = [x.reshape((x.shape[1])) for x in args]
        F, p = stats.f_oneway(*args)  # F: test statistic, p: p-value
        res.append([F, p])

    # correct p-values for multiple comparison (Benjamini-Hochberg)
    _, p_corr_list = multicomp.fdrcorrection0(
        [x[1] for x in res], alpha=0.05, method='indep')

    if verbose:
        # Each line is assembled first and printed once.  All fragments end in
        # a tab, so this matches the old trailing-comma print output exactly.
        print("ANOVA: %s" % self.name)
        header = "Epoch\t" + "".join(
            "MEAN(%s)\t" % self.labels[i][0:9] for i in sub)
        print(header + "F\tp\tSignif.\tp corr\tSignif")
        for i in range(bottom, top, step):
            row = "%.0f\t" % i + "".join(
                "%.4f\t\t" % self.mean[j, i] for j in sub)
            F, p = res[i]
            p_corr = p_corr_list[i]
            print(row + "%.2f\t%.4f\t%s\t%.2f\t%s" %
                  (F, p, p_symbol(p), p_corr, p_symbol(p_corr)))

    return ([x[0] for x in res][bottom:top:step],
            [x[1] for x in res][bottom:top:step],
            p_corr_list[bottom:top:step])
def test_norm_intensity(df, samp_grps, paired, parametric):
    """
    run t-tests (or nonparametric tests) on dataframe.

    NOTE: zeros in ``df`` are replaced by NaN in place, so the caller's frame
    is modified.

    :param df: intensity df, missing values are NaN
    :param samp_grps: is a SampleGroups() object
    :param paired: whether or not to use a paired test
    :param parametric: Whether or not to use a parametric test
    :return: dataframe with appended pvalue columns
    """
    grp1_intcols = samp_grps.sample_names[samp_grps.grp_names[0]]
    grp2_intcols = samp_grps.sample_names[samp_grps.grp_names[1]]

    # change any zeros back to NaN
    df.replace(0, np.nan, inplace=True)

    # the tests are applied to a copy
    test_df = df.copy()

    # test, using logged df
    if parametric:
        # ``scipy.stats.stats`` is a deprecated private namespace, so call the
        # public ``scipy.stats.ttest_ind`` instead.
        # NOTE(review): ``equal_var=paired`` switches between Student's and
        # Welch's *independent* t-test.  It does not perform a paired test
        # (that would be ``ttest_rel``) -- confirm the intended behaviour.
        test_results = test_df.apply(
            lambda x: sps.ttest_ind(x[grp1_intcols].dropna(),
                                    x[grp2_intcols].dropna(),
                                    equal_var=paired).pvalue,
            axis=1)
    else:
        if paired:
            # wilcoxon is non-parametric equivalent of paired t-test
            test_results = test_df.apply(
                lambda x: sps.wilcoxon(x[grp1_intcols].dropna(),
                                       x[grp2_intcols].dropna()).pvalue,
                axis=1)
        else:
            # rank sum test is nonparametric equivalent of unpaired t-test
            test_results = test_df.apply(
                lambda x: sps.ranksums(x[grp1_intcols].dropna(),
                                       x[grp2_intcols].dropna()).pvalue,
                axis=1)

    # append fold changes to df
    df_means = log2_fold_change(df, samp_grps)

    # column suffix derived from the fold-change column name
    suffix = samp_grps.fc_name.replace("log2fc_", "")

    # p values, uncorrected for multiple comparisons
    df_means[P_COLNAME + "_" + suffix] = test_results

    # fdr correction
    df_means[P_CORR_COLNAME + "_" + suffix] = mc.fdrcorrection0(
        test_results, method='indep')[1]

    # reset the index to be 'id' - this is mostly for testing, and doesn't
    # affect the output file
    df_means.set_index('id', drop=False, inplace=True)
    return df_means
def retain_relevant_slices(G_original, module_sig_th):
    """Keep the connected components of ``G_modularity`` that pass both filters.

    Every connected component is handed to ``pf_filter`` in a worker pool.
    Components for which it returns ``None`` are dropped; the remaining
    ``(component, score)`` pairs are corrected with ``fdrcorrection0`` and
    only the rejected ones are kept.

    :param G_original: the original graph; only its size (``len``) is used
    :param module_sig_th: alpha passed to ``fdrcorrection0``
    :return: ``(union_graph, node_lists, corrected_scores)``.  The graph is
        empty when no component passes; when ``pf_filter`` keeps nothing the
        result is ``(nx.Graph(), [], [])``.
    """
    global G_modularity

    pertubed_nodes = [
        cur_node for cur_node in G_modularity.nodes()
        if G_modularity.nodes[cur_node]["pertubed_node"]
    ]
    ccs = [
        G_modularity.subgraph(c) for c in connected_components(G_modularity)
    ]
    n_G_original = len(G_original)
    n_pertubed_nodes = len(pertubed_nodes)
    print(f"number of slices: {len(ccs)}")

    perturbation_factor = min(0.7, (float(n_pertubed_nodes) / n_G_original) *
                              (1 + 100 / n_G_original**0.5))
    params = [
        [n_G_original, cur_cc, i_cur_cc, n_pertubed_nodes, perturbation_factor]
        for i_cur_cc, cur_cc in enumerate(ccs)
    ]

    p = multiprocessing.Pool(constants.N_OF_THREADS)
    try:
        res = [a for a in p.map(pf_filter, params) if a is not None]
    finally:
        # release the pool even when a worker raises
        p.close()
    print(f'# of slices after perturbation TH: {len(res)}/{len(params)}')

    if len(res) == 0:
        return nx.Graph(), [], []

    large_modules, sig_scores = zip(*res)
    fdr_bh_results = fdrcorrection0(sig_scores, alpha=module_sig_th,
                                    method='indep', is_sorted=False)
    passed_modules = [
        cur_cc
        for cur_cc, is_passed_th in zip(large_modules, fdr_bh_results[0])
        if is_passed_th
    ]
    if len(passed_modules) > 0:
        merged = nx.algorithms.operators.union_all(passed_modules)
    else:
        merged = nx.Graph()
    return merged, [list(m.nodes) for m in passed_modules], fdr_bh_results[1]
def p_adj_map_from_predictions(preds_pc, data_to_map):
    '''Creates a p map with adjusted p values from predictions.

    Both inputs are standardised column-wise, a per-column correlation ``r``
    is formed from the products of the standardised values, and ``r`` is
    converted to a p-value through ``t**2 = r**2 * df / ((1 - r) * (1 + r))``
    with ``df = n - 2`` and the regularized incomplete beta function.

    :param preds_pc: 2-D array of predictions (rows are samples)
    :param data_to_map: 2-D array with the same shape as *preds_pc*
    :return: the ``fdrcorrection0`` result, ``(rejected, corrected_pvalues)``
    '''
    from sklearn.preprocessing import StandardScaler
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # function it wrapped.
    from scipy.special import betainc

    mx = StandardScaler().fit_transform(preds_pc)
    my = StandardScaler().fit_transform(data_to_map)
    n = mx.shape[0]
    # 1.0 forces true division; under Python 2 the integer ``1/(n-1)``
    # evaluated to 0 and zeroed every correlation.
    # NOTE(review): StandardScaler presumably scales with the population std
    # (ddof=0), in which case dividing by n - 1 slightly overestimates r --
    # confirm whether n was intended.
    r = (1.0 / (n - 1)) * ((mx * my).sum(axis=0))
    df = n - 2
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return fdrcorrection0(prob)
def save_heads(heads_path, t_data, p_data, sensor_type, freqs):
    """Save one topomap PNG per column of *t_data* into *heads_path*.

    Entries of *t_data* whose p-value is not rejected by ``fdrcorrection0``
    (alpha 0.3, over all entries of *p_data*) are overwritten in place with
    0.001.

    :param heads_path: output directory, created if missing
    :param t_data: 2-D array of statistics; modified in place
    :param p_data: 2-D array of p-values with the same shape as *t_data*
    :param sensor_type: passed through to ``visualise``
    :param freqs: sequence indexed by column number, passed to ``visualise``
    """
    if not os.path.isdir(heads_path):
        os.makedirs(heads_path)

    significant = fdrcorrection0(p_data.flatten(), 0.3)[0]
    significant = significant.reshape(p_data.shape)
    # Because topomap can't draw zero heads
    t_data[~significant] = 0.001

    n_columns = t_data.shape[1]
    for column in range(n_columns):
        title = visualise(t_data[:, column], p_data[:, column],
                          sensor_type, freqs[column])
        out_file = os.path.join(heads_path, title + '.png')
        plt.savefig(out_file)
        plt.close()
def _fdr_correct(self, pvalue_matrix, dom_shape):
    """Return the cells whose FDR-corrected p-value is below the threshold.

    NaN entries of *pvalue_matrix* are set to 1.0 in place before correction.

    :param pvalue_matrix: array of p-values; modified in place
    :param dom_shape: 2-tuple used to reshape the corrected values
    :return: list of ``(row, column, corrected_pvalue)`` tuples in row-major
        order, for corrected p-values strictly below ``self.threshold``
    """
    logger.info('FDR correcting the pvalues')
    nan_cells = np.isnan(pvalue_matrix)
    pvalue_matrix[nan_cells] = 1.0

    flat_corrected = fdrcorrection0(pvalue_matrix.flatten(), self.threshold)[1]
    corrected = flat_corrected.reshape(dom_shape)

    n_rows, n_cols = dom_shape[0], dom_shape[1]
    return [
        (i, j, corrected[i, j])
        for i in range(n_rows)
        for j in range(n_cols)
        if corrected[i, j] < self.threshold
    ]
def calc(self, regs, hicmap):
    """
    Calculates weighted intensities, p-values and q-values (FDR-corrected p-values) for region intensity profiles.

    For every region the intensity profile at index ``len(region.pvalues)`` is
    used, so each call consumes the next profile that has no p-values yet.
    The results are appended to ``region.pvalues``,
    ``region.corrected_pvalues`` and ``region.weighted``; nothing is returned.

    :param hicmap: HiC map to run calculations for
    :param regs: list of Bins (with regions)
    """
    pvals = []
    ints = []
    for region in regs:
        pvals.append([])
        for i in range(-self.num_bins, self.num_bins + 1):
            if hicmap.fits[i] is not None:
                # survival function of the distribution fitted for offset i;
                # len(pvals[-1]) is the position currently being filled
                pvals[-1].append(
                    ss.exponweib.sf(
                        region.intensities[len(region.pvalues)][len(
                            pvals[-1])], *hicmap.fits[i]))
            else:
                # no fit available for this offset: p-value of 1
                pvals[-1].append(1.0)
        # must run before region.pvalues grows below, as the index depends on it
        ints.append(region.intensities[len(region.pvalues)])
        p = np.array(pvals[-1])
        pvals[-1] = p
        p[p == 0.0] = 0.000000000000000000000000000001  # TODO delete that? useful for log representation
        region.pvalues.append(p)
    logger.debug('Calculated pvalues for map ' + hicmap.get_name())
    pvals, ints, corr_big = np.array(pvals), np.array(ints), np.array(ints)
    # only p-values at positions with non-zero intensity are corrected
    corrected = fdrcorrection0(
        np.array(pvals)[ints.nonzero()], self.threshold)[1]
    logger.debug('Calculated qvalues for map ' + hicmap.get_name())
    corrected[
        corrected == 0.0] = 0.000000000000000000000000000001  # TODO delete that? useful for log representation
    # corr_big starts as a copy of the intensities: non-zero cells receive the
    # corrected p-values and the remaining zero cells become 1.0
    corr_big[corr_big.nonzero()] = corrected
    corr_big[np.nonzero(corr_big == 0.0)] = 1.0
    # NOTE(review): reshape returns a new array and the result is not assigned,
    # so this line has no effect -- confirm whether an assignment was intended.
    corr_big.reshape(pvals.shape)
    for r in range(len(regs)):
        # index of the profile handled in this call for region r
        x = len(regs[r].corrected_pvalues)
        regs[r].corrected_pvalues.append(corr_big[r])
        # each intensity is divided by a hicmap mean chosen from its position
        # relative to the profile centre; NaN means give 0.0
        regs[r].weighted.append([
            0.0 if np.isnan(
                hicmap.means[-(int(len(regs[r].intensities[x]) / 2.0) -
                               regs[r].intensities[x].index(y))]) else y /
            hicmap.means[-(int(len(regs[r].intensities[x]) / 2.0) -
                           regs[r].intensities[x].index(y))]
            for y in regs[r].intensities[x]
        ])
def tip_fdr(a, alpha=0.05):
    """
    Returns adjusted TIP p-values for a particular `alpha`.

    (see :func:`tip_zscores` for more info)

    The z-scores are converted to upper-tail p-values with the normal
    survival function.  The earlier code used ``norm.pdf``, which is a
    density and not a tail probability, so strongly negative z-scores were
    treated like strongly positive ones.

    NOTE(review): a one-sided (upper-tail) test is assumed here -- confirm
    against the intended TIP definition.

    :param a: NumPy array, where each row is the signal for a feature
    :param alpha: False discovery rate
    """
    zscores = tip_zscores(a)
    pvals = stats.norm.sf(zscores)
    # alpha only affects the (unused) rejection mask, not the adjusted values
    rejected, fdrs = fdrcorrection0(pvals, alpha=alpha)
    return fdrs
def plot_dists(original, bg, interactions=None, file_name="", p_factor=10,
               file_suffix=None, use_cache=1):
    """Plot interaction distributions in parallel and summarise their p-values.

    Interactions are processed in chunks of at most 9 by
    ``plot_interaction_set`` in a worker pool.  The workers write their
    p-values into a shared dict, which is then turned into a summary table
    and written to ``emp_pvals_summary_<file_suffix>_<file_name>.tsv`` in
    ``constants.OUTPUT_FOLDER``.

    :param original: object whose ``index`` supplies the default interactions
    :param bg: background matrix; a p-value of 0 is replaced by
        ``1 / bg.shape[1]``
    :param interactions: interactions to plot; defaults to the sorted index
        of *original*
    :param file_name: used in the output file name and passed to the workers
    :param p_factor: number of worker processes
    :param file_suffix: used in the output file name and passed to the workers
    :param use_cache: unused; kept for interface compatibility
    :return: DataFrame with ``pval``, ``qval``, ``enrichment_score``,
        ``zscore`` and ``rank`` columns
    """
    if interactions is None:
        interactions = sorted(list(original.index))

    if len(interactions) > 9:
        print("splitting interactions (total: {})".format(len(interactions)))
        # Stepping by 9 avoids the float index that ``len / 9`` produces on
        # Python 3, and the empty trailing chunk the old ``len / 9 + 1`` count
        # produced when the total is a multiple of 9.
        interaction_sets = [
            interactions[start:start + 9]
            for start in range(0, len(interactions), 9)
        ]
    else:
        interaction_sets = [interactions]

    p = multiprocessing.Pool(p_factor)
    output = multiprocessing.Manager().dict()
    params = [
        [bg, file_name, i_sets, interaction_set, original, file_suffix, output]
        for i_sets, interaction_set in enumerate(interaction_sets)
    ]
    try:
        p.map(plot_interaction_set, params)
    finally:
        p.close()
    print("done!")

    df_emp_pvals = pd.DataFrame()
    for k, v in dict(output).items():
        df_emp_pvals.loc[k, "pval"] = v
    # single .loc assignment instead of chained indexing, which may write to
    # a temporary copy
    df_emp_pvals.loc[df_emp_pvals['pval'] == 0, 'pval'] = 1.0 / bg.shape[1]
    df_emp_pvals['qval'] = fdrcorrection0(df_emp_pvals.loc[:, 'pval'])[1]
    df_emp_pvals['enrichment_score'] = -np.log10(df_emp_pvals.loc[:, 'pval'])
    df_emp_pvals['zscore'] = zscore(df_emp_pvals.loc[:, 'enrichment_score'])
    df_emp_pvals['rank'] = rankdata(df_emp_pvals.loc[:, 'pval'])
    df_emp_pvals.to_csv(os.path.join(
        constants.OUTPUT_FOLDER,
        "emp_pvals_summary_{}_{}.tsv".format(file_suffix, file_name)),
        sep='\t')
    return df_emp_pvals
def calc_ttest(dataset=constants.DATASET_NAME,
               gene_expression_file_name="ge.tsv"):
    """Run a per-gene t-test between class 1 and class 2 and cache the table.

    Genes whose t-test p-value is NaN are reported and dropped.  The remaining
    rows are written, with ``pval``, ``qval`` and ``qscore`` (``-log10`` of
    the q-value) columns appended, to ``deg_t.tsv`` in ``constants.CACHE_DIR``
    and read back with pandas.

    :param dataset: not used in this function; kept for interface
        compatibility
    :param gene_expression_file_name: expression file passed to the loader
    :return: ``{"result": DataFrame}`` read back from ``deg_t.tsv``
    """
    h_rows, h_cols, ge_dataset = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=gene_expression_file_name))
    # ``np.int`` was only an alias of the builtin and is gone from NumPy.
    classes = np.array(infra.load_classes()).astype(int)

    pvals = []
    rows_to_delete = []
    for i, cur in enumerate(list(h_rows)):
        case_values = ge_dataset[i, classes == 1]
        wt_values = ge_dataset[i, classes == 2]
        pval = ttest_ind(case_values, wt_values).pvalue
        if np.isnan(pval):
            print("case: {}, wt: {}".format(case_values, wt_values))
            rows_to_delete.append(i)
        else:
            pvals.append(pval)

    # drop the rows that produced a NaN p-value
    ind = np.ones((len(h_rows), ), bool)
    ind[rows_to_delete] = False
    h_rows = h_rows[ind]
    ge_dataset = ge_dataset[ind, :]

    qvals = fdrcorrection0(pvals, alpha=0.05, method='indep',
                           is_sorted=False)[1]
    qscores = [-log10(qval) for qval in qvals]

    output_h_cols = ["id"] + list(h_cols) + ["pval", "qval", "qscore"]
    output_matrix = np.c_[h_rows, ge_dataset, pvals, qvals, qscores]
    output_matrix = np.r_[np.reshape(output_h_cols, (1, len(output_h_cols))),
                          output_matrix]
    lines = ["\t".join(cur) for cur in output_matrix]

    out_path = os.path.join(constants.CACHE_DIR, "deg_t.tsv")
    # ``file()`` does not exist on Python 3; the context manager also makes
    # sure the data is flushed before it is read back.
    with open(out_path, "w+") as out_file:
        out_file.write("\n".join(lines))
    return {"result": pd.read_csv(out_path, sep="\t", index_col=0)}
def tabulate_week_to_control_stats(df, sample_type, metric,
                                   test_fn=scipy.stats.mannwhitneyu):
    """Tabulate per-week statistics of the autism group against the controls.

    For every distinct week, the non-missing *metric* values of that week are
    compared with the values returned by ``control_metric``.

    :param df: sample metadata frame, filtered here with ``filter_sample_md``
    :param sample_type: value required in the ``SampleType`` column
    :param metric: name of the column to compare
    :param test_fn: callable taking ``(week_values, control_values)`` and
        returning ``(statistic, p_value)``; an alternative is
        ``partial(scipy.stats.ttest_ind, equal_var=False)``
    :return: DataFrame indexed by week with columns ``n``, *metric* (median),
        ``test-statistic``, ``p-value`` and ``q-value``; non-finite p-values
        get a NaN q-value
    """
    control_week0 = control_metric(df, sample_type, metric=metric)
    asd_data = filter_sample_md(df, [('SampleType', sample_type),
                                     ('Group', 'autism')])
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()

    results = []
    for i in weeks:
        weeki = asd_data[metric][asd_data['week'] == i].dropna()
        t, p = test_fn(weeki, control_week0)
        results.append((len(weeki), np.median(weeki), t, p))
    result = pd.DataFrame(results,
                          index=pd.Index(weeks, name='week'),
                          columns=['n', metric, 'test-statistic', 'p-value'])

    # fdrcorrection0 does not handle NaN input (see
    # https://github.com/statsmodels/statsmodels/issues/2899), so correct only
    # the finite p-values and leave the others as NaN.
    pvals = np.asarray(result['p-value'], dtype=float)
    mask = np.isfinite(pvals)
    qvals = np.full(len(pvals), np.nan)
    if mask.any():
        qvals[mask] = fdrcorrection0(pvals[mask])[1]
    result['q-value'] = qvals
    return result
def calc_pval_dist(mat, aneu_types, chr_interaction, pval_method_name, label):
    """Compute p/q-values for chromosome-arm pairs and save them as a TSV.

    Every combination of arms ``1p``..``22q`` and every pair in *aneu_types*
    is visited.  Combinations rejected by the filter below, or whose arm is
    not a column of *mat*, are skipped.  The table is written to
    ``hgs_emp_dist_<interaction name>_<pval_method_name>_<label>.tsv`` in
    ``constants.OUTPUT_FOLDER``.

    :param mat: DataFrame with one column per chromosome arm (e.g. ``"3p"``)
    :param aneu_types: iterable of ``(aneu_type_a, aneu_type_b)`` pairs
    :param chr_interaction: flag selecting which arm pairs are tested; also
        the key into ``constants.CHR_INTERACTION_NAMES``
    :param pval_method_name: key into ``PVAL_METHODS``
    :param label: suffix of the output file name
    :return: DataFrame sorted by ``pval`` with columns ``pval``, ``qval``,
        ``enrichment_score``, ``zscore`` and ``rank``
    """
    # Parallel lists replace the old one-entry dicts, whose ``.values()[0]``
    # and ``.keys()[0]`` lookups only worked on Python 2.
    names = []
    pvals = []
    for a in np.arange(1, 23):
        for arm_a in ['p', 'q']:
            for b in np.arange(a, 23):
                for arm_b in ['p', 'q']:
                    for aneu_type_a, aneu_type_b in aneu_types:
                        if (arm_a <= arm_b and b == a) \
                                or (chr_interaction and b == a) \
                                or (not chr_interaction and b != a):
                            continue
                        if "{}{}".format(a, arm_a) not in mat.columns:
                            continue
                        if "{}{}".format(b, arm_b) not in mat.columns:
                            continue
                        pval = PVAL_METHODS[pval_method_name](
                            mat, a, arm_a, aneu_type_a, b, arm_b, aneu_type_b)
                        names.append("{}{}{}-{}{}{}".format(
                            a, arm_a, aneu_type_a, b, arm_b, aneu_type_b))
                        pvals.append(pval)

    qvals = fdrcorrection0(pvals)[1]
    df = pd.DataFrame(data=[[pval, qval] for pval, qval in zip(pvals, qvals)],
                      columns=['pval', 'qval'],
                      index=names)
    df["enrichment_score"] = -np.log10(df.loc[:, "pval"])
    df["zscore"] = zscore(df.loc[:, "enrichment_score"])
    df["rank"] = rankdata(df.loc[:, "pval"])
    df = df.sort_values(by=["pval"])
    df.to_csv(os.path.join(
        constants.OUTPUT_FOLDER, "hgs_emp_dist_{}_{}_{}.tsv".format(
            constants.CHR_INTERACTION_NAMES[chr_interaction],
            pval_method_name, label)),
        sep='\t')
    return df
def main():
    """Print mean, uncorrected and FDR-corrected p-values per result column.

    The formatted results are tested column-wise against 0.5 with a
    one-sample t-test, and the p-values are then FDR-corrected.
    """
    res_path = os.path.join(
        '/home/dsoto/public/Fantasmas_MRI_analysis/images_forAnalysis/Functional/results-univar-FS'
    )
    formatted = format_results(res_path)

    # one-sample t-test of the classification performance against 0.5,
    # in all ROIs
    p_values = ttest_1samp(formatted, 0.5)[1]
    corr_p_mask, corr_p_values = fdrcorrection0(p_values)

    summary = pd.DataFrame(
        {
            "mean": formatted.mean(),
            'corrected': corr_p_values,
            'uncorrected': p_values
        },
        index=formatted.columns).T
    # print() with one argument behaves the same on Python 2 and Python 3
    print(summary)
def ttest_fdr_corrected(X, y, alpha=0.05, return_as_df=False):
    """ FDR corrected ttest pvalue (created 11/20/2015)

    http://statsmodels.sourceforge.net/devel/generated/
    statsmodels.sandbox.stats.multicomp.fdrcorrection0.html#statsmodels.sandbox.stats.multicomp.fdrcorrection0

    Updates
    -------
    - 11/20/2015: created function
    - 01/25/2016: added option ``return_as_df``

    Parameters
    ----------
    X : ndarray of shape [n,p]
        Data matrix (samples as row vectors)
    y : ndarray of shape [n,]
        Label vector
    alpha : float (default=0.05)
        FDR level used to decide which hypotheses are rejected.  It was
        previously ignored in favour of a hard-coded 0.05.
    return_as_df : bool (default=False)
        Return result as a three-column pandas DataFrame
        (``tstats``, ``pval``, ``rejected``)

    Returns
    -------
    tstats : ndarray
        Vector of tstats, as returned by ``ttest_twosample_fixnan``
    pval_corr : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR
    idx_rejected : ndarray of int
        1 if a hypothesis is rejected, 0 if not
    """
    tstats, pval = ttest_twosample_fixnan(X, y)

    from statsmodels.sandbox.stats.multicomp import fdrcorrection0
    idx_rejected, pval_corr = fdrcorrection0(pval, alpha=alpha)
    idx_rejected = idx_rejected.astype(int)

    if return_as_df:
        df = pd.DataFrame([tstats, pval_corr, idx_rejected],
                          index=['tstats', 'pval', 'rejected']).T
        return df
    return tstats, pval_corr, idx_rejected
def doHyperG(genelist, allgenes, allterms, assocname):
    """Print a hypergeometric term-enrichment table for *genelist*.

    For every term with at least one gene in *genelist*, the p-value is the
    probability of seeing at least the observed overlap, P(X >= x), followed
    by FDR adjustment over all tested terms.  The earlier ``1 - cdf(x)`` gave
    P(X > x), which leaves out the observed count and understates the
    p-value.

    :param genelist: iterable of gene identifiers under test
    :param allgenes: dict keyed by every gene that has annotations
    :param allterms: dict mapping term -> collection of annotated genes
    :param assocname: dict mapping term -> display name (optional per term)
    """
    geneswithterms = allgenes.keys()
    genelist_set = set(genelist)  # built once instead of once per term
    M = len(geneswithterms)
    N = len(set(geneswithterms).intersection(genelist_set))

    pvalues = []
    termsingenelist = []
    termsinbackground = []
    termname = []
    overlaps = []
    for t in allterms.keys():
        n = len(allterms[t])
        overlap = set(allterms[t]).intersection(genelist_set)
        x = len(overlap)
        if x == 0:
            continue
        # sf(x - 1) == P(X >= x); also more accurate than 1 - cdf for tiny p
        pvalues.append(hypergeom.sf(x - 1, M, n, N))
        termsingenelist.append(x)
        termsinbackground.append(n)
        termname.append(t)
        overlaps.append(overlap)

    # with no tested term only the header is printed
    adjpvalue = list(fdrcorrection0(pvalues)[1]) if pvalues else []

    print("\t".join(["Term annotation", "pvalue", "fdr adj pvalue",
                     "Background", "Expected", "GeneList", "Observed",
                     "Genes"]))
    for u in range(len(adjpvalue)):
        gotermname = assocname.get(termname[u], termname[u])
        print("\t".join([gotermname,
                         str(pvalues[u]),
                         str(adjpvalue[u]),
                         str(M),
                         str(termsinbackground[u]),
                         str(N),
                         str(termsingenelist[u]),
                         ",".join(list(overlaps[u]))]))
def RFE(tested_gene_list_file_name,
        expression_profile_file_name,
        phenotype_file_name,
        rank_method=LOGISTIC_REGRESSION,
        gene_filter_file_name="protein_coding.txt",
        rounds=2,
        recursion_step_size=2,
        start_index=0,
        recursion_number_of_steps=20,
        pval_preprocessing_file_name=None,
        permutation=NORMAL,
        groups=None,
        classification_method="svm_rbf_default",
        tuning_parameters={
            'C': [10],
            'kernel': ['rbf']
        }):
    """Score a classifier on growing prefixes of a t-test-ranked gene list.

    Genes are ranked by their two-group t-test p-value and cut at the number
    of hypotheses rejected by ``fdrcorrection0`` (alpha 0.05).  For each of
    *rounds* shuffles of the patients and each of *recursion_number_of_steps*
    gene subsets, the classifier is trained and its test auPR / auROC
    recorded.

    :param pval_preprocessing_file_name: name of the p-value cache file
        inside ``CACHE_DIR``.  ``None`` disables the cache; it used to crash
        in ``os.path.join``.
    :param permutation: ``RANDOMIZED`` shuffles the ranking, ``REVERSED``
        reverses it, anything else keeps it
    :param classification_method: name of a module-level factory called with
        *tuning_parameters* to build the classifier
    :return: ``(test_auPR_scores, test_auROC_scores)``, each a list with one
        list of per-round scores per recursion step

    The remaining parameters are passed through to the project helpers.
    """
    thismodule = sys.modules[__name__]
    clf = getattr(thismodule, classification_method)(tuning_parameters)
    print("about ot analyse: {}".format(tested_gene_list_file_name))

    # fetch gene expression by gene_id, divided by tumor type
    # test pval for significance differentiation between label values
    # (primar vs metastatic)
    data, labels, groups, gene_ids = load_svm_data(
        tested_gene_list_file_name,
        expression_profile_file_name,
        phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        groups=groups)

    # One path for both loading and saving.  The save path used to be joined
    # with CACHE_DIR twice, which differs from the load path whenever
    # CACHE_DIR is relative.
    cache_path = None
    if pval_preprocessing_file_name is not None:
        cache_path = os.path.join(CACHE_DIR, pval_preprocessing_file_name)

    if cache_path is not None and os.path.isfile(cache_path) and USE_CACHE:
        gene_pval_pair = load_sets(cache_path)
        print("pval loaded from file")
    else:
        group_0_expression = groups[0]
        group_1_expression = groups[1]
        pvals = []
        gene_symbols = []
        # row 0 and column 0 are skipped; column 0 supplies the gene symbol
        for i in range(1, len(group_0_expression)):
            cur_pval = scipy.stats.ttest_ind(
                [float(c) for c in group_0_expression[i][1:]],
                [float(c) for c in group_1_expression[i][1:]])[1]
            if not math.isnan(cur_pval):
                pvals.append(cur_pval)
                gene_symbols.append(group_0_expression[i][0])

        # sort gene_id-pval pairs by pval (sorted() also works on Python 3,
        # where zip() has no .sort())
        gene_pval_pair = sorted(zip(gene_symbols, pvals), key=lambda x: x[1])
        if cache_path is not None:
            save_sets(gene_pval_pair, cache_path)
            print("pval saved to file")

    # calculate number of true hypothesis after correction
    pvals = [cur[1] for cur in gene_pval_pair]
    fdr_results = fdrcorrection0(pvals, alpha=0.05, method='indep',
                                 is_sorted=True)
    true_counter = len([cur for cur in fdr_results[0] if cur == True])
    print("true hypothesis: {}/{}".format(true_counter,
                                          np.size(fdr_results[0])))

    gene_ids_ranked = [cur[0] for cur in gene_pval_pair]
    gene_ids_ranked = gene_ids_ranked[:true_counter]
    if permutation == RANDOMIZED:
        random.shuffle(gene_ids_ranked)
    elif permutation == REVERSED:
        gene_ids_ranked = list(reversed(gene_ids_ranked))

    test_auPR_scores = [[] for _ in range(recursion_number_of_steps)]
    test_auROC_scores = [[] for _ in range(recursion_number_of_steps)]

    genelist_datasets = filter_gene_expressions_preprocessed(
        data, gene_ids_ranked, recursion_number_of_steps, recursion_step_size,
        start_index, gene_ids)
    for i in range(rounds):
        # rotate so randonize_patients can work on the data, then rotate back
        genelist_datasets = np.rot90(genelist_datasets, k=1, axes=(1, 0))
        genelist_datasets, labels = randonize_patients(genelist_datasets,
                                                       labels)
        genelist_datasets = np.rot90(genelist_datasets, k=-1, axes=(1, 0))
        for j in range(recursion_number_of_steps):
            cur_dataset = genelist_datasets[j]
            data_train, data_test, labels_train, labels_test = \
                divide_train_and_test_groups(cur_dataset, labels)
            test_auPR, test_auROC = apply_svm(clf, data_train, labels_train,
                                              data_test, labels_test,
                                              rank_method)
            test_auPR_scores[j].append(test_auPR)
            test_auROC_scores[j].append(test_auROC)

    print("#######################################")
    print("AUPR results:")
    print_fre_results(test_auPR_scores, float(rounds),
                      tested_gene_list_file_name, rank_method, permutation)
    print("AUROC results:")
    print_fre_results(test_auROC_scores, float(rounds),
                      tested_gene_list_file_name, rank_method, permutation)
    return (test_auPR_scores, test_auROC_scores)
from scipy.stats import norm, ttest_1samp
from statsmodels.sandbox.stats.multicomp import fdrcorrection0

# Script: for each model and principal component, average the Fisher-z
# transformed subject maps, zero the voxels that are not significant after
# FDR correction, then plot and save the thresholded mean map.
models = ['logBSC_H200', 'logMFS']
mask = 'temporal_lobe_mask_grp_7T_test.nii.gz'
threshold = 0.001

for model in models:
    # scores = np.arctanh(apply_mask(glob.glob('MaThe/avg_maps/model_{}_p_adj_subj_*'.format(model)), mask_img=mask)).mean(axis=0)
    # mean_scores = scores.mean(axis=0)
    # t_values, p_values = ttest_1samp(scores, 0, axis=0)
    # corr_p_values = fdrcorrection0(p_values, alpha=0.05)
    ## threshold = np.min(mean_scores[corr_p_values<0.05])
    # threshold = 0.001
    # fsf.save_map_avg('avg_unthresh', mean_scores, threshold=threshold, model=model)
    # mean_scores[corr_p_values>=0.05] = 0
    # display = fsf.plot_avg(mean_scores, threshold)
    # fsf.save_map_avg('avg', mean_scores, threshold=threshold, model=model)
    # display.savefig('mean_scores_model_{}.svg'.format(model))
    # display.savefig('mean_scores_model_{}.png'.format(model))
    for pc in range(1, 4):
        subject_maps = glob.glob(
            'MaThe/avg_maps/model_{}_p_adj_pc_{}_subj_*'.format(model, pc))
        scores = np.arctanh(apply_mask(subject_maps, mask_img=mask))
        mean_scores = scores.mean(axis=0)
        t_values, p_values = ttest_1samp(scores, 0, axis=0)
        # fdrcorrection0 returns (rejected, corrected p-values).  The tuple
        # itself used to be compared with 0.05, so the mask below was never a
        # per-voxel mask.
        corr_p_values = fdrcorrection0(p_values, alpha=0.05)[1]
        mean_scores[corr_p_values >= 0.05] = 0
        display = fsf.plot_avg(mean_scores, threshold, vmax=0.27)
        display.savefig('mean_scores_model_{}_pc_{}.svg'.format(model, pc))
        display.savefig('mean_scores_model_{}_pc_{}.png'.format(model, pc))
        fsf.save_map_avg('avg', mean_scores, threshold=threshold,
                         model=model + '_pc_' + str(pc))
def run_stats(y_design, coord_mat, design_data, var_type):
    """Run smoothing and hypothesis testing for every covariate.

    Steps: build the design matrix (``read_x``), local linear kernel
    smoothing (``lpks``), smoothing of the residuals (``sif``), then a Wald
    test (``wald_ht``) and a bootstrap p-value (``bstrp_pvalue``) for each
    design column after the first.

    :param y_design: 3-D response array of shape ``(n, l, m)``; ``l`` is the
        number of local p-values per covariate and ``m`` the chi-square
        degrees of freedom
    :param coord_mat: coordinate matrix passed to the smoothing helpers
    :param design_data: raw design data passed to ``read_x``
    :param var_type: covariate types passed to ``read_x``
    :return: ``(gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design,
        efit_eta)``: global p-values, FDR-corrected local p-values and
        cluster p-values (one column per tested covariate), then the fitted
        quantities
    """
    n, l, m = y_design.shape

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    # Step 2. Statistical analysis: (1) smoothing and (2) hypothesis testing
    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print(
        "+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++"
    )
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # threshold on -log10(p) used to measure the area

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) +
              " is zero or not...")
        # local and global statistics calculation
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        # sf is the same as 1 - cdf but keeps precision for tiny p-values,
        # where 1 - cdf rounds to exactly 0
        lpvals[:, pp] = np.squeeze(stats.chi2.sf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        # Generate random samples and calculate the corresponding statistics
        # and pvalues
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)
        gpvals[0, pp] = gpval
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is " +
              str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)
    return gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
def run_script(input_dir, output_dir):
    """
    Run the commandline script for MFSDA.

    Reads ``aligned_shapes.mat``, ``template.mat``, ``design_data.txt`` and
    ``var_type.txt`` from ``input_dir``, runs smoothing and hypothesis
    testing, and writes ``global_pvalue.txt``, ``local_pvalue_fdr.txt`` and
    ``cluster_pvalue.txt`` into ``output_dir``.

    Args:
        input_dir (str): full path to the data folder
        output_dir (str): full path to the output folder
    """
    # Local import so the block does not rely on a module-level ``import os``.
    import os

    # Step 1. load dataset
    print("loading data ......")
    print("+++++++Read the surface shape data+++++++")
    # os.path.join works with or without a trailing separator on the folder,
    # whereas plain string concatenation silently built a wrong path.
    shape_file_name = os.path.join(input_dir, "aligned_shapes.mat")
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    _, n_loc, dof = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))

    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = os.path.join(input_dir, "template.mat")
    mat = loadmat(template_file_name)
    coord_mat = mat['template']

    print("+++++++Read the design matrix+++++++")
    design_data_file_name = os.path.join(input_dir, "design_data.txt")
    design_data = np.loadtxt(design_data_file_name)

    # read the covariate type
    var_type_file_name = os.path.join(input_dir, "var_type.txt")
    var_type = np.loadtxt(var_type_file_name)

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    # Step 2. Statistical analysis: including (1) smoothing and
    # (2) hypothesis testing
    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    print("Elapsed time is " + str(stop - start))

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    print("Elapsed time is " + str(stop - start))

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()

    lpvals = np.zeros((n_loc, p - 1))
    lpvals_fdr = np.zeros((n_loc, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))

    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # threshold on -log10(p) used to count supra-threshold entries

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) + " is zero or not...")

        # local and global statistics calculation
        # Contrast row selecting column pp + 1 of the design matrix
        # (column 0 is never tested).
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)

        # Survival function instead of 1 - cdf: the subtraction rounds very
        # small p-values to exactly 0, which then turn into inf under log10.
        lpvals[:, pp] = np.squeeze(stats.chi2.sf(lstat, dof))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        # Generate random samples and calculate the corresponding statistics
        # and pvalues
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)
        gpvals[0, pp] = gpval
        clu_pvals[0, pp] = clu_pval

        print("the global p-value for covariate " + str(pp + 1) + " is " +
              str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    print("Elapsed time is " + str(stop - start))

    # Step3. Save all the results
    gpvals_file_name = os.path.join(output_dir, "global_pvalue.txt")
    np.savetxt(gpvals_file_name, gpvals)

    lpvals_fdr_file_name = os.path.join(output_dir, "local_pvalue_fdr.txt")
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)

    clu_pvals_file_name = os.path.join(output_dir, "cluster_pvalue.txt")
    np.savetxt(clu_pvals_file_name, clu_pvals)
def deg(tested_gene_file_name,
        total_gene_file_name,
        gene_expression_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        tested_gene_list_path=None,
        total_gene_list_path=None,
        gene_expression_path=None,
        phenotype_path=None,
        gene_filter_path=None,
        groups=None,
        groups_name=None):
    """
    Test every gene for a difference in mean expression between two groups.

    The expression profile is loaded with
    ``load_expression_profile_by_labelling`` and only the first two returned
    groups are used. Each group table is transposed; row 0 of the transposed
    table is skipped and column 0 of every remaining row is used as the gene
    id, with the rest of the row parsed as floats. Per gene, a two-sample
    t-test is run, the p-values are sorted in ascending order and corrected
    with ``fdrcorrection0`` (alpha=0.05, method 'indep').

    Two files are written under ``constants.OUTPUT_GLOBAL_DIR/deg``: a
    timestamped tab-separated text dump and ``deg_<CANCER_TYPE>.tsv``.

    Args:
        tested_gene_file_name: only echoed in the progress message.
        total_gene_file_name: gene list forwarded to the loader.
        gene_expression_file_name: forwarded to the loader.
        phenotype_file_name: forwarded to the loader.
        gene_filter_file_name: forwarded to the loader.
        tested_gene_list_path: unused by this function.
        total_gene_list_path: forwarded to the loader as ``tested_gene_path``.
        gene_expression_path: forwarded to the loader.
        phenotype_path: forwarded to the loader.
        gene_filter_path: forwarded to the loader.
        groups: forwarded to the loader.
        groups_name: embedded in the name of the text dump.

    Returns:
        pandas.DataFrame: indexed by gene id with columns ``direction``,
        ``foldchange``, ``mean_differences``, ``pval`` and ``qval``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("about ot analyse: {}".format(tested_gene_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    groups_results = load_expression_profile_by_labelling(
        gene_list_file_name=total_gene_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        tested_gene_path=total_gene_list_path,
        gene_expression_path=gene_expression_path,
        phenotype_path=phenotype_path,
        gene_filter_path=gene_filter_path,
        groups=groups)
    print("total # of groups; {}".format(len(groups_results)))

    group_0_expression = np.array(groups_results[0]).T
    group_1_expression = np.array(groups_results[1]).T
    print("# patient in groups 1: {}. # of patients in groups #2: {}".format(
        group_0_expression.shape[1], group_1_expression.shape[1]))

    pvals = []
    for i in range(1, len(group_0_expression)):
        # Parse each row once instead of once per statistic.
        values_0 = [float(c) for c in group_0_expression[i][1:]]
        values_1 = [float(c) for c in group_1_expression[i][1:]]
        mean_0 = np.average(values_0)
        mean_1 = np.average(values_1)

        mean_differences = mean_0 - mean_1
        # Means are clamped to at least 1 before the ratio is taken.
        mean_foldchange = max(mean_0, 1) / max(mean_1, 1)
        cur_pval = scipy.stats.ttest_ind(values_0, values_1)[1]

        # Genes whose test yields NaN are left out of the correction.
        if math.isnan(cur_pval):
            continue

        direction = None
        if mean_differences > 0:
            direction = "downregulated"
        if mean_differences < 0:
            direction = "upregulated"
        pvals.append((group_0_expression[i][0], direction, mean_differences,
                      cur_pval, mean_foldchange))

    pvals.sort(key=lambda x: x[3])
    rejected, qvals = fdrcorrection0([x[3] for x in pvals],
                                     alpha=0.05,
                                     method='indep',
                                     is_sorted=False)
    # Tuple layout from here on:
    # (id, direction, mean_differences, pval, qval, foldchange)
    pvals = [(rec[0], rec[1], rec[2], rec[3], qvals[i], rec[4])
             for i, rec in enumerate(pvals)]

    true_counter = int(np.sum(rejected))
    print("true hypothesis: {}/{}".format(true_counter, np.size(rejected)))

    deg_dir = os.path.join(constants.OUTPUT_GLOBAL_DIR, "deg")

    # One construction instead of a row-by-row DataFrame.append, which is
    # quadratic and no longer exists in pandas 2.x. Columns are then put in
    # alphabetical order, which is what the row-wise append produced.
    df_deg = pd.DataFrame(
        pvals,
        columns=["id", "direction", "mean_differences", "pval", "qval",
                 "foldchange"])
    df_deg = df_deg[["direction", "foldchange", "id", "mean_differences",
                     "pval", "qval"]].set_index("id")
    # Format only the file name so braces in the directory cannot break it.
    df_deg.to_csv(os.path.join(deg_dir,
                               "deg_{}.tsv".format(constants.CANCER_TYPE)),
                  sep='\t')

    output = "".join("{}\t{}\t{}\t{}\t{}\t{}\n".format(*rec) for rec in pvals)
    txt_path = os.path.join(
        deg_dir, "deg_{}_{}_{}.txt".format(constants.CANCER_TYPE, groups_name,
                                           time.time()))
    # open() replaces the Python-2-only file() builtin.
    with open(txt_path, "w+") as f:
        f.write(output)
    print("pval saved to file")

    return df_deg
def deg(tested_gene_file_name,
        total_gene_file_name,
        gene_expression_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        tested_gene_list_path=None,
        total_gene_list_path=None,
        gene_expression_path=None,
        phenotype_path=None,
        gene_filter_path=None,
        groups=None,
        groups_name=None):
    """
    Test every gene for a difference in mean expression between two groups.

    The expression profile is loaded with
    ``load_expression_profile_by_labelling`` and only the first two returned
    groups are used. After re-orientation, row 0 of each table is skipped and
    column 0 of every remaining row is used as the gene id, with the rest of
    the row parsed as floats. Per gene, a two-sample t-test is run; results
    are sorted by direction and then by p-value, and corrected with
    ``fdrcorrection0`` (alpha=0.05, method 'indep').

    The result is written as a timestamped tab-separated text file under
    ``constants.OUTPUT_DIR``; nothing is returned.

    Args:
        tested_gene_file_name: only echoed in the progress message.
        total_gene_file_name: gene list forwarded to the loader.
        gene_expression_file_name: forwarded to the loader.
        phenotype_file_name: forwarded to the loader.
        gene_filter_file_name: forwarded to the loader.
        tested_gene_list_path: unused by this function.
        total_gene_list_path: forwarded to the loader as ``tested_gene_path``.
        gene_expression_path: forwarded to the loader.
        phenotype_path: forwarded to the loader.
        gene_filter_path: forwarded to the loader.
        groups: forwarded to the loader.
        groups_name: embedded in the name of the output file.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("about ot analyse: {}".format(tested_gene_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    groups_results = load_expression_profile_by_labelling(
        gene_list_file_name=total_gene_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        tested_gene_path=total_gene_list_path,
        gene_expression_path=gene_expression_path,
        phenotype_path=phenotype_path,
        gene_filter_path=gene_filter_path,
        groups=groups)

    # NOTE(review): for 2-D input this flip + rotation appears to amount to a
    # plain transpose; left untouched - confirm before simplifying.
    group_0_expression = np.rot90(np.flip(groups_results[0], 1),
                                  k=-1,
                                  axes=(1, 0))
    group_1_expression = np.rot90(np.flip(groups_results[1], 1),
                                  k=-1,
                                  axes=(1, 0))

    # test pval for significance differentiation between label values
    pvals = []
    for i in range(1, len(group_0_expression)):
        # Parse each row once instead of once per statistic.
        values_0 = [float(c) for c in group_0_expression[i][1:]]
        values_1 = [float(c) for c in group_1_expression[i][1:]]
        mean_0 = np.average(values_0)
        mean_1 = np.average(values_1)

        mean_differences = mean_0 - mean_1
        # Means are clamped to at least 1 before the ratio is taken.
        mean_foldchange = max(mean_0, 1) / max(mean_1, 1)
        cur_pval = scipy.stats.ttest_ind(values_0, values_1)[1]

        # Genes whose test yields NaN are left out of the correction.
        if math.isnan(cur_pval):
            continue

        direction = None
        if mean_differences > 0:
            direction = "downregulated"
        if mean_differences < 0:
            direction = "upregulated"
        pvals.append((group_0_expression[i][0], direction, mean_differences,
                      cur_pval, mean_foldchange))

    # Sort by (direction, pval). ``direction`` may be None, which Python 3
    # refuses to compare with str; this key keeps None first, as Python 2 did.
    pvals.sort(key=lambda x: (x[1] is not None, x[1] or "", x[3]))

    rejected, qvals = fdrcorrection0([x[3] for x in pvals],
                                     alpha=0.05,
                                     method='indep',
                                     is_sorted=False)
    # Tuple layout from here on:
    # (id, direction, mean_differences, pval, qval, foldchange)
    pvals = [(rec[0], rec[1], rec[2], rec[3], qvals[i], rec[4])
             for i, rec in enumerate(pvals)]

    true_counter = int(np.sum(rejected))
    print("true hypothesis: {}/{}".format(true_counter, np.size(rejected)))

    output = "".join("{}\t{}\t{}\t{}\t{}\t{}\n".format(*rec) for rec in pvals)
    txt_path = os.path.join(
        constants.OUTPUT_DIR,
        "deg_{}_{}_{}.txt".format(constants.CANCER_TYPE, groups_name,
                                  time.time()))
    # open() replaces the Python-2-only file() builtin.
    with open(txt_path, "w+") as f:
        f.write(output)
    print("pval saved to file")
(t, p) = ttest_ind(vfdb_parameters['scores'], control_parameters['scores']) pvalue = p / 2 #one tailed test_pvalue[hg_id] = vfdb_parameters test_pvalue[hg_id]['pvalue'] = pvalue if t < 0: false_positives.append(hg_id) tmp_pvalue = [] sorted_hg_ids = [] for hg_id in test_pvalue: tmp_pvalue.append(test_pvalue[hg_id]['pvalue']) sorted_hg_ids.append(hg_id) test_pvalue[hg_id].pop('pvalue') corrected_pvalue = fdrcorrection0(tmp_pvalue)[1] for position in range(len(corrected_pvalue)): hg_id = sorted_hg_ids[position] if corrected_pvalue[position] <= 0.05 and hg_id not in false_positives: true_positives[hg_id] = test_pvalue[hg_id] elif corrected_pvalue[position] > 0.05 and hg_id not in false_positives: false_positives.append(hg_id) out = open('true_positives.pkl', 'wb') dump(true_positives, out) out.close() agreement = [] groups_description = {} descriptions = set() for hg_id in true_positives:
def fdr_correction_and_viz(Pvals_path, Tvals_path, C1_path, C2_path,
                           mask_path, save_destination, affine, header,
                           combination):
    """
    FDR-correct per-ROI p-value volumes and save the resulting maps.

    For every index along the 4th axis of the p-value array, the p-values at
    the non-zero voxels of the mask are corrected with ``fdrcorrection0``
    (alpha = 0.05), signed ``-log10`` maps are derived, and 14 summary
    statistics are collected through ``count_voxel_stats``.

    Outputs:
        * ``roi_voxel_stats_<combination>.csv`` written to the current
          working directory;
        * ``Pvals``, ``Tvals``, ``Qvals``, ``C1MinusC2``, ``map_logp`` and
          ``map_logq`` NIfTI images written to
          ``<save_destination>/<combination>``.

    Args:
        Pvals_path: ``.npy`` file with the p-values (indexed as a 4-D array).
        Tvals_path: ``.npy`` file with the t-values (only re-saved as NIfTI).
        C1_path: ``.npy`` file for the first condition.
        C2_path: ``.npy`` file for the second condition.
        mask_path: image readable by nibabel; non-zero voxels are analysed.
        save_destination: parent directory for the output folder.
        affine: affine passed to every ``nib.Nifti1Image``.
        header: header passed to every ``nib.Nifti1Image``.
        combination: label used for the output folder and the CSV name.
    """
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0

    alpha = 0.05

    Pvals = np.load(Pvals_path)
    Tvals = np.load(Tvals_path)
    C1 = np.load(C1_path)
    C2 = np.load(C2_path)

    # ``get_data()`` is deprecated in nibabel and removed in 5.0; reading the
    # array proxy is the documented equivalent and works on old releases too.
    mask = np.asanyarray(nib.load(mask_path).dataobj)
    brain_indices = np.where(mask != 0)

    Pvals_shape = Pvals.shape
    Qvals = np.zeros(Pvals_shape)

    map_C1MinusC2 = C1 - C2

    # sign(c1-c2) * -1 * log10(p)
    map_logp = np.multiply(np.sign(map_C1MinusC2), (-1 * np.log10(Pvals)))

    # 14 statistical attributes per ROI
    roi_voxel_stats_matrix = np.zeros((Pvals_shape[3], 14))

    for roi in range(Pvals_shape[3]):
        print('Computing Stats for ROI: ', roi)

        pvals_list = Pvals[:, :, :, roi][brain_indices]
        _, qvals_list = fdrcorrection0(pvals_list, alpha)

        map_logp_list = map_logp[:, :, :, roi][brain_indices]
        map_C1MinusC2_list = map_C1MinusC2[:, :, :, roi][brain_indices]

        # The slice is a view, so this writes into Qvals itself; voxels
        # outside the mask keep their initial value of 0.
        Qvals[:, :, :, roi][brain_indices] = qvals_list

        map_logq_list = np.multiply(np.sign(map_C1MinusC2_list),
                                    (-1 * np.log10(qvals_list)))

        roi_voxel_stats_matrix[roi, :] = count_voxel_stats(
            pvals_list, qvals_list, map_logp_list, map_logq_list)

    # sign(c1-c2) * -1 * log10(q)
    map_logq = np.multiply(np.sign(map_C1MinusC2), (-1 * np.log10(Qvals)))

    save_destination_new = opj(save_destination, combination)
    if not os.path.exists(save_destination_new):
        # makedirs also creates missing parent directories.
        os.makedirs(save_destination_new)

    print('Saving Files in directory: ', save_destination_new)

    print('Saving Stats CSV : ', )
    # NOTE(review): the CSV goes to the current working directory, not to
    # save_destination_new - kept as is; confirm whether that is intended.
    csv_name = 'roi_voxel_stats_' + combination + '.csv'
    np.savetxt(
        csv_name,
        roi_voxel_stats_matrix,
        delimiter=',',
        header=
        'min_pval,min_qval,p_lt_point_1,p_lt_point_01, p_lt_point_05, q_lt_point_1, q_lt_point_01,q_lt_point_05, logq_gt_1point3, logq_gt_1 ,logq_gt_2 ,logp_gt_1point3, logp_gt_1, logp_gt_2'
    )

    # Every volume is wrapped with the same affine/header and saved under its
    # own name inside the output folder.
    volumes = (
        ('Pvals', Pvals),
        ('Tvals', Tvals),
        ('Qvals', Qvals),
        ('C1MinusC2', map_C1MinusC2),
        ('map_logp', map_logp),
        ('map_logq', map_logq),
    )
    for stem, volume in volumes:
        file_name = stem + '.nii.gz'
        print('Saving ' + file_name)
        image = nib.Nifti1Image(volume, affine=affine, header=header)
        nib.save(image, opj(save_destination_new, file_name))
def parallel_positive_selection_test(self,
                                     in_dir,
                                     tree_file,
                                     out_dir,
                                     results_file,
                                     seq_type="codons",
                                     codon_frequency="F3X4",
                                     noisy=3,
                                     verbose="concise",
                                     runmode=0,
                                     clock=0,
                                     aminoacid_distance=None,
                                     genetic_code=0,
                                     fix_kappa=False,
                                     kappa=5,
                                     getSE=0,
                                     RateAncestor=0,
                                     small_difference=0.000001,
                                     clean_data=True,
                                     method=0):
    """
    Positive selection test (branch-site model) for the branch labeled in
    the tree file, comparing Model_A against Model_A_null (omega fixed to 1).

    For every alignment found in ``in_dir`` a control file is generated per
    model, all runs are executed through ``self.parallel_execute``, the
    log-likelihoods are read back from the reports, and a tab-separated
    table with the doubled LnL difference, the raw p-value (1 degree of
    freedom) and the ``fdrcorrection0``-adjusted p-value is written to
    ``results_file``. Alignments lacking an LnL for either model are
    reported on stdout and left out of the table.
    """
    # (model name, whether omega is fixed), in processing order
    model_settings = (("Model_A", False), ("Model_A_null", True))
    model_names = [name for name, _ in model_settings]

    FileRoutines.safe_mkdir(out_dir)
    alignment_paths = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_path = os.path.abspath(tree_file)

    ctl_names = []
    run_dirs = []
    gene_names = []

    for alignment_path in alignment_paths:
        _, basename, _ = FileRoutines.split_filename(alignment_path)
        gene_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        gene_names.append(basename)
        FileRoutines.safe_mkdir(gene_dir)

        for model, omega_is_fixed in model_settings:
            model_dir = "%s/%s/" % (gene_dir, model)
            FileRoutines.safe_mkdir(model_dir)
            out_file = "%s/%s/%s.out" % (gene_dir, model, basename)
            ctl_file = "%s/%s/%s.ctl" % (gene_dir, model, basename)

            # The control file is referenced by bare name together with the
            # directory it lives in.
            ctl_names.append("%s.ctl" % basename)
            run_dirs.append(model_dir)

            self.generate_ctl_file(os.path.abspath(alignment_path),
                                   tree_path,
                                   out_file,
                                   ctl_file,
                                   seq_type=seq_type,
                                   codon_frequency=codon_frequency,
                                   noisy=noisy,
                                   verbose=verbose,
                                   runmode=runmode,
                                   clock=clock,
                                   aminoacid_distance=aminoacid_distance,
                                   model=2,
                                   nssites=2,
                                   genetic_code=genetic_code,
                                   fix_kappa=fix_kappa,
                                   kappa=kappa,
                                   fix_omega=omega_is_fixed,
                                   omega=1,
                                   getSE=getSE,
                                   RateAncestor=RateAncestor,
                                   Mgene=0,
                                   small_difference=small_difference,
                                   clean_data=clean_data,
                                   method=method)

    self.parallel_execute(ctl_names, dir_list=run_dirs)

    # Collect the log-likelihood of every (gene, model) run.
    lnl_by_gene = OrderedDict()
    for basename in gene_names:
        lnl_by_gene[basename] = OrderedDict(
            (model,
             CodeMLReport("%s/%s/%s/%s.out" %
                          (out_dir, basename, model, basename)).LnL)
            for model in model_names)

    def _lnl_missing(gene):
        """Report and return True when any model of ``gene`` has no LnL."""
        if any(lnl_by_gene[gene][model] is None for model in model_names):
            print("LnL was not calculated for %s" % gene)
            return True
        return False

    double_delta_dict = OrderedDict()
    raw_pvalues_dict = OrderedDict()
    raw_pvalues_list = []
    skipped_genes_set = set()

    for basename in gene_names:
        if _lnl_missing(basename):
            skipped_genes_set.add(basename)
            continue
        lnl = lnl_by_gene[basename]
        doubled_delta = 2 * (lnl["Model_A"] - lnl["Model_A_null"])
        p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

        double_delta_dict[basename] = doubled_delta
        raw_pvalues_dict[basename] = p_value
        raw_pvalues_list.append(p_value)

    adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
    # Adjusted values come back in the order the raw ones were appended,
    # i.e. the order of the genes that were not skipped.
    next_adjusted = iter(adjusted_pvalues_list)

    with open(results_file, "w") as out_fd:
        out_fd.write(
            "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
        )
        for basename in gene_names:
            # The check runs (and reports) again for this second pass.
            if _lnl_missing(basename):
                continue
            out_fd.write("%s\t%f\t%f\t%f\t%f\t%f\n" %
                         (basename, lnl_by_gene[basename]["Model_A_null"],
                          lnl_by_gene[basename]["Model_A"],
                          double_delta_dict[basename],
                          raw_pvalues_dict[basename], next(next_adjusted)))
def calculate_sig(algo_sample=None,
                  dataset_sample=None,
                  n_dist_samples=300,
                  n_total_samples=None,
                  n_start_i=None,
                  limit=10000,
                  md_path=None,
                  dist_path=None,
                  filtered_go_ids_file="",
                  hg_th=0.0001):
    """
    Derive the hypergeometric and empirical significance cutoffs for GO terms.

    Steps:
        1. Read the GO ids to consider (plus ``constants.ROOT_GO_ID``) and
           load the metadata table restricted to them.
        2. Convert ``hg_pval_max`` (stored as -log10) to p-values, pad with
           ones up to the number of ids, FDR-correct at ``hg_th`` and take
           the largest rejected p-value as ``HG_CUTOFF``.
        3. For the terms passing that cutoff, draw ``n_dist_samples`` random
           positions of the stored background distribution and compute an
           empirical p-value per term with ``calc_emp_pval``.
        4. FDR-correct the per-term minimum empirical p-values at 0.05 to
           obtain ``EMP_TH`` and the mask of terms passing it.

    Args:
        algo_sample: second value substituted into ``md_path``/``dist_path``.
        dataset_sample: first value substituted into ``md_path``/``dist_path``.
        n_dist_samples: number of background samples drawn per term.
        n_total_samples: size of the stored background distribution; read
            from the first row of the distribution table when None.
        n_start_i: unused.
        limit: maximal number of terms processed.
        md_path: format string of the metadata table path.
        dist_path: format string of the distribution table path.
        filtered_go_ids_file: whitespace-separated list of GO ids.
        hg_th: FDR level applied to the hypergeometric p-values.

    Returns:
        None when the metadata table cannot be loaded, otherwise the tuple
        ``(EMP_TH, n_emp_true, HG_CUTOFF, n_hg_true, go_ids_result,
        go_names_result, mask_ids, mask_terms, emp_pvals_mat)``.
    """
    # Context manager so the handle is closed (it was previously leaked).
    with open(filtered_go_ids_file, 'r') as go_ids_fd:
        filtered_go_ids = go_ids_fd.read().split() + [constants.ROOT_GO_ID]

    try:
        output_md = pd.read_csv(md_path.format(dataset_sample, algo_sample),
                                sep='\t',
                                index_col=0).reindex(filtered_go_ids).dropna()
    except Exception:
        # Any failure to load the metadata is reported to the caller as None.
        return None

    # hg_pval_max is stored as -log10(p); ids without a row count as p = 1.
    max_genes_pvals = np.power(10, -output_md.loc[:, "hg_pval_max"])
    print("total n_genes with pval less than one: {}/{}".format(
        np.size(max_genes_pvals), len(filtered_go_ids)))
    max_genes_pvals = np.append(
        max_genes_pvals,
        np.ones(len(filtered_go_ids) - np.size(max_genes_pvals)))

    fdr_results = fdrcorrection0(max_genes_pvals,
                                 alpha=hg_th,
                                 method='indep',
                                 is_sorted=False)
    n_hg_true = int(np.sum(fdr_results[0]))
    # Largest rejected p-value, or the sentinel -1 when nothing is rejected.
    HG_CUTOFF = (np.sort(max_genes_pvals)[n_hg_true - 1]
                 if n_hg_true > 0 else -1)
    print("HG cutoff: {}, (ES={}, n={})".format(HG_CUTOFF,
                                                -np.log10(HG_CUTOFF),
                                                n_hg_true))

    output_md = output_md.loc[
        output_md.loc[:, "hg_pval_max"].values >= -np.log10(HG_CUTOFF), :]

    print(dist_path.format(dataset_sample, algo_sample))
    output = pd.read_csv(dist_path.format(dataset_sample, algo_sample),
                         sep='\t',
                         index_col=0).dropna()
    output = output.loc[output_md.index.values, :]

    # "dist_n_samples" holds a bracketed, ", "-separated list of numbers.
    if n_total_samples is None:
        n_total_samples = len(
            output.iloc[0].loc["dist_n_samples"][1:-1].split(", "))

    np.random.seed(int(random.random() * 1000))
    i_choice = np.random.choice(n_total_samples,
                                n_dist_samples,
                                replace=False)
    i_dist = i_choice[:n_dist_samples]

    emp_pvals = []
    for counter, (index, cur) in enumerate(output.iterrows()):
        if counter == limit:
            break
        background = np.array(
            [float(x) for x in cur["dist_n_samples"][1:-1].split(", ")])[i_dist]
        emp_pvals.append(calc_emp_pval(cur["hg_pval"], background))
        output_md.loc[index, 'emp_pval'] = emp_pvals[-1]

    mask_ids = output.index.values

    # An empirical p-value is either a single number or a bracketed string
    # holding one value per module.
    emp_pvals_mat = [
        np.array(x[1:-1].split(", ")).astype(np.float32)
        if isinstance(x, str) else np.array([x]) for x in emp_pvals
    ]

    n_modules = 0
    if len(emp_pvals) != 0:
        n_modules = emp_pvals_mat[0].shape[0]

    # Built directly instead of folding np.append over the list, which
    # re-allocates the accumulator at every step.
    max_emp_original_pvals = np.array([np.min(x) for x in emp_pvals_mat],
                                      dtype=float)
    if emp_pvals_mat:
        emp_pvals = np.concatenate([np.ravel(x)
                                    for x in emp_pvals_mat]).astype(float)
    else:
        emp_pvals = np.array([])

    # An empirical p-value of 0 is replaced by 1 / n_dist_samples.
    max_emp_pvals = np.sort(
        np.where(max_emp_original_pvals == 0, 1.0 / n_dist_samples,
                 max_emp_original_pvals))

    print("max emp pvals len: {}".format(len(max_emp_pvals)))
    print("min vals", 1.0 / n_dist_samples,
          np.min(list(max_emp_pvals) + [1]))
    print("max_genes_pvals: {}".format(max_genes_pvals.shape[0]))

    fdr_bh_results = fdrcorrection0(max_emp_pvals,
                                    alpha=0.05,
                                    method='indep',
                                    is_sorted=False)[0]
    n_emp_true = np.sum(fdr_bh_results)
    print("n_emp_true: {}".format(n_emp_true))

    # max_emp_pvals is already sorted: the cutoff is the largest rejected
    # value, or the sentinel -1 when nothing is rejected.
    EMP_TH = max_emp_pvals[n_emp_true - 1] if n_emp_true > 0 else -1

    mask_terms = np.array(
        [max(a, 1.0 / n_dist_samples) <= EMP_TH for a in emp_pvals])
    go_ids_result = np.array([])
    go_names_result = np.array([])
    n_emp_true_in_modules = 0
    if len(mask_terms) > 0:
        # One row per term, one column per module.
        mask_terms = np.array(mask_terms).reshape(-1, n_modules)
        passed = mask_terms.any(axis=1)
        go_ids_result = output.index.values[passed]
        go_names_result = output["GO name"].values[passed]
        n_emp_true_in_modules = np.sum(mask_terms)

    print("EMP cutoff: {}. # true terms passed EMP cutoff: {}".format(
        EMP_TH, n_emp_true))
    print("# true terms passed EMP cutoff across modules: {}".format(
        n_emp_true_in_modules))
    print("EHR :{}".format(n_emp_true / float(n_hg_true)))

    return (EMP_TH, n_emp_true, HG_CUTOFF, n_hg_true, go_ids_result,
            go_names_result, mask_ids, mask_terms, emp_pvals_mat)
else: (t, p) = ttest_ind(vfdb_parameters['scores'], control_parameters['scores']) pvalue = p/2 #one tailed test_pvalue[hg_id] = vfdb_parameters test_pvalue[hg_id]['pvalue']= pvalue if t < 0: false_positives.append(hg_id) tmp_pvalue = [] sorted_hg_ids = [] for hg_id in test_pvalue: tmp_pvalue.append(test_pvalue[hg_id]['pvalue']) sorted_hg_ids.append(hg_id) test_pvalue[hg_id].pop('pvalue') corrected_pvalue= fdrcorrection0(tmp_pvalue)[1] for position in range(len(corrected_pvalue)): hg_id = sorted_hg_ids[position] if corrected_pvalue[position] <= 0.05 and hg_id not in false_positives: true_positives[hg_id] = test_pvalue[hg_id] elif corrected_pvalue[position] > 0.05 and hg_id not in false_positives: false_positives.append(hg_id) out = open('true_positives.pkl', 'wb') dump(true_positives, out) out.close() agreement = [] groups_description = {} descriptions = set() for hg_id in true_positives:
def main(dataset="SOC",
         algo="jactivemodules_sa",
         n_permutations=300,
         csv_file_name=os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr",
                                    "MAX/emp_diff_{dataset}_{algo}.tsv")):
    """
    Filter GO terms by hypergeometric score, then by empirical p-value.

    The table at ``csv_file_name`` is annotated with the gene count and depth
    of every term. Terms with more than 5 and fewer than 500 genes enter an
    FDR correction (alpha 0.05) of their hypergeometric p-values; the terms
    passing it get an empirical p-value and a mean difference, and are
    corrected a second time. The annotated table is written next to the
    input with an ``_md.tsv`` suffix.

    Args:
        dataset: dataset name; selects the ``GE_<dataset>`` folder and is
            substituted into ``csv_file_name``.
        algo: algorithm name substituted into ``csv_file_name``.
        n_permutations: forwarded to ``calc_empirical_pval``.
        csv_file_name: format string of the input table path.

    Returns:
        None when the input table cannot be read, otherwise
        ``(number of terms passing the HG filter, number of empirically
        significant terms, HG_CUTOFF, emp_cutoff)``.
    """
    # Total number of terms assumed by the HG correction; terms outside the
    # size filter are padded in with a score of 0 (p = 1).
    n_total_terms = 7435

    data_dir = os.path.join(constants.DATASETS_DIR, "GE_{}".format(dataset),
                            "data")
    dataset_data = pd.read_csv(os.path.join(data_dir, "ge.tsv"),
                               sep='\t',
                               index_col=0)
    # open() in a context manager replaces the Python-2-only file() builtin
    # and closes the handle; plain int replaces the removed np.int alias.
    with open(os.path.join(data_dir, "classes.tsv")) as classes_fd:
        classes_data = np.array(
            classes_fd.readlines()[0].strip().split("\t")).astype(int)

    csv_file_name = csv_file_name.format(dataset=dataset, algo=algo)
    try:
        df = pd.read_csv(csv_file_name, sep='\t', index_col=0)
    except Exception:
        # A missing or unreadable table is reported as None; unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit through.
        return None
    df = df.dropna()

    go_ids = df.index.values
    n_genes = [
        len(
            get_all_genes_for_term(vertices, cur_go_id, cur_go_id,
                                   cur_go_id == cur_go_id))
        for cur_go_id in go_ids
    ]
    # next(iter(...)) picks the first value on both Python 2 and Python 3.
    depth = [
        next(iter(dict_result.values()))['vertices'][cur_go_id]['D']
        for cur_go_id in go_ids
    ]
    df["n_genes"] = pd.Series(n_genes, index=df.index)
    df["depth"] = pd.Series(depth, index=df.index)
    df = df.rename(columns={"filtered_pval": "hg_pval"})

    size_ok = np.logical_and(df["n_genes"].values > 5,
                             df["n_genes"].values < 500)

    # hg_pval is on a -log10 scale; it is converted back to p-values below.
    n_genes_pvals = df.loc[size_ok, "hg_pval"].values
    print("total n_genes with pval:{}/{}".format(np.size(n_genes_pvals),
                                                 n_total_terms))
    n_genes_pvals = np.append(
        n_genes_pvals, np.zeros(n_total_terms - np.size(n_genes_pvals)))
    n_genes_pvals = [10**(-x) for x in n_genes_pvals]

    fdr_results = fdrcorrection0(n_genes_pvals,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    true_counter = int(np.sum(fdr_results[0]))
    # Score (-log10 p) of the largest rejected p-value, or 0 if none.
    HG_CUTOFF = -np.log10(np.sort(n_genes_pvals))[
        true_counter - 1] if true_counter > 0 else 0
    print("cutoff: {}".format(HG_CUTOFF))

    passed_hg = np.logical_and(size_ok, df["hg_pval"].values >= HG_CUTOFF)
    # Copy so the columns added below land in an independent frame rather
    # than in a slice of ``df``.
    df_filtered_in = df.loc[passed_hg, :].copy()
    df_filtered_out = df.loc[~passed_hg, :]

    df_filtered_in["index"] = df_filtered_in.index.values
    df_filtered_in["emp_pval"] = df_filtered_in.apply(
        lambda row: calc_empirical_pval(row, n_permutations), axis=1)
    df_filtered_in["mean_difference"] = df_filtered_in.apply(
        lambda x: mean_difference(x, dataset_data, classes_data), axis=1)

    pvals_corrected = df_filtered_in["emp_pval"].values
    fdr_results = fdrcorrection0(pvals_corrected,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    true_counter = int(np.sum(fdr_results[0]))
    # Largest rejected empirical p-value, or 0 if none.
    emp_cutoff = np.sort(pvals_corrected)[
        true_counter - 1] if true_counter > 0 else 0
    print("emp true hypothesis: {} (emp cutoff: {}, n={})".format(
        true_counter, emp_cutoff, len(fdr_results[0])))

    df_filtered_in["passed_fdr"] = df_filtered_in["emp_pval"].apply(
        lambda x: x <= emp_cutoff)
    df_filtered_in["emp_rank"] = df_filtered_in["emp_pval"].rank(
        ascending=True)
    df_filtered_in["hg_rank"] = df_filtered_in["hg_pval"].rank(
        ascending=False)
    df_filtered_in = df_filtered_in.sort_values(by=["emp_rank", "hg_rank"])

    df_all = pd.concat((df_filtered_in, df_filtered_out), axis=0)
    df_all.loc[df_all["hg_pval"].values > 0, :][[
        "GO name", "hg_pval", "emp_pval", "hg_rank", "emp_rank", "n_genes",
        "depth", "mean_difference", "passed_fdr"
    ]].to_csv(csv_file_name[:-4] + "_md.tsv", sep='\t')

    return len(df_filtered_in.index), true_counter, HG_CUTOFF, emp_cutoff