def test_multiple_test_correction(self):
    """Check multiple_test_correction with its default method, with
    method="negcorr", and with an unknown method name.

    For the two valid calls both returned values are compared against
    hard-coded expectations: the rejection mask exactly, the corrected
    p-values with np.allclose. The unknown method must raise ValueError.
    """
    # --- default method ---
    raw = [0.001, 0.005, 0.1, 0.5, 0.01]
    expected_mask = np.array([True, True, False, False, True])
    expected_adjusted = np.array([0.005, 0.0125, 0.125, 0.5, 0.01666667])

    mask, adjusted = multiple_test_correction(raw)

    mask_ok = np.array_equal(mask, expected_mask)
    self.assertTrue(mask_ok,
                    msg="multiple_test_correction returns wrong list of "
                        "rejected hypotheses")
    adjusted_ok = np.allclose(adjusted, expected_adjusted)
    self.assertTrue(adjusted_ok,
                    msg="multiple_test_correction returns wrong list of"
                        " corrected pvalues")

    # --- method="negcorr" ---
    raw = [0.01, 0.05, 0.1, 0.5]
    expected_mask = np.array([False, False, False, False])
    expected_adjusted = np.array([0.08333333, 0.20833333, 0.27777777, 1])

    mask, adjusted = multiple_test_correction(raw, method="negcorr")

    mask_ok = np.array_equal(mask, expected_mask)
    self.assertTrue(mask_ok,
                    msg="multiple_test_correction(negcorr) returns wrong"
                        " list of rejected hypotheses")
    adjusted_ok = np.allclose(adjusted, expected_adjusted)
    self.assertTrue(adjusted_ok,
                    msg="multiple_test_correction(negcorr) returns wrong"
                        " list of corrected pvalues")

    # --- unknown method name is rejected ---
    with self.assertRaises(ValueError):
        multiple_test_correction(raw, method="some_method")
def main():
    """Command-line entry point: call peaks, write the signal track and the
    peak table, and optionally rewrite the peak table with corrected scores.

    Steps, in order:
      1. Parse the command line with ``parseArgs()``.
      2. Run ``call_peaks`` on the BAM file.
      3. Scale the returned coverage to reads-per-million and write it to
         ``<outfile>.bw``.
      4. Write the peaks to ``<outfile>_peaks.tsv``.
      5. If a correction method ("bh" or "by") was requested, read the peak
         table back, replace the score column with ``-log10`` of the
         corrected p-values, keep only rows above ``-log10(pvalue)``,
         overwrite the peak table and call ``write_counts``.

    Exits with status 1 when the correction method is not "bh", "by" or None.
    """
    # get command line arguments
    args = parseArgs()
    print(f"Using bam: {args.bam}")
    bam_file = args.bam
    print(f"Will write peaks: {args.outfile}_peaks.tsv")
    out_prefix = args.outfile
    control_file = args.controlfile
    chrom_sizes = args.chromsizes
    pvalue = args.pvalue
    minreads = args.minreads
    minsize = args.minsize
    binsize = args.binsize
    correction = args.correct_pval

    if correction not in ("bh", "by", None):
        print("Invalid correction method (please pass either 'bh' or 'by')",
              file=sys.stderr)
        sys.exit(1)

    res, _, cov2 = call_peaks(bam_file, chrom_sizes, pvalue, minreads,
                              binsize, cfile=control_file)

    # rpm norm the signal before writing to bigwig
    rpm_factor = 1e6 / float(cov2.reads)
    cov2.coverage = np.array(cov2.coverage, dtype='object') * rpm_factor
    write_bigwig(cov2, out_prefix + ".bw", chrom_sizes)

    # write peaks to file
    outbed = out_prefix + "_peaks.tsv"
    write_bed(res, outbed, minsize)

    if correction is not None:
        dat = pd.read_csv(
            outbed, sep="\t",
            names=["chr", "start", "end", "name", "score", "strand"])

        # "bh" / "by" are passed to multiple_test_correction as "p" / "n".
        # Kept in a separate name so the user's choice is not overwritten by
        # the array of corrected p-values.
        method_code = {"bh": "p", "by": "n"}[correction]
        _, adjusted = multiple_test_correction(dat.score, method=method_code)

        # The score and the cutoff must be on the same log scale: the cutoff
        # is -log10(pvalue), so the score is -log10 as well (the natural log
        # used previously let rows with adjusted p up to ~0.27 through a
        # 0.05 cutoff).
        dat["score"] = adjusted
        dat["score"] = -np.log10(dat["score"])
        significant = dat[dat.score > -np.log10(pvalue)]
        significant.to_csv(outbed, sep="\t", header=False, index=False)

        # NOTE(review): `dat` only exists when a correction was requested, so
        # write_counts is called from inside this branch, and it receives the
        # unfiltered table -- confirm both are intended.
        write_counts(out_prefix, dat)
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strand lag and p-value.

    Parameters
    ----------
    ratios : sequence of numbers
        Strand-lag ratios, one per entry; only read when ``singlestrand`` is
        false. An entry passes when the z-score of its ratio lies strictly
        between -2 and 2.
    pcutoff : float
        Significance level. Passed as ``alpha`` to multiple_test_correction,
        or compared as ``-log10(pcutoff)`` when no correction is done.
    pvalues : sequence of numbers
        Treated as -log10 p-values: they are turned back into p-values with
        ``10 ** -x`` before correction, or compared against
        ``-log10(pcutoff)`` without correction.
    output : sequence
        Entries the p-values belong to; must have the same length as
        ``pvalues``. Returned unchanged.
    no_correction : bool
        When false, the uncorrected p-values are first written under
        ``name + '-uncor'`` and then corrected.
    name : str
        Prefix handed to the output writers.
    singlestrand : bool
        When true the strand-lag filter is skipped.

    Returns
    -------
    tuple
        ``(output, pvalues, filter_pass)``. With correction, ``pvalues`` is
        the second value returned by multiple_test_correction (it is not
        converted back to -log10 here); without correction it is the input
        unchanged.
    """
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        within_two_sd = np.bitwise_and(zscore_ratios > -2, zscore_ratios < 2)
        ratios_pass = np.where(within_two_sd, True, False)

    if not no_correction:
        pv_pass = [True] * len(pvalues)
        # A real list, not map(): on Python 3 map() is a one-shot iterator,
        # so the second writer and the correction would see no values and
        # len(pvalues) below would fail.
        pvalues = [10 ** -x for x in pvalues]

        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)

        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)

    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass

    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)

    return output, pvalues, filter_pass
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strand lag and p-value.

    Parameters
    ----------
    ratios : sequence of numbers
        Strand-lag ratios, one per entry; only read when ``singlestrand`` is
        false. An entry passes when the z-score of its ratio lies strictly
        between -2 and 2.
    pcutoff : float
        Significance level. Passed as ``alpha`` to multiple_test_correction,
        or compared as ``-log10(pcutoff)`` when no correction is done.
    pvalues : sequence of numbers
        Treated as -log10 p-values: they are turned back into p-values with
        ``10 ** -x`` before correction, or compared against
        ``-log10(pcutoff)`` without correction.
    output : sequence
        Entries the p-values belong to; must have the same length as
        ``pvalues``. Returned unchanged.
    no_correction : bool
        When false, the uncorrected p-values are first written under
        ``name + '-uncor'`` and then corrected.
    name : str
        Prefix handed to the output writers.
    singlestrand : bool
        When true the strand-lag filter is skipped.

    Returns
    -------
    tuple
        ``(output, pvalues, filter_pass)``. With correction, ``pvalues`` is
        the second value returned by multiple_test_correction (it is not
        converted back to -log10 here); without correction it is the input
        unchanged.
    """
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        inside_band = np.bitwise_and(zscore_ratios > -2, zscore_ratios < 2)
        ratios_pass = np.where(inside_band, True, False)

    if no_correction:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)
    else:
        pv_pass = [True] * len(pvalues)
        # Materialise the converted values: on Python 3 map() yields a
        # one-shot iterator, which the first writer would exhaust and which
        # has no len() for the assertions below.
        pvalues = [10 ** -x for x in pvalues]

        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)

        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)

    if singlestrand:
        filter_pass = pv_pass
    else:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)

    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)

    return output, pvalues, filter_pass
def multiple_correction(dic):
    """Replace the p-values in ``dic`` by corrected p-values, in place.

    ``dic`` is laid out as ``dic[ty][r][q] = p``. For every ``ty`` all
    non-string values are collected, passed together to
    ``multiple_test_correction(..., alpha=0.05, method='indep')`` and written
    back to their original positions. String entries are left untouched.

    A ``ty`` is skipped when it has no rows, when it holds a single row whose
    first row has a single column (nothing to correct for), or when it
    contains no non-string values.

    Returns None; the result is the modification of ``dic``.
    """
    for ty in dic:
        rows = dic[ty]
        if not rows:
            continue

        rn = len(rows)
        # next(iter(...)) instead of .values()[0]: dict views cannot be
        # indexed on Python 3.
        qn = len(next(iter(rows.values())))
        if rn == 1 and qn == 1:
            # Only this type is skipped; a `return` here would silently leave
            # every remaining type uncorrected.
            continue

        # get all p values from the dictionary
        all_p = []
        cue = {}  # (r, q) -> position of that value in all_p
        for r in rows:
            for q, value in rows[r].items():
                if isinstance(value, str):
                    continue
                cue[(r, q)] = len(all_p)
                all_p.append(value)

        if not all_p:
            continue

        # correction
        reject, pvals_corrected = multiple_test_correction(all_p, alpha=0.05, method='indep')

        # modify all p values; only collected positions are in `cue`, so
        # string entries stay as they are
        for (r, q), position in cue.items():
            rows[r][q] = pvals_corrected[position]
def test_multiple_test_correction_using_R(self):
    """Compare the corrected p-values from multiple_test_correction (default
    arguments) against reference values for three p-value lists.

    Only the second returned value (the corrected p-values) is checked, with
    np.allclose.
    """
    pvalues_list = [[0.001, 0.005, 0.1, 0.5, 0.01],
                    [0.03, 0.8, 0.47, 0.1],
                    [0.0003, 0.4, 0.002]]

    # corrected pvalues calculated by using R function p.adjust
    corrected_pvalues = [np.array([0.005, 0.0125, 0.125, 0.5, 0.01666667]),
                         np.array([0.12, 0.8, 0.62666667, 0.2]),
                         np.array([0.0009, 0.4, 0.003])]

    # get corrected pvalues from own function
    for pvalues, expected in zip(pvalues_list, corrected_pvalues):
        _, corrected = multiple_test_correction(pvalues)
        # the two literals need the separating space, otherwise the message
        # reads "...wrong listof corrected pvalues"
        self.assertTrue(np.allclose(corrected, expected),
                        msg="multiple_test_correction returns wrong list "
                            "of corrected pvalues")
def generate_rna_exp_pv_table(root, multi_corr=True):
    """Generate p value table for Experiments vs RNA in the same project.

    Every sub-directory ``<root>/<item>`` that contains a ``profile.txt`` is
    one experiment. In each profile, lines starting with "Experiment" are
    skipped; for all other lines the first tab-separated column is the RNA id
    and the eighth column is read as the p-value.

    The result is written to ``<root>/table_exp_rna_pv.txt``: a header line
    ``RNA_ID`` followed by the sorted experiment names, then one line per RNA
    (sorted) holding the p-value for each experiment, or "n.a." where that
    experiment has no entry for the RNA.

    Parameters
    ----------
    root : str
        Project directory to scan and to write the table into.
    multi_corr : bool
        When true, all collected p-values are corrected together with
        ``multiple_test_correction(..., alpha=0.05, method='indep')`` before
        being written; when false the raw values are written.
    """
    # data[experiment][rna] = p-value
    data = defaultdict(dict)
    rnas = set()

    for item in os.listdir(root):
        pro = os.path.join(root, item, "profile.txt")
        if not os.path.isfile(pro):
            continue
        with open(pro) as f:
            for line in f:
                if line.startswith("Experiment"):
                    continue
                fields = line.strip().split("\t")
                data[item][fields[0]] = float(fields[7])
                rnas.add(fields[0])

    exp_list = sorted(data)
    rnas = sorted(rnas)

    # Presence is tested with `in`, not truthiness, so a p-value of exactly
    # 0.0 is kept instead of being reported as "n.a.".
    pvs = [data[exp][rna]
           for rna in rnas
           for exp in exp_list
           if rna in data[exp]]

    if multi_corr and pvs:
        reject, pvals_corrected = multiple_test_correction(pvs, alpha=0.05, method='indep')
    else:
        pvals_corrected = pvs

    with open(os.path.join(root, "table_exp_rna_pv.txt"), "w") as t:
        print("\t".join(["RNA_ID"] + exp_list), file=t)
        # The cells are visited in the same (rna, exp) order in which `pvs`
        # was collected, so the corrected values can be consumed in sequence.
        corrected = iter(pvals_corrected)
        for rna in rnas:
            newline = [rna]
            for exp in exp_list:
                if rna in data[exp]:
                    newline.append(str(next(corrected)))
                else:
                    newline.append("n.a.")
            print("\t".join(newline), file=t)

    # NOTE(review): the original line continued with
    #     for d, p in plist.iteritems():
    #         list_all_index(path=os.path.dirname(p), link_d=dirlist,
    #                        show_RNA_ass_gene=show_RNA_ass_gene)
    # `plist`, `dirlist` and `show_RNA_ass_gene` are not defined anywhere in
    # this function, so the loop looks like a fragment of a different
    # definition. It is kept here as a comment only -- TODO confirm where it
    # belongs.
def generate_rna_exp_pv_table(root, multi_corr=True):
    """Generate p value table for Experiments vs RNA in the same project.

    Every sub-directory ``<root>/<item>`` that contains a ``profile.txt`` is
    one experiment. In each profile, lines starting with "Experiment" are
    skipped; for all other lines the first tab-separated column is the RNA id
    and the eighth column is read as the p-value.

    The result is written to ``<root>/table_exp_rna_pv.txt``: a header line
    ``RNA_ID`` followed by the sorted experiment names, then one line per RNA
    (sorted) holding the p-value for each experiment, or "n.a." where that
    experiment has no entry for the RNA.

    Parameters
    ----------
    root : str
        Project directory to scan and to write the table into.
    multi_corr : bool
        When true, all collected p-values are corrected together with
        ``multiple_test_correction(..., alpha=0.05, method='indep')`` before
        being written; when false the raw values are written.
    """
    # table[experiment][rna] = p-value
    table = defaultdict(dict)
    rna_ids = set()

    for item in os.listdir(root):
        pro = os.path.join(root, item, "profile.txt")
        if not os.path.isfile(pro):
            continue
        with open(pro) as f:
            for line in f:
                if line.startswith("Experiment"):
                    continue
                columns = line.strip().split("\t")
                rna_id = columns[0]
                table[item][rna_id] = float(columns[7])
                rna_ids.add(rna_id)

    exp_list = sorted(table)
    rnas = sorted(rna_ids)

    # One (rna, exp) cell per available p-value, in output order. Presence is
    # tested with `in` rather than truthiness: a p-value of exactly 0.0 is a
    # valid entry and must not be turned into "n.a.".
    cells = [(rna, exp) for rna in rnas for exp in exp_list if rna in table[exp]]
    pvs = [table[exp][rna] for rna, exp in cells]

    if multi_corr and pvs:
        reject, pvals_corrected = multiple_test_correction(pvs, alpha=0.05, method='indep')
    else:
        pvals_corrected = pvs

    # Text to print for every available cell.
    shown = {cell: str(pvals_corrected[i]) for i, cell in enumerate(cells)}

    with open(os.path.join(root, "table_exp_rna_pv.txt"), "w") as t:
        print("\t".join(["RNA_ID"] + exp_list), file=t)
        for rna in rnas:
            newline = [rna] + [shown.get((rna, exp), "n.a.") for exp in exp_list]
            print("\t".join(newline), file=t)
def get_peaks(name, DCS, states, ext_size, merge, distr, pcutoff, no_correction):
    """Build peaks from the bins in state 1 or 2, score them and write them out.

    Steps:
      1. For every index in ``DCS.indices_of_interest`` whose state is 1 or 2,
         record the bin coordinates, both coverage values and a strand
         ('+' for state 1, '-' for state 2).
      2. Merge consecutive bins that touch (end == next start) and share the
         strand; the coverage of merged bins is summed per signal.
      3. Compute one value per peak with ``_compute_pvalue``. Peaks whose
         summed signal exceeds SIGNAL_CUTOFF are passed as an all-'NA' tuple
         instead of their counts.
      4. Call ``merge_delete`` with the peaks and their values.
      5. Unless ``no_correction`` is set, convert the values with ``10 ** -x``,
         correct them with ``multiple_test_correction(alpha=pcutoff)`` and
         convert the result with ``_get_log10pvalue``.
      6. Write the result with ``_output_BED`` and ``_output_narrowPeak``.

    Parameters
    ----------
    name : str
        Prefix handed to merge_delete and the output writers.
    DCS : object
        Must provide ``indices_of_interest``, ``first_overall_coverage``,
        ``second_overall_coverage`` and ``_index2coordinates(index)``.
    states : sequence
        State per entry of ``DCS.indices_of_interest`` (same positions).
    ext_size, merge :
        Forwarded unchanged to ``merge_delete``.
    distr :
        Forwarded to ``_compute_pvalue`` as the last tuple element.
    pcutoff : float
        ``alpha`` for the multiple-test correction.
    no_correction : bool
        When true, no correction is done and every peak is marked as passing.
    """
    indices_of_interest = DCS.indices_of_interest
    cov_first = list(DCS.first_overall_coverage)
    cov_second = list(DCS.second_overall_coverage)

    tmp_peaks = []
    for i, index in enumerate(indices_of_interest):
        if states[i] not in [1, 2]:
            continue  # ignore background states

        strand = '+' if states[i] == 1 else '-'
        chrom, start, end = DCS._index2coordinates(index)
        tmp_peaks.append((chrom, start, end, cov_first[index], cov_second[index], strand))

    peaks = []    # (chrom, start, end, signal1, signal2, strand)
    pvalues = []  # arguments for _compute_pvalue, one tuple per peak
    i = 0
    while i < len(tmp_peaks):
        c, s, e, bin_cov1, bin_cov2, strand = tmp_peaks[i]
        v1 = [bin_cov1]
        v2 = [bin_cov2]

        # merge bins
        while (i + 1 < len(tmp_peaks)
               and e == tmp_peaks[i + 1][1]
               and strand == tmp_peaks[i + 1][5]):
            e = tmp_peaks[i + 1][2]
            v1.append(tmp_peaks[i + 1][3])
            v2.append(tmp_peaks[i + 1][4])
            i += 1

        s1 = sum(v1)
        s2 = sum(v2)

        if s1 + s2 > SIGNAL_CUTOFF:
            pvalues.append(('NA', 'NA', 'NA', 'NA'))
        else:
            side = 'l' if strand == '+' else 'r'
            pvalues.append((s1, s2, side, distr))

        peaks.append((c, s, e, s1, s2, strand))
        i += 1

    print('Number of Peaks where p-value is not calculated: ',
          pvalues.count(('NA', 'NA', 'NA', 'NA')), file=sys.stderr)

    # Lists instead of map(): on Python 3 map() returns a one-shot iterator
    # without len(), which breaks the assertion, merge_delete and the two
    # output writers below.
    pvalues = [_compute_pvalue(args) for args in pvalues]

    assert len(pvalues) == len(peaks)

    # NOTE(review): the return value is ignored, so merge_delete presumably
    # edits `peaks` and `pvalues` in place -- confirm.
    merge_delete(ext_size, merge, peaks, pvalues, name)

    if not no_correction:
        pvalues = [10 ** -x for x in pvalues]
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
        pvalues = [_get_log10pvalue(p) for p in pvalues]
    else:
        pv_pass = [True] * len(pvalues)

    _output_BED(name, pvalues, peaks, pv_pass)
    _output_narrowPeak(name, pvalues, peaks, pv_pass)
def get_peaks(name, DCS, states, ext_size, merge, distr, pcutoff, no_correction):
    """Build peaks from the bins in state 1 or 2, score them and write them out.

    Steps:
      1. For every index in ``DCS.indices_of_interest`` whose state is 1 or 2,
         record the bin coordinates, both coverage values and a strand
         ('+' for state 1, '-' for state 2).
      2. Merge consecutive bins that touch (end == next start) and share the
         strand; the coverage of merged bins is summed per signal.
      3. Compute one value per peak with ``_compute_pvalue``. Peaks whose
         summed signal exceeds SIGNAL_CUTOFF are passed as an all-'NA' tuple
         instead of their counts.
      4. Call ``merge_delete`` with the peaks and their values.
      5. With correction (``no_correction`` false): write the uncorrected
         values under ``name + '-uncor'`` with every peak marked as passing,
         then convert the values with ``10 ** -x``, correct them with
         ``multiple_test_correction(alpha=pcutoff)`` and convert the result
         with ``_get_log10pvalue``.
         Without correction: a peak passes when its value is at least
         ``-log10(pcutoff)``.
      6. Write the final result under ``name`` with ``_output_BED`` and
         ``_output_narrowPeak``.

    Parameters
    ----------
    name : str
        Prefix handed to merge_delete and the output writers.
    DCS : object
        Must provide ``indices_of_interest``, ``first_overall_coverage``,
        ``second_overall_coverage`` and ``_index2coordinates(index)``.
    states : sequence
        State per entry of ``DCS.indices_of_interest`` (same positions).
    ext_size, merge :
        Forwarded unchanged to ``merge_delete``.
    distr :
        Forwarded to ``_compute_pvalue`` as the last tuple element.
    pcutoff : float
        ``alpha`` for the correction, or the cutoff applied directly when no
        correction is done.
    no_correction : bool
        Selects between the two branches of step 5.
    """
    indices_of_interest = DCS.indices_of_interest
    cov_first = list(DCS.first_overall_coverage)
    cov_second = list(DCS.second_overall_coverage)

    tmp_peaks = []
    for i, index in enumerate(indices_of_interest):
        if states[i] not in [1, 2]:
            continue  # ignore background states

        strand = '+' if states[i] == 1 else '-'
        chrom, start, end = DCS._index2coordinates(index)
        tmp_peaks.append((chrom, start, end, cov_first[index], cov_second[index], strand))

    peaks = []    # (chrom, start, end, signal1, signal2, strand)
    pvalues = []  # arguments for _compute_pvalue, one tuple per peak
    i = 0
    while i < len(tmp_peaks):
        c, s, e, bin_cov1, bin_cov2, strand = tmp_peaks[i]
        v1 = [bin_cov1]
        v2 = [bin_cov2]

        # merge bins
        while (i + 1 < len(tmp_peaks)
               and e == tmp_peaks[i + 1][1]
               and strand == tmp_peaks[i + 1][5]):
            e = tmp_peaks[i + 1][2]
            v1.append(tmp_peaks[i + 1][3])
            v2.append(tmp_peaks[i + 1][4])
            i += 1

        s1 = sum(v1)
        s2 = sum(v2)

        if s1 + s2 > SIGNAL_CUTOFF:
            pvalues.append(('NA', 'NA', 'NA', 'NA'))
        else:
            side = 'l' if strand == '+' else 'r'
            pvalues.append((s1, s2, side, distr))

        peaks.append((c, s, e, s1, s2, strand))
        i += 1

    print('Number of Peaks where p-value is not calculated: ',
          pvalues.count(('NA', 'NA', 'NA', 'NA')), file=sys.stderr)

    # Lists instead of map(): on Python 3 map() returns a one-shot iterator
    # without len(); the assertion would fail and the first output writer
    # would exhaust the values before the second writer and the correction
    # get to see them.
    pvalues = [_compute_pvalue(args) for args in pvalues]

    assert len(pvalues) == len(peaks)

    # NOTE(review): the return value is ignored, so merge_delete presumably
    # edits `peaks` and `pvalues` in place -- confirm.
    merge_delete(ext_size, merge, peaks, pvalues, name)

    if not no_correction:
        # first output uncorrected p-values
        pv_pass = [True] * len(pvalues)
        _output_BED(name + '-uncor', pvalues, peaks, pv_pass)
        _output_narrowPeak(name + '-uncor', pvalues, peaks, pv_pass)

        # then correct p-values and output
        pvalues = [10 ** -x for x in pvalues]
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
        pvalues = [_get_log10pvalue(p) for p in pvalues]
    else:
        pv_pass = list(np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False))

    _output_BED(name, pvalues, peaks, pv_pass)
    _output_narrowPeak(name, pvalues, peaks, pv_pass)