def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance. Partial
    autocorrelation may be calculated as well.
    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.
    """
    # reversing allows optimization below.
    from multiprocessing import set_start_method
    set_start_method("fork", force=True)
    imap = get_map()
    arg_list = []
    # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, chromlist
                         in groupby(bediter(fname, col_num0),
                                    lambda a: a["chrom"])))

    unmerged_acfs = []  # separated by chrom. need to merge later.
    for chrom_acf in imap(_acf_by_chrom, arg_list):
        unmerged_acfs.append(chrom_acf)

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # iterate over it backwards and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # add the inner layers as we move out.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            print("no values found at lag: %i-%i. skipping"
                  % (lmin, lmax), file=sys.stderr)
            continue

        if mlog:
            xs[xs == 0] = 1e-12
            ys[ys == 0] = 1e-12
            xs, ys = -np.log10(xs), -np.log10(ys)

        #slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys)
        # NOTE: Spearman rank correlation is used here; Pearson
        # (ss.pearsonr), which assumes normality, could be substituted.
        corr, p_val = ss.spearmanr(xs, ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())

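# Hedged usage sketch (not part of the original module): one way acf() above
# might be driven. The BED path "pvals.bed", the p-value column index, and the
# lag spacing are illustrative assumptions; with simple=False each result is
# ((lmin, lmax), (corr, n, p)) as built in acf_res above.
def _example_acf_usage(bed="pvals.bed", col_num0=4):
    lags = list(range(1, 500, 50))  # lag bins out to ~500 bases
    for (lmin, lmax), (corr, n, p) in acf([bed], lags, col_num0, simple=False):
        print("lag %i-%i: corr=%.3f n=%i p=%.3g" % (lmin, lmax, corr, n, p))
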
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num,
                                simple=False, mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        import datetime
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print("wrote: %s" % fh.name, file=sys.stderr)
    print("ACF:\n", open(prefix + ".acf.txt").read(), file=sys.stderr)

    spvals, opvals = array.array('f'), array.array('f')

    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals), file=sys.stderr)
    del opvals

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
          file=sys.stderr)

    if genome_control:
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        adj = genome_control_adjust([d['p'] for d in
                                     bediter(prefix + ".slk.bed.gz", -1)])
        for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
            print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]), file=fhslk)

        fhslk.close()
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print("wrote: %s" % fh.name, file=sys.stderr)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2,
                step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)"
              % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0

    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                               p_col_name=col_num)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue
                N += 1
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
               "< %.3f and n-probes >= %i: %i)")
              % (fh.name, region_filter_p, region_filter_n, N),
              file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)
        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db), file=sys.stderr)

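# Hedged usage sketch (not part of the original module): how pipeline() above
# might be invoked programmatically. All paths and numeric choices here are
# illustrative assumptions; the comb-p command line normally supplies them.
def _example_pipeline_usage():
    pipeline(col_num=4,          # 0-based column holding the p-values
             step=None,          # let stepsize.stepsize() estimate it
             dist=300,           # merge probes within 300 bases into regions
             acf_dist=600,       # compute the ACF out to 600 bases
             prefix="out/demo",  # output file prefix
             threshold=0.05, seed=0.05,
             bed_files=["pvals.bed"])
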
def manhattan(fname, col_num, image_path, no_log, colors, title, lines, ymax,
              bonferonni=False, regions=None, subplots=False):
    """
    regions is keyed by chromosome with [(start, stop), ...] extents of the
    regions to highlight
    """
    xs, ys, cs = [], [], []
    region_xys = []  # highlight certain regions.
    colors = cycle(colors)
    chrom_centers = []

    last_x = 0
    nrows = 0
    giter = [(seqid, list(rlist)) for seqid, rlist
             in groupby(bediter(fname, col_num), key=itemgetter('chrom'))]

    region_xs, region_ys = [], []
    new_bounds = []

    rcolors = cycle(('#AE2117', '#EA352B'))
    for seqid, rlist in sorted(giter, cmp=chr_cmp):
        color = colors.next()
        nrows += len(rlist)
        # since chroms are on the same plot. add this chrom to the end of the
        # last chrom
        rcolor = rcolors.next()

        region_xs = [last_x + r['start'] for r in rlist]
        xs.extend(region_xs)
        ys.extend([r['p'] for r in rlist])
        cs.extend([color] * len(rlist))

        if regions and seqid in regions:
            regions_bounds = regions[seqid]
            if len(regions_bounds) < 500:
                region_xys.extend([(last_x + r['start'], r['p'], rcolor)
                                   for r in rlist
                                   if any((s - 1 <= r['start'] <= e + 1)
                                          for s, e in regions_bounds)])
            else:
                sys.stderr.write("regions for %s > 500, not plotting\n" % seqid)
            # adjust the bounds of each region based on chrom.
            new_bounds.extend([(last_x + s, last_x + e)
                               for s, e in regions_bounds])

        # save the middle of the region to place the label
        chrom_centers.append((seqid, (region_xs[0] + region_xs[-1]) / 2))
        # keep track so that chrs don't overlap.
        last_x = xs[-1]

    xs = np.array(xs)
    ys = np.array(ys) if no_log else -np.log10(ys)

    plt.close()
    f, ax = plt.subplots(1, figsize=(10, 6))

    bonferonni_p = 0.05 / nrows
    if title is not None:
        plt.title(title)

    ax.set_ylabel('' if no_log else '-log10(p)')
    if regions:
        #"""
        # Plot as colored background
        if len(new_bounds) < 32:
            for s, e in new_bounds:
                ax.axvspan(s - 2, e + 2, facecolor='#EA352B', ec='#EA352B',
                           alpha=0.3, zorder=0)
        #"""
        # plot as points.
        rxs, rys, rcs = zip(*region_xys)
        if not no_log:
            rys = -np.log10(rys)
        ax.scatter(rxs, rys,
                   # s=rys ** 1.3,  # size by -log10(p)
                   s=6,
                   c=rcs,
                   edgecolors=rcs,
                   zorder=2)

    if lines:
        ax.vlines(xs, 0, ys, colors=cs, alpha=0.5)
    else:
        alpha = 0.8 if len(xs) < 10000 else 0.6
        edgecolors = 'k' if len(xs) < 10000 else 'none'
        ax.scatter(xs, ys, s=3.5, c=cs, edgecolors=edgecolors, alpha=alpha,
                   zorder=1)

    # plot 0.05 line after multiple testing. always nlog10'ed since
    # that's the space we're plotting in.
    if bonferonni:
        ax.axhline(y=-np.log10(bonferonni_p), color='0.5', linewidth=2)

    #plt.axis('tight')
    if max(xs) - min(xs) > 10000:
        plt.xlim(0, xs[-1])
    else:
        plt.xlim(xs[0], xs[-1])
    plt.ylim(ymin=0)
    if ymax is not None:
        plt.ylim(ymax=ymax)
    plt.xticks([c[1] for c in chrom_centers],
               [c[0].replace('chr', '') for c in chrom_centers],
               rotation=-90, size=8.5)
    #plt.show()

    print >>sys.stderr, "Bonferonni-corrected p-value for %i rows: %.3g" \
        % (nrows, 0.05 / nrows)
    print >>sys.stderr, "values less than Bonferonni-corrected p-value: %i " \
        % (ys > -np.log10(bonferonni_p)).sum()

    if subplots:
        pys = np.sort(10**-ys)  # convert back to actual p-values
        gc = genomic_control(pys)
        ax_qq = f.add_axes((0.74, 0.12, 0.22, 0.22), alpha=0.2)
        ax_qq.text(0.03, 0.88, r'$\lambda : %.3f$' % gc,
                   transform=ax_qq.transAxes)
        qqplot(ys, ax_qq)
        ax_hist = f.add_axes((0.12, 0.12, 0.22, 0.22), frameon=True, alpha=0.6)
        hist(pys, ax_hist)

    print >>sys.stderr, "saving to: %s" % image_path
    f.tight_layout()
    plt.savefig(image_path)
    return image_path

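# Hedged usage sketch (not part of the original module): plotting the
# SLK-adjusted p-values produced by the pipeline with manhattan() above. The
# file names and colors are illustrative assumptions; `regions` would normally
# come from manhattan.read_regions() as in the pipeline.
def _example_manhattan_usage():
    manhattan("out/demo.slk.bed.gz", 3, "out/demo.manhattan.png",
              no_log=False, colors=['#959899', '#484B4C'], title="",
              lines=False, ymax=None, regions=None, subplots=False)
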
def manhattan(fname, col_num, image_path, no_log, colors, title, lines, ymax,
              bonferonni=False, regions=None):
    """
    regions is keyed by chromosome with [(start, stop), ...] extents of the
    regions to highlight
    """
    xs, ys, cs = [], [], []
    region_xys = []  # highlight certain regions.
    colors = cycle(colors)
    chrom_centers = []

    last_x = 0
    nrows = 0
    giter = [(seqid, list(rlist)) for seqid, rlist
             in groupby(bediter(fname, col_num), key=itemgetter("chrom"))]

    region_xs, region_ys = [], []
    new_bounds = []
    for seqid, rlist in sorted(giter, cmp=chr_cmp):
        color = colors.next()
        nrows += len(rlist)
        # since chroms are on the same plot. add this chrom to the end of the
        # last chrom
        region_xs = [last_x + r["start"] for r in rlist]
        xs.extend(region_xs)
        ys.extend([r["p"] for r in rlist])
        cs.extend([color] * len(rlist))

        if regions and seqid in regions:
            regions_bounds = regions[seqid]
            region_xys.extend(
                [(last_x + r["start"], r["p"]) for r in rlist
                 if any((s <= r["start"] <= e) for s, e in regions_bounds)]
            )
            # adjust the bounds of each region based on chrom.
            new_bounds.extend([(last_x + s, last_x + e)
                               for s, e in regions_bounds])

        # save the middle of the region to place the label
        chrom_centers.append((seqid, (region_xs[0] + region_xs[-1]) / 2))
        # keep track so that chrs don't overlap.
        last_x = xs[-1]

    xs = np.array(xs)
    ys = np.array(ys) if no_log else -np.log10(ys)

    plt.close()
    f = plt.figure()
    ax = f.add_axes((0.1, 0.09, 0.88, 0.85))

    bonferonni_p = 0.05 / nrows
    if title is not None:
        plt.title(title)

    ax.set_ylabel("" if no_log else "-log10(p)")
    if regions:
        for s, e in new_bounds:
            # ax.axvspan(s - 55, e + 10, facecolor='#f30000', ec='#f30000',
            #            alpha=0.3, zorder=0)
            ax.axvspan(s - 55, e + 10, facecolor="#222222", ec="#222222",
                       alpha=0.3, zorder=0)

    if lines:
        ax.vlines(xs, 0, ys, colors=cs, alpha=0.5)
    else:
        ax.scatter(xs, ys, s=3.5, c=cs, edgecolors="none", alpha=0.6, zorder=1)

    # plot 0.05 line after multiple testing. always nlog10'ed since
    # that's the space we're plotting in.
    if bonferonni:
        ax.axhline(y=-np.log10(bonferonni_p), color="0.5", linewidth=2)

    plt.axis("tight")
    plt.xlim(0, xs[-1])
    plt.ylim(ymin=0)
    if ymax is not None:
        plt.ylim(ymax=ymax)
    plt.xticks([c[1] for c in chrom_centers],
               [c[0] for c in chrom_centers], rotation=-90, size=8.5)
    # plt.show()

    print >>sys.stderr, "Bonferonni-corrected p-value for %i rows: %.3g" \
        % (nrows, 0.05 / nrows)
    print >>sys.stderr, "values less than Bonferonni-corrected p-value: %i " \
        % (ys > -np.log10(bonferonni_p)).sum()

    if False:
        ax_qq = f.add_axes((0.74, 0.12, 0.22, 0.22), alpha=0.2)
        pys = np.sort(10 ** -ys)  # convert back to actual p-values
        qqplot(ys, ax_qq)
        ax_hist = f.add_axes((0.12, 0.12, 0.22, 0.22), frameon=True, alpha=0.6)
        hist(pys, ax_hist)

    print >>sys.stderr, "saving to: %s" % image_path
    plt.savefig(image_path)
    return image_path

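# Hedged sketch (not from the original source) of the `regions` structure the
# manhattan() variants above expect, per their docstrings: a dict keyed by
# chromosome, each value a list of (start, stop) extents to highlight. The
# coordinates below are invented for illustration.
_example_regions = {
    "chr1": [(10000, 10450), (250300, 250900)],
    "chr2": [(5000, 5600)],
}
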
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print >>sys.stderr, "calculated stepsize as: %i" % step

    lags = range(1, acf_dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num,
                                simple=False, mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >>fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >>fh, "date: %s" % datetime.datetime.today()
        from .__init__ import __version__
        print >>fh, "version:", __version__

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print >>sys.stderr, "wrote: %s" % fh.name
    print >>sys.stderr, "ACF:\n", open(prefix + ".acf.txt").read()

    spvals, opvals = [], []

    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >>sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals

    gc_lambda = genomic_control(spvals)
    print >>sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda)

    if genome_control:
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
            print >>fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])

        fhslk.close()
        print >>sys.stderr, "wrote: %s" % fhslk.name

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >>sys.stderr, "wrote: %s" % fh.name

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2,
                step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, "wrote: %s, (regions with corrected-p < 0.05: %i)" \
            % (fh.name, N)

    regions_bed = fh.name
    header = ts.header(bed_files[0])
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0

    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(
                filter.filter(bed_files[0], regions_bed, p_col_name=col_num)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue
                N += 1
            print >>fh, "\t".join(toks)
        print >>sys.stderr, ("wrote: %s, (regions with region-p "
                             "< %.3f and n-probes >= %i: %i)") \
            % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)
        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print >>sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)

def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files,
             mlog=False, region_filter_p=1, region_filter_n=1,
             genome_control=False, db=None):
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        print >>sys.stderr, "calculated stepsize as: %i" % step

    lags = range(1, dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    #if genome_control:
    #    with open(prefix + ".adj.bed", "w") as fh:
    #        genome_control_adjust_bed(bed_files, col_num, fh)
    #    bed_files = [fh.name]

    putative_acf_vals = acf.acf(bed_files, lags, col_num,
                                simple=False, mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >>fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >>fh, "date: %s" % datetime.datetime.today()

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print >>sys.stderr, "wrote: %s" % fh.name
    print >>sys.stderr, "ACF:\n", open(prefix + ".acf.txt").read()

    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >>sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals

    gc_lambda = genomic_control(spvals)
    print >>sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda)

    if genome_control:
        fhslk = open(prefix + ".slk.gc.bed", "w")
        adj = genome_control_adjust([d['p'] for d in
                                     bediter(prefix + ".slk.bed", -1)])
        for i, line in enumerate(open(prefix + ".slk.bed")):
            print >>fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])

        fhslk.close()
        print >>sys.stderr, "wrote: %s" % fhslk.name

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >>sys.stderr, "wrote: %s" % fh.name

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
                         step, fh, operator.le))
    n_regions = sum(1 for _ in open(fregions))
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed",
                prefix + ".regions.bed", -2, 0,
                step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, "wrote: %s, (regions with corrected-p < 0.05: %i)" \
            % (fh.name, N)

    regions_bed = fh.name
    header = (gzip.open(bed_files[0]) if bed_files[0].endswith(".gz")
              else open(bed_files[0])).next().split("\t")
    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                                   p_col_name=col_num)):
                if i == 0:
                    toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p:
                        continue
                    if int(toks[4]) < region_filter_n:
                        continue
                    N += 1
                print >>fh, "\t".join(toks)
            print >>sys.stderr, ("wrote: %s, (regions with region-p "
                                 "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)
        manhattan.manhattan(prefix + ".slk.bed", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=True)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print >>sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)