예제 #1
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames: iterable of bed file names; each one is read with `bediter`.
        lags: lag boundaries, forwarded untouched to `_acf_by_chrom`.
        col_num0: column selector forwarded to `bediter`.
        partial: when True each lag bin is correlated using only its own
            x/y pairs. When False the pairs of every bin handled so far are
            pooled before correlating.
        simple: when True only the correlation is stored for a bin,
            otherwise a `(correlation, n_pairs, p_value)` tuple is stored.
        mlog: when True the values are -log10 transformed (zeros are
            replaced by 1e-12 first) before they are correlated.

    Returns:
        A list of `((lmin, lmax), result)` items sorted by lag bin. Bins
        without any pairs are reported on stderr and left out.

    Side effects:
        Forces the multiprocessing start method to "fork".
    """
    from multiprocessing import set_start_method
    set_start_method("fork", force=True)
    imap = get_map()

    # Build one lazy stream of (rows-of-one-chromosome, lags) work items
    # across all files; each chromosome is only materialized when the
    # mapper asks for it.
    arg_list = []  # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    # Raw (untransformed) accumulators; only used across iterations when
    # `partial` is False.
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # consume from the end and remove as we go to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # pool this bin with the ones already processed.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            print("no values found at lag: %i-%i. skipping" \
                    % (lmin, lmax), file=sys.stderr)
            continue
        if mlog:
            # Transform into separate arrays. Rebinding xs/ys here would
            # make the next non-partial iteration stack raw values onto
            # already-transformed ones and log-transform them a second time.
            corr_xs = -np.log10(np.where(xs == 0, 1e-12, xs))
            corr_ys = -np.log10(np.where(ys == 0, 1e-12, ys))
        else:
            corr_xs, corr_ys = xs, ys
        # NOTE: Spearman rank correlation, so no normality assumption.
        corr, p_val = ss.spearmanr(corr_xs, corr_ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
예제 #2
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames: iterable of bed file names; each one is read with `bediter`.
        lags: lag boundaries, forwarded untouched to `_acf_by_chrom`.
        col_num0: column selector forwarded to `bediter`.
        partial: when True each lag bin is correlated using only its own
            x/y pairs. When False the pairs of every bin handled so far are
            pooled before correlating.
        simple: when True only the correlation is stored for a bin,
            otherwise a `(correlation, n_pairs, p_value)` tuple is stored.
        mlog: when True the values are -log10 transformed (zeros are
            replaced by 1e-12 first) before they are correlated.

    Returns:
        A list of `((lmin, lmax), result)` items sorted by lag bin. Bins
        without any pairs are reported on stderr and left out.
    """
    imap = get_map()

    # One lazy stream of (rows-of-one-chromosome, lags) work items across
    # all files; a chromosome is only materialized when the mapper pulls it.
    arg_list = [] # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    # Raw (untransformed) accumulators; they only carry state between
    # iterations when `partial` is False.
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # consume from the end and remove as we go to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # pool this bin with the ones already processed.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            print("no values found at lag: %i-%i. skipping" \
                    % (lmin, lmax), file=sys.stderr)
            continue
        if mlog:
            # Keep the transform out of the accumulators: overwriting xs/ys
            # would feed already log-transformed values back into hstack and
            # transform them again on the next non-partial iteration.
            corr_xs = -np.log10(np.where(xs == 0, 1e-12, xs))
            corr_ys = -np.log10(np.where(ys == 0, 1e-12, ys))
        else:
            corr_xs, corr_ys = xs, ys
        # NOTE: Spearman rank correlation, so no normality assumption.
        corr, p_val = ss.spearmanr(corr_xs, corr_ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
예제 #3
0
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
        bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
        genome_control=False, db=None, use_fdr=True):
    """
    Run the complete analysis on `bed_files`, writing every intermediate
    result to a file whose name starts with `prefix`.

    Files written, in order: `.args.txt` (command line, date, version),
    `.acf.txt`, `.slk.bed.gz`, `.slk.gc.bed.gz` (only with
    `genome_control`), `.fdr.bed.gz`, `.regions.bed.gz`,
    `.regions-p.bed.gz`, `.regions-t.bed`, `.manhattan.png` (skipped when
    the plotting import fails) and `.anno.<db>.bed` (only with `db`).
    Progress messages go to stderr.

    Arguments:
        col_num: p-value column, forwarded to the stepsize, acf, slk and
            filter steps.
        step: lag step size; computed from the data (capped at `acf_dist`)
            when None.
        dist: distance forwarded to `peaks.peaks`.
        acf_dist: upper bound of the lags used for the autocorrelation.
        prefix: output prefix; trailing dots are removed.
        threshold, seed: forwarded to `peaks.peaks`.
        bed_files: input bed files; the first one is also used by the
            filter step.
        mlog: forwarded to `acf.acf`.
        region_filter_p: regions whose column 7 value is above this are
            dropped from `.regions-t.bed`.
        region_filter_n: regions whose column 5 value is below this are
            dropped from `.regions-t.bed`; None means 0.
        genome_control: also write genomic-control adjusted p-values and
            use that file as FDR input.
        db: database name handed to `cruzdb.Genome` for annotation.
        use_fdr: call peaks on the last column of `.fdr.bed.gz` when True,
            on the second to last column otherwise.

    Exits the interpreter via `sys.exit()` when no regions are found.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import datetime
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to the max requested distance but stop at the first lag whose
    # autocorrelation drops below 0.04. That lag itself is kept only while
    # fewer than 3 lags have been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N, p))
        low_corr = a[1][0] < 0.04
        if low_corr and len(acf_vals) > 2: break
        acf_vals.append(a)
        if low_corr: break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        # status message belongs on stderr, not inside the acf file.
        print("wrote: %s" % fh.name, file=sys.stderr)
    with open(prefix + ".acf.txt") as fh:
        print("ACF:\n", fh.read(), file=sys.stderr)

    # spvals: adjusted p-values, opvals: original p-values.
    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals), file=sys.stderr)
    del opvals

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
            file=sys.stderr)

    if genome_control:
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # NOTE(review): the input starts with a '#chrom' header line; this
        # pairs line i with adj[i], which is only aligned if `bediter` also
        # yields an entry for that line -- confirm.
        with ts.nopen(prefix + ".slk.gc.bed.gz", "w") as fhslk:
            with ts.nopen(prefix + ".slk.bed.gz") as slk_in:
                for i, line in enumerate(slk_in):
                    print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]), file=fhslk)
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    # fhslk.name is the gc-adjusted file when genome_control is set.
    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print("wrote: %s" % fh.name, file=sys.stderr)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # peaks() is a generator that writes to fh; exhaust it.
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2, threshold, seed,
            dist, fh, operator.le))
    with ts.nopen(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                               prefix + ".slk.bed.gz",
                               prefix + ".regions.bed.gz", -2,
                               step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)" \
                % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0],
            regions_bed, p_col_name=col_num)):
            if i == 0:
                # first row is the header.
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                N += 1
            # rows go into the regions file; it is read back below.
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
                            "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N),
                file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=False)
    except ImportError:
        pass # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db), file=sys.stderr)
예제 #4
0
def manhattan(fname, col_num, image_path, no_log, colors, title, lines, ymax,
             bonferonni=False, regions=None, subplots=False):
    """
    Draw a manhattan plot of the rows read from `fname` and save it to
    `image_path`.

    Arguments:
        fname, col_num: forwarded to `bediter`; the rows it yields must
            provide 'chrom', 'start' and 'p'.
        image_path: file the figure is saved to.
        no_log: plot the values as they are instead of -log10(value).
        colors: colors that are cycled through, one per chromosome.
        title: plot title; skipped when None.
        lines: draw vertical lines instead of scatter points.
        ymax: upper y limit; left automatic when None.
        bonferonni: draw a horizontal line at -log10(0.05 / number of rows).
        regions: keyed by chromosome with [(start, stop), ...] extents of
            the regions to highlight.
        subplots: add a QQ-plot and a histogram inset.

    Returns:
        `image_path`.
    """
    # sorted() has no cmp= argument on Python 3; wrap the comparison instead.
    from functools import cmp_to_key

    xs, ys, cs = [], [], []
    region_xys = [] # highlight certain regions.
    colors = cycle(colors)
    chrom_centers = []

    last_x = 0
    nrows = 0
    giter = [(seqid, list(rlist)) for seqid, rlist \
        in groupby(bediter(fname, col_num), key=itemgetter('chrom'))]

    new_bounds = []
    rcolors = cycle(('#AE2117', '#EA352B'))
    for seqid, rlist in sorted(giter, key=cmp_to_key(chr_cmp)):
        color = next(colors)
        rcolor = next(rcolors)
        nrows += len(rlist)

        # since chroms are on the same plot. add this chrom to the end of the
        # last chrom
        region_xs = [last_x + r['start'] for r in rlist]
        xs.extend(region_xs)
        ys.extend([r['p'] for r in rlist])
        cs.extend([color] * len(rlist))

        if regions and seqid in regions:
            regions_bounds = regions[seqid]
            if len(regions_bounds) < 500:
                # the containment test is rows x regions, hence the cap.
                region_xys.extend([(last_x + r['start'], r['p'], rcolor) for r in rlist \
                  if any((s - 1 <= r['start'] <= e + 1) for s, e in regions_bounds)])
            else:
                sys.stderr.write("regions for %s > 500, not plotting\n" % seqid)
            # adjust the bounds of each region based on chrom.
            new_bounds.extend([(last_x + s, last_x + e)
                            for s, e in regions_bounds])

        # save the middle of the region to place the label
        chrom_centers.append((seqid, (region_xs[0] + region_xs[-1]) / 2))
        # keep track so that chrs don't overlap.
        last_x = xs[-1]

    xs = np.array(xs)
    ys = np.array(ys) if no_log else -np.log10(ys)

    plt.close()
    f, ax = plt.subplots(1, figsize=(10, 6))

    bonferonni_p = 0.05 / nrows

    if title is not None:
        plt.title(title)

    ax.set_ylabel('' if no_log else '-log10(p)')
    if regions:
        # Plot as colored background
        if len(new_bounds) < 32:
            for s, e in new_bounds:
                ax.axvspan(s - 2, e + 2, facecolor='#EA352B',
                           ec='#EA352B', alpha=0.3, zorder=0)
        # plot as points. Nothing to unpack when no row fell inside a
        # region (or every chromosome exceeded the 500-region cap).
        if region_xys:
            rxs, rys, rcs = zip(*region_xys)
            if not no_log: rys = -np.log10(rys)
            ax.scatter(rxs, rys,
                    s=6,
                    c=rcs, edgecolors=rcs,
                    zorder=2)

    if lines:
        ax.vlines(xs, 0, ys, colors=cs, alpha=0.5)
    else:
        alpha = 0.8 if len(xs) < 10000 else 0.6
        edgecolors = 'k' if len(xs) < 10000 else 'none'
        ax.scatter(xs, ys, s=3.5, c=cs, edgecolors=edgecolors, alpha=alpha, zorder=1)

    # plot 0.05 line after multiple testing. always nlog10'ed since
    # that's the space we're plotting in.
    if bonferonni:
        ax.axhline(y=-np.log10(bonferonni_p), color='0.5', linewidth=2)
    if max(xs) - min(xs) > 10000:
        plt.xlim(0, xs[-1])
    else:
        plt.xlim(xs[0], xs[-1])
    plt.ylim(ymin=0)
    if ymax is not None: plt.ylim(ymax=ymax)
    plt.xticks([c[1] for c in chrom_centers],
               [c[0].replace('chr', '') for c in chrom_centers], rotation=-90, size=8.5)
    sys.stderr.write("Bonferonni-corrected p-value for %i rows: %.3g\n"
            % (nrows, bonferonni_p))
    sys.stderr.write("values less than Bonferonni-corrected p-value: %i \n"
            % (ys > -np.log10(bonferonni_p)).sum())

    if subplots:
        pys = np.sort(10**-ys) # convert back to actual p-values
        gc = genomic_control(pys)
        ax_qq = f.add_axes((0.74, 0.12, 0.22, 0.22), alpha=0.2)
        ax_qq.text(0.03, 0.88, r'$\lambda : %.3f$' % gc, transform=ax_qq.transAxes)
        qqplot(ys, ax_qq)

        ax_hist = f.add_axes((0.12, 0.12, 0.22, 0.22), frameon=True, alpha=0.6)
        hist(pys, ax_hist)

    sys.stderr.write("saving to: %s\n" % image_path)
    f.tight_layout()
    plt.savefig(image_path)

    return image_path
예제 #5
0
def manhattan(fname,
              col_num,
              image_path,
              no_log,
              colors,
              title,
              lines,
              ymax,
              bonferonni=False,
              regions=None,
              subplots=False):
    """
    Draw a manhattan plot of the rows read from `fname` and save it to
    `image_path`.

    Arguments:
        fname, col_num: forwarded to `bediter`; the rows it yields must
            provide 'chrom', 'start' and 'p'.
        image_path: file the figure is saved to.
        no_log: plot the values as they are instead of -log10(value).
        colors: colors that are cycled through, one per chromosome.
        title: plot title; skipped when None.
        lines: draw vertical lines instead of scatter points.
        ymax: upper y limit; left automatic when None.
        bonferonni: draw a horizontal line at -log10(0.05 / number of rows).
        regions: keyed by chromosome with [(start, stop), ...] extents of
            the regions to highlight.
        subplots: add a QQ-plot and a histogram inset.

    Returns:
        `image_path`.
    """
    # sorted() has no cmp= argument on Python 3; wrap the comparison instead.
    from functools import cmp_to_key

    xs, ys, cs = [], [], []
    region_xys = []  # highlight certain regions.
    colors = cycle(colors)
    chrom_centers = []

    last_x = 0
    nrows = 0
    giter = [(seqid, list(rlist)) for seqid, rlist \
        in groupby(bediter(fname, col_num), key=itemgetter('chrom'))]

    new_bounds = []
    rcolors = cycle(('#AE2117', '#EA352B'))
    for seqid, rlist in sorted(giter, key=cmp_to_key(chr_cmp)):
        color = next(colors)
        rcolor = next(rcolors)
        nrows += len(rlist)

        # since chroms are on the same plot. add this chrom to the end of the
        # last chrom
        region_xs = [last_x + r['start'] for r in rlist]
        xs.extend(region_xs)
        ys.extend([r['p'] for r in rlist])
        cs.extend([color] * len(rlist))

        if regions and seqid in regions:
            regions_bounds = regions[seqid]
            if len(regions_bounds) < 500:
                # the containment test is rows x regions, hence the cap.
                region_xys.extend([(last_x + r['start'], r['p'], rcolor) for r in rlist \
                  if any((s - 1 <= r['start'] <= e + 1) for s, e in regions_bounds)])
            else:
                sys.stderr.write("regions for %s > 500, not plotting\n" %
                                 seqid)
            # adjust the bounds of each region based on chrom.
            new_bounds.extend([(last_x + s, last_x + e)
                               for s, e in regions_bounds])

        # save the middle of the region to place the label
        chrom_centers.append((seqid, (region_xs[0] + region_xs[-1]) / 2))
        # keep track so that chrs don't overlap.
        last_x = xs[-1]

    xs = np.array(xs)
    ys = np.array(ys) if no_log else -np.log10(ys)

    plt.close()
    f, ax = plt.subplots(1, figsize=(10, 6))

    bonferonni_p = 0.05 / nrows

    if title is not None:
        plt.title(title)

    ax.set_ylabel('' if no_log else '-log10(p)')
    if regions:
        # Plot as colored background
        if len(new_bounds) < 32:
            for s, e in new_bounds:
                ax.axvspan(s - 2,
                           e + 2,
                           facecolor='#EA352B',
                           ec='#EA352B',
                           alpha=0.3,
                           zorder=0)
        # plot as points. Nothing to unpack when no row fell inside a
        # region (or every chromosome exceeded the 500-region cap).
        if region_xys:
            rxs, rys, rcs = zip(*region_xys)
            if not no_log: rys = -np.log10(rys)
            ax.scatter(
                rxs,
                rys,
                s=6,
                c=rcs,
                edgecolors=rcs,
                zorder=2)

    if lines:
        ax.vlines(xs, 0, ys, colors=cs, alpha=0.5)
    else:
        alpha = 0.8 if len(xs) < 10000 else 0.6
        edgecolors = 'k' if len(xs) < 10000 else 'none'
        ax.scatter(xs,
                   ys,
                   s=3.5,
                   c=cs,
                   edgecolors=edgecolors,
                   alpha=alpha,
                   zorder=1)

    # plot 0.05 line after multiple testing. always nlog10'ed since
    # that's the space we're plotting in.
    if bonferonni:
        ax.axhline(y=-np.log10(bonferonni_p), color='0.5', linewidth=2)
    if max(xs) - min(xs) > 10000:
        plt.xlim(0, xs[-1])
    else:
        plt.xlim(xs[0], xs[-1])
    plt.ylim(ymin=0)
    if ymax is not None: plt.ylim(ymax=ymax)
    plt.xticks([c[1] for c in chrom_centers],
               [c[0].replace('chr', '') for c in chrom_centers],
               rotation=-90,
               size=8.5)
    sys.stderr.write("Bonferonni-corrected p-value for %i rows: %.3g\n"
                     % (nrows, bonferonni_p))
    sys.stderr.write("values less than Bonferonni-corrected p-value: %i \n"
                     % (ys > -np.log10(bonferonni_p)).sum())

    if subplots:
        pys = np.sort(10**-ys)  # convert back to actual p-values
        gc = genomic_control(pys)
        ax_qq = f.add_axes((0.74, 0.12, 0.22, 0.22), alpha=0.2)
        ax_qq.text(0.03,
                   0.88,
                   r'$\lambda : %.3f$' % gc,
                   transform=ax_qq.transAxes)
        qqplot(ys, ax_qq)

        ax_hist = f.add_axes((0.12, 0.12, 0.22, 0.22), frameon=True, alpha=0.6)
        hist(pys, ax_hist)

    sys.stderr.write("saving to: %s\n" % image_path)
    f.tight_layout()
    plt.savefig(image_path)

    return image_path
def manhattan(fname, col_num, image_path, no_log, colors, title, lines, ymax, bonferonni=False, regions=None):
    """
    Draw a manhattan plot of the rows read from `fname` and save it to
    `image_path`.

    Arguments:
        fname, col_num: forwarded to `bediter`; the rows it yields must
            provide "chrom", "start" and "p".
        image_path: file the figure is saved to.
        no_log: plot the values as they are instead of -log10(value).
        colors: colors that are cycled through, one per chromosome.
        title: plot title; skipped when None.
        lines: draw vertical lines instead of scatter points.
        ymax: upper y limit; left automatic when None.
        bonferonni: draw a horizontal line at -log10(0.05 / number of rows).
        regions: keyed by chromosome with [(start, stop), ...] extents of
            the regions to highlight as shaded bands.

    Returns:
        `image_path`.
    """
    # sorted() has no cmp= argument on Python 3; wrap the comparison instead.
    from functools import cmp_to_key

    xs, ys, cs = [], [], []
    colors = cycle(colors)
    chrom_centers = []

    last_x = 0
    nrows = 0
    giter = [(seqid, list(rlist)) for seqid, rlist in groupby(bediter(fname, col_num), key=itemgetter("chrom"))]

    new_bounds = []
    for seqid, rlist in sorted(giter, key=cmp_to_key(chr_cmp)):
        color = next(colors)
        nrows += len(rlist)

        # since chroms are on the same plot. add this chrom to the end of the
        # last chrom
        region_xs = [last_x + r["start"] for r in rlist]
        xs.extend(region_xs)
        ys.extend([r["p"] for r in rlist])
        cs.extend([color] * len(rlist))

        if regions and seqid in regions:
            # adjust the bounds of each region based on chrom.
            new_bounds.extend([(last_x + s, last_x + e) for s, e in regions[seqid]])

        # save the middle of the region to place the label
        chrom_centers.append((seqid, (region_xs[0] + region_xs[-1]) / 2))
        # keep track so that chrs don't overlap.
        last_x = xs[-1]

    xs = np.array(xs)
    ys = np.array(ys) if no_log else -np.log10(ys)

    plt.close()
    f = plt.figure()
    ax = f.add_axes((0.1, 0.09, 0.88, 0.85))

    bonferonni_p = 0.05 / nrows

    if title is not None:
        plt.title(title)

    ax.set_ylabel("" if no_log else "-log10(p)")
    if regions:
        # shade each highlighted region behind the points.
        for s, e in new_bounds:
            ax.axvspan(s - 55, e + 10, facecolor="#222222", ec="#222222", alpha=0.3, zorder=0)
    if lines:
        ax.vlines(xs, 0, ys, colors=cs, alpha=0.5)
    else:
        ax.scatter(xs, ys, s=3.5, c=cs, edgecolors="none", alpha=0.6, zorder=1)

    # plot 0.05 line after multiple testing. always nlog10'ed since
    # that's the space we're plotting in.
    if bonferonni:
        ax.axhline(y=-np.log10(bonferonni_p), color="0.5", linewidth=2)
    plt.axis("tight")
    plt.xlim(0, xs[-1])
    plt.ylim(ymin=0)
    if ymax is not None:
        plt.ylim(ymax=ymax)
    plt.xticks([c[1] for c in chrom_centers], [c[0] for c in chrom_centers], rotation=-90, size=8.5)
    sys.stderr.write("Bonferonni-corrected p-value for %i rows: %.3g\n" % (nrows, bonferonni_p))
    sys.stderr.write(
        "values less than Bonferonni-corrected p-value: %i \n" % (ys > -np.log10(bonferonni_p)).sum()
    )

    sys.stderr.write("saving to: %s\n" % image_path)
    plt.savefig(image_path)

    return image_path
예제 #7
0
def pipeline(col_num,
             step,
             dist,
             acf_dist,
             prefix,
             threshold,
             seed,
             bed_files,
             mlog=True,
             region_filter_p=1,
             region_filter_n=None,
             genome_control=False,
             db=None,
             use_fdr=True):
    """
    Run the complete analysis on `bed_files`, writing every intermediate
    result to a file whose name starts with `prefix`.

    Files written, in order: `.args.txt` (command line, date, version),
    `.acf.txt`, `.slk.bed.gz`, `.slk.gc.bed.gz` (only with
    `genome_control`), `.fdr.bed.gz`, `.regions.bed.gz`,
    `.regions-p.bed.gz`, `.regions-t.bed`, `.manhattan.png` (skipped when
    the plotting import fails) and `.anno.<db>.bed` (only with `db`).
    Progress messages go to stderr.

    Arguments:
        col_num: p-value column, forwarded to the stepsize, acf, slk and
            filter steps.
        step: lag step size; computed from the data (capped at `acf_dist`)
            when None.
        dist: distance forwarded to `peaks.peaks`.
        acf_dist: upper bound of the lags used for the autocorrelation.
        prefix: output prefix; trailing dots are removed.
        threshold, seed: forwarded to `peaks.peaks`.
        bed_files: input bed files; the first one is also used by the
            filter step.
        mlog: forwarded to `acf.acf`.
        region_filter_p: regions whose column 7 value is above this are
            dropped from `.regions-t.bed`.
        region_filter_n: regions whose column 5 value is below this are
            dropped from `.regions-t.bed`; None means 0.
        genome_control: also write genomic-control adjusted p-values and
            use that file as FDR input.
        db: database name handed to `cruzdb.Genome` for annotation.
        use_fdr: call peaks on the last column of `.fdr.bed.gz` when True,
            on the second to last column otherwise.

    Exits the interpreter via `sys.exit()` when no regions are found.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import datetime
    import operator

    # Messages are emitted with .write() so the function behaves the same
    # whether or not `print` is a statement in the running interpreter.
    log = sys.stderr.write

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        log("calculated stepsize as: %i\n" % step)

    # range() is not a list on Python 3, so materialize before appending.
    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files,
                                lags,
                                col_num,
                                simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to the max requested distance but stop at the first lag whose
    # autocorrelation drops below 0.04. That lag itself is kept only while
    # fewer than 3 lags have been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N, p))
        low_corr = a[1][0] < 0.04
        if low_corr and len(acf_vals) > 2: break
        acf_vals.append(a)
        if low_corr: break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        fh.write("date: %s\n" % datetime.datetime.today())
        from .__init__ import __version__
        fh.write("version: %s\n" % __version__)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        log("wrote: %s\n" % fh.name)
    with open(prefix + ".acf.txt") as fh:
        log("ACF:\n%s\n" % fh.read())

    # spvals: adjusted p-values, opvals: original p-values.
    spvals, opvals = [], []
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    log("# original lambda: %.2f\n" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    log("wrote: %s with lambda: %.2f\n" % (fhslk.name, gc_lambda))

    if genome_control:
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # NOTE(review): the input starts with a '#chrom' header line; this
        # pairs line i with adj[i], which is only aligned if `bediter` also
        # yields an entry for that line -- confirm.
        with ts.nopen(prefix + ".slk.gc.bed.gz", "w") as fhslk:
            with ts.nopen(prefix + ".slk.bed.gz") as slk_in:
                for i, line in enumerate(slk_in):
                    fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        log("wrote: %s\n" % fhslk.name)

    # fhslk.name is the gc-adjusted file when genome_control is set.
    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        log("wrote: %s\n" % fh.name)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # peaks() is a generator that writes to fh; exhaust it.
        list(
            peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                        threshold, seed, dist, fh, operator.le))
    with ts.nopen(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    log("wrote: %s (%i regions)\n" % (fregions, n_regions))
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz", prefix + ".regions.bed.gz", -2, step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        log("wrote: %s, (regions with corrected-p < 0.05: %i)\n"
            % (fh.name, N))

    regions_bed = fh.name
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(
                filter.filter(bed_files[0], regions_bed, p_col_name=col_num)):
            if i == 0:
                # first row is the header.
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                N += 1
            fh.write("\t".join(toks) + "\n")
        log(("wrote: %s, (regions with region-p "
             "< %.3f and n-probes >= %i: %i)\n")
            % (fh.name, region_filter_p, region_filter_n, N))

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz",
                            3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'],
                            "",
                            False,
                            None,
                            regions=regions,
                            bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"),
                       out=fh,
                       feature_strand=True,
                       parallel=len(spvals) > 500)
        log("wrote: %s annotated with %s\n" % (fh.name, db))
예제 #8
0
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files, mlog=False,
    region_filter_p=1, region_filter_n=1, genome_control=False, db=None):
    """
    Run the complete analysis on `bed_files`, writing every intermediate
    result to a file whose name starts with `prefix`.

    Files written, in order: `.args.txt` (command line and date),
    `.acf.txt`, `.slk.bed`, `.slk.gc.bed` (only with `genome_control`),
    `.fdr.bed`, `.regions.bed`, `.regions-p.bed`, `.regions-t.bed` (only
    when the first bed file has 't', 'start' and 'end' header columns),
    `.manhattan.png` (skipped when the plotting import fails) and
    `.anno.<db>.bed` (only with `db`). Progress messages go to stderr.

    Arguments:
        col_num: p-value column, forwarded to the stepsize, acf, slk and
            filter steps.
        step: lag step size, also forwarded to `peaks.peaks` and
            `region_p.region_p`; computed from the data when None.
        dist: upper bound of the lags used for the autocorrelation.
        prefix: output prefix; trailing dots are removed.
        threshold, seed: forwarded to `peaks.peaks`.
        bed_files: input bed files; the first one is also used for the
            header check and the filter step.
        mlog: forwarded to `acf.acf` and `region_p.region_p`.
        region_filter_p: regions whose column 7 value is above this are
            dropped from `.regions-t.bed`.
        region_filter_n: regions whose column 5 value is below this are
            dropped from `.regions-t.bed`.
        genome_control: also write genomic-control adjusted p-values and
            use that file as FDR input.
        db: database name handed to `cruzdb.Genome` for annotation.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import datetime
    import operator

    # Messages are emitted with .write() so the function behaves the same
    # whether or not `print` is a statement in the running interpreter.
    log = sys.stderr.write

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        log("calculated stepsize as: %i\n" % step)

    # range() is not a list on Python 3, so materialize before appending.
    lags = list(range(1, dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to the max requested distance but stop at the first lag whose
    # autocorrelation drops below 0.04. That lag itself is kept only while
    # fewer than 3 lags have been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N, p))
        low_corr = a[1][0] < 0.04
        if low_corr and len(acf_vals) > 2: break
        acf_vals.append(a)
        if low_corr: break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        fh.write("date: %s\n" % datetime.datetime.today())

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        log("wrote: %s\n" % fh.name)
    with open(prefix + ".acf.txt") as fh:
        log("ACF:\n%s\n" % fh.read())

    # spvals: adjusted p-values, opvals: original p-values.
    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    log("# original lambda: %.2f\n" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    log("wrote: %s with lambda: %.2f\n" % (fhslk.name, gc_lambda))

    if genome_control:
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed", -1)])
        with open(prefix + ".slk.gc.bed", "w") as fhslk:
            with open(prefix + ".slk.bed") as slk_in:
                for i, line in enumerate(slk_in):
                    fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        log("wrote: %s\n" % fhslk.name)

    # fhslk.name is the gc-adjusted file when genome_control is set.
    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        log("wrote: %s\n" % fh.name)

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        # peaks() is a generator that writes to fh; exhaust it.
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
            step, fh, operator.le))
    with open(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    log("wrote: %s (%i regions)\n" % (fregions, n_regions))

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                               prefix + ".slk.bed",
                               prefix + ".regions.bed", -2,
                               0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        log("wrote: %s, (regions with corrected-p < 0.05: %i)\n"
            % (fh.name, N))

    regions_bed = fh.name
    # Read only the first line of the first bed file and close the handle
    # right away. Binary mode plus an explicit decode gives text for both
    # plain and gzipped input.
    opener = gzip.open if bed_files[0].endswith(".gz") else open
    bed_fh = opener(bed_files[0], "rb")
    try:
        first_line = bed_fh.readline()
    finally:
        bed_fh.close()
    if not isinstance(first_line, str):
        first_line = first_line.decode()
    # strip the line ending so the last column name can match too.
    header = first_line.rstrip("\r\n").split("\t")
    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                p_col_name=col_num)):
                if i == 0:
                    # first row is the header.
                    toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p: continue
                    if int(toks[4]) < region_filter_n: continue
                    N += 1
                fh.write("\t".join(toks) + "\n")
            log(("wrote: %s, (regions with region-p"
                 "< %.3f and n-probes >= %i: %i)\n")
                % (fh.name, region_filter_p, region_filter_n, N))

    try:
        from cpv import manhattan
        # fh is the regions-t file when it was written, else regions-p.
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=True)
    except ImportError:
        pass # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        log("wrote: %s annotated with %s\n" % (fh.name, db))