Exemplo n.º 1
0
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
        bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
        genome_control=False, db=None, use_fdr=True):
    """Run the cpv steps in sequence, writing every result under ``prefix``.

    Order of work: ACF -> slk p-value adjustment -> optional genomic-control
    adjustment -> FDR -> peak/region calling -> region p-values -> region
    filtering -> optional manhattan plot -> optional cruzdb annotation.

    Files written (``prefix`` has any trailing "." stripped first):
    ``.args.txt``, ``.acf.txt``, ``.slk.bed.gz``, ``.slk.gc.bed.gz`` (only
    with ``genome_control``), ``.fdr.bed.gz``, ``.regions.bed.gz``,
    ``.regions-p.bed.gz``, ``.regions-t.bed``, ``.manhattan.png`` (only if
    ``cpv.manhattan`` imports) and ``.anno.<db>.bed`` (only with ``db``).
    Progress messages go to stderr.

    Args:
        col_num: p-value column selector, forwarded to ``stepsize``, ``acf``,
            ``slk.adjust_pvals`` and (as ``p_col_name``) ``filter.filter``.
        step: lag step size. When None it is computed as
            ``min(acf_dist, stepsize.stepsize(bed_files, col_num))``.
        dist: distance argument forwarded to ``peaks.peaks``.
        acf_dist: upper bound of the lag range ``range(1, acf_dist, step)``.
            Must be > 1, otherwise no lag is generated and indexing the
            empty list raises IndexError.
        prefix: output path prefix.
        threshold, seed: forwarded unchanged to ``peaks.peaks``.
        bed_files: input BED paths; the first one is also the input to the
            region filter step.
        mlog: forwarded to ``acf.acf``.
        region_filter_p: regions whose column 7 value exceeds this are
            dropped from ``.regions-t.bed``.
        region_filter_n: regions whose column 5 value is below this are
            dropped from ``.regions-t.bed``; None means 0.
        genome_control: when true, write a genomic-control adjusted copy of
            the slk file and run the FDR step on that copy.
        db: when not None, annotate the last regions file with cruzdb.
        use_fdr: call peaks on the last column of the FDR file when true,
            otherwise on the second-to-last column.

    Side effects:
        Prepends the parent of this file's directory to ``sys.path`` and
        calls ``sys.exit()`` when no regions are found.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    from contextlib import closing
    import datetime
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    # one extra lag so the final bin reaches past acf_dist
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk outward over the lags and stop at the first correlation < 0.04.
    # That weak lag is itself kept only while fewer than three lags have
    # been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        weak = a[1][0] < 0.04
        if weak and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if weak:
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    acf_path = prefix + ".acf.txt"
    with open(acf_path, "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
    # status line belongs on stderr, not inside the ACF file
    print("wrote: %s" % acf_path, file=sys.stderr)
    with open(acf_path) as fh:
        print("ACF:\n", fh.read(), file=sys.stderr)

    # 'f' arrays keep the per-row p-values compact
    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals), file=sys.stderr)
    del opvals

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
            file=sys.stderr)

    if genome_control:
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # fhslk is rebound on purpose: the FDR step below reads fhslk.name
        with ts.nopen(prefix + ".slk.gc.bed.gz", "w") as fhslk:
            with closing(ts.nopen(prefix + ".slk.bed.gz")) as slk_in:
                # NOTE(review): slk.bed.gz starts with the '#chrom' header
                # written above, and that line is enumerated here too. If
                # bediter skips the header, adj[i] is shifted by one row and
                # the last line indexes past the end of adj -- confirm
                # against bediter.
                for i, line in enumerate(slk_in):
                    print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]), file=fhslk)
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print("wrote: %s" % fh.name, file=sys.stderr)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2, threshold, seed,
            dist, fh, operator.le))
    with closing(ts.nopen(fregions)) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, _sim_p in region_p.region_p(
                               prefix + ".slk.bed.gz",
                               prefix + ".regions.bed.gz", -2,
                               step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)" \
                % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0],
            regions_bed, p_col_name=col_num)):
            if i == 0:
                # first row is the header: mark it as a comment line
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                N += 1
            # rows go into the regions-t file, which later steps read
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
                            "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N),
                file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=False)
    except ImportError:
        pass # they dont have matplotlib; the plot is optional

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        # annotate the most recently written regions file
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db), file=sys.stderr)
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files, mlog=False,
    region_filter_p=1, region_filter_n=1):
    """Run the cpv steps in sequence, writing every result under ``prefix``.

    Order of work: ACF -> slk p-value adjustment -> FDR -> peak/region
    calling -> region p-values -> region filtering.

    Files written (``prefix`` has any trailing "." stripped first):
    ``.args.txt``, ``.acf.txt``, ``.slk.bed``, ``.fdr.bed``,
    ``.regions.bed``, ``.regions-p.bed`` and ``.regions-t.bed``.
    Progress messages go to stderr.

    Args:
        col_num: p-value column selector, forwarded to ``stepsize``, ``acf``
            and ``slk.adjust_pvals``.
        step: lag step size; computed with ``stepsize.stepsize`` when None.
            Also forwarded to ``peaks.peaks`` and ``region_p.region_p``.
        dist: upper bound of the lag range ``range(1, dist, step)``. Must be
            > 1, otherwise no lag is generated and indexing the empty list
            raises IndexError.
        prefix: output path prefix.
        threshold, seed: forwarded unchanged to ``peaks.peaks``.
        bed_files: input BED paths; the first one is also the input to the
            region filter step.
        mlog: forwarded to ``acf.acf`` and ``region_p.region_p``.
        region_filter_p: regions whose column 7 value exceeds this are
            dropped from ``.regions-t.bed``.
        region_filter_n: regions whose column 5 value is below this are
            dropped from ``.regions-t.bed``.

    Side effects:
        Prepends the parent of this file's directory to ``sys.path``.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    import datetime
    import operator

    def _log(msg):
        # One stderr line; behaves the same on Python 2 and Python 3.
        sys.stderr.write(msg + "\n")

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        _log("calculated stepsize as: %i" % step)

    # list() so that .append also works where range() is lazy
    lags = list(range(1, dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")

    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk outward over the lags and stop at the first correlation < 0.04.
    # That weak lag is itself kept only while fewer than three lags have
    # been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        weak = a[1][0] < 0.04
        if weak and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if weak:
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        # the argument line is followed by one blank line
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        fh.write("date: %s\n" % datetime.datetime.today())

    acf_path = prefix + ".acf.txt"
    with open(acf_path, "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        _log("wrote: %s" % fh.name)
    # echo the ACF table, closing the read handle afterwards
    with open(acf_path) as acf_in:
        _log("ACF:\n" + acf_in.read())

    with open(prefix + ".slk.bed", "w") as fh:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fh.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
        _log("wrote: %s" % fh.name)

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(prefix + ".slk.bed", -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        _log("wrote: %s" % fh.name)

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        # NOTE(review): the return value is discarded. If peaks.peaks is a
        # generator in this version of cpv, nothing is written unless it is
        # consumed (e.g. with list(...)) -- confirm against peaks.peaks.
        peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
            step, fh, operator.le)
    with open(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    _log("wrote: %s (%i regions)" % (fregions, n_regions))

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, _sim_p in region_p.region_p(
                               prefix + ".slk.bed",
                               prefix + ".regions.bed", -2,
                               0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        _log("wrote: %s, (regions with corrected-p < 0.05: %i)"
             % (fh.name, N))

    regions_bed = fh.name
    with open(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed)):
            if i == 0:
                # first row is the header: mark it as a comment line
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                N += 1
            fh.write("\t".join(toks) + "\n")
        _log("wrote: %s, (regions with region-p < %.3f and n-probes >= %i: %i)"
             % (fh.name, region_filter_p, region_filter_n, N))
Exemplo n.º 3
0
def pipeline(col_num,
             step,
             dist,
             acf_dist,
             prefix,
             threshold,
             seed,
             bed_files,
             mlog=True,
             region_filter_p=1,
             region_filter_n=None,
             genome_control=False,
             db=None,
             use_fdr=True):
    """Run the cpv steps in sequence, writing every result under ``prefix``.

    Order of work: ACF -> slk p-value adjustment -> optional genomic-control
    adjustment -> FDR -> peak/region calling -> region p-values -> region
    filtering -> optional manhattan plot -> optional cruzdb annotation.

    Files written (``prefix`` has any trailing "." stripped first):
    ``.args.txt``, ``.acf.txt``, ``.slk.bed.gz``, ``.slk.gc.bed.gz`` (only
    with ``genome_control``), ``.fdr.bed.gz``, ``.regions.bed.gz``,
    ``.regions-p.bed.gz``, ``.regions-t.bed``, ``.manhattan.png`` (only if
    ``cpv.manhattan`` imports) and ``.anno.<db>.bed`` (only with ``db``).
    Progress messages go to stderr.

    Args:
        col_num: p-value column selector, forwarded to ``stepsize``, ``acf``,
            ``slk.adjust_pvals`` and (as ``p_col_name``) ``filter.filter``.
        step: lag step size. When None it is computed as
            ``min(acf_dist, stepsize.stepsize(bed_files, col_num))``.
        dist: distance argument forwarded to ``peaks.peaks``.
        acf_dist: upper bound of the lag range ``range(1, acf_dist, step)``.
            Must be > 1, otherwise no lag is generated and indexing the
            empty list raises IndexError.
        prefix: output path prefix.
        threshold, seed: forwarded unchanged to ``peaks.peaks``.
        bed_files: input BED paths; the first one is also the input to the
            region filter step.
        mlog: forwarded to ``acf.acf``.
        region_filter_p: regions whose column 7 value exceeds this are
            dropped from ``.regions-t.bed``.
        region_filter_n: regions whose column 5 value is below this are
            dropped from ``.regions-t.bed``; None means 0.
        genome_control: when true, write a genomic-control adjusted copy of
            the slk file and run the FDR step on that copy.
        db: when not None, annotate the last regions file with cruzdb.
        use_fdr: call peaks on the last column of the FDR file when true,
            otherwise on the second-to-last column.

    Side effects:
        Prepends the parent of this file's directory to ``sys.path`` and
        calls ``sys.exit()`` when no regions are found.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    from contextlib import closing
    import datetime
    import operator

    def _log(msg):
        # One stderr line; behaves the same on Python 2 and Python 3.
        sys.stderr.write(msg + "\n")

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        _log("calculated stepsize as: %i" % step)

    # list() so that .append also works where range() is lazy
    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files,
                                lags,
                                col_num,
                                simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk outward over the lags and stop at the first correlation < 0.04.
    # That weak lag is itself kept only while fewer than three lags have
    # been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        weak = a[1][0] < 0.04
        if weak and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if weak:
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        # the argument line is followed by one blank line
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        fh.write("date: %s\n" % datetime.datetime.today())
        from .__init__ import __version__
        # 1-tuple keeps the formatting safe if __version__ is itself a tuple
        fh.write("version: %s\n" % (__version__,))

    acf_path = prefix + ".acf.txt"
    with open(acf_path, "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        _log("wrote: %s" % fh.name)
    # echo the ACF table, closing the read handle afterwards
    with open(acf_path) as acf_in:
        _log("ACF:\n" + acf_in.read())

    spvals, opvals = [], []
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    _log("# original lambda: %.2f" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    _log("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda))

    if genome_control:
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # fhslk is rebound on purpose: the FDR step below reads fhslk.name
        with ts.nopen(prefix + ".slk.gc.bed.gz", "w") as fhslk:
            with closing(ts.nopen(prefix + ".slk.bed.gz")) as slk_in:
                # NOTE(review): slk.bed.gz starts with the '#chrom' header
                # written above, and that line is enumerated here too. If
                # bediter skips the header, adj[i] is shifted by one row and
                # the last line indexes past the end of adj -- confirm
                # against bediter.
                for i, line in enumerate(slk_in):
                    fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        _log("wrote: %s" % fhslk.name)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        _log("wrote: %s" % fh.name)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks
        list(
            peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                        threshold, seed, dist, fh, operator.le))
    with closing(ts.nopen(fregions)) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    _log("wrote: %s (%i regions)" % (fregions, n_regions))
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, _sim_p in region_p.region_p(
                prefix + ".slk.bed.gz", prefix + ".regions.bed.gz", -2, step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        _log("wrote: %s, (regions with corrected-p < 0.05: %i)"
             % (fh.name, N))

    regions_bed = fh.name
    if region_filter_n is None: region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(
                filter.filter(bed_files[0], regions_bed, p_col_name=col_num)):
            if i == 0:
                # first row is the header: mark it as a comment line
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                N += 1
            fh.write("\t".join(toks) + "\n")
        _log(("wrote: %s, (regions with region-p "
              "< %.3f and n-probes >= %i: %i)")
             % (fh.name, region_filter_p, region_filter_n, N))

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz",
                            3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'],
                            "",
                            False,
                            None,
                            regions=regions,
                            bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib; the plot is optional

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        # annotate the most recently written regions file
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"),
                       out=fh,
                       feature_strand=True,
                       parallel=len(spvals) > 500)
        _log("wrote: %s annotated with %s" % (fh.name, db))
Exemplo n.º 4
0
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files, mlog=False,
    region_filter_p=1, region_filter_n=1, genome_control=False, db=None):
    """Run the cpv steps in sequence, writing every result under ``prefix``.

    Order of work: ACF -> slk p-value adjustment -> optional genomic-control
    adjustment -> FDR -> peak/region calling -> region p-values -> region
    filtering (only when the first input has 't', 'start' and 'end' header
    columns) -> optional manhattan plot -> optional cruzdb annotation.

    Files written (``prefix`` has any trailing "." stripped first):
    ``.args.txt``, ``.acf.txt``, ``.slk.bed``, ``.slk.gc.bed`` (only with
    ``genome_control``), ``.fdr.bed``, ``.regions.bed``, ``.regions-p.bed``,
    ``.regions-t.bed`` (only when the header check passes),
    ``.manhattan.png`` (only if ``cpv.manhattan`` imports) and
    ``.anno.<db>.bed`` (only with ``db``). Progress messages go to stderr.

    Args:
        col_num: p-value column selector, forwarded to ``stepsize``, ``acf``,
            ``slk.adjust_pvals`` and (as ``p_col_name``) ``filter.filter``.
        step: lag step size; computed with ``stepsize.stepsize`` when None.
            Also forwarded to ``peaks.peaks`` and ``region_p.region_p``.
        dist: upper bound of the lag range ``range(1, dist, step)``. Must be
            > 1, otherwise no lag is generated and indexing the empty list
            raises IndexError.
        prefix: output path prefix.
        threshold, seed: forwarded unchanged to ``peaks.peaks``.
        bed_files: input BED paths; the first one supplies the header for
            the column check and is the input to the region filter step.
        mlog: forwarded to ``acf.acf`` and ``region_p.region_p``.
        region_filter_p: regions whose column 7 value exceeds this are
            dropped from ``.regions-t.bed``.
        region_filter_n: regions whose column 5 value is below this are
            dropped from ``.regions-t.bed``.
        genome_control: when true, write a genomic-control adjusted copy of
            the slk file and run the FDR step on that copy.
        db: when not None, annotate the last regions file with cruzdb.

    Side effects:
        Prepends the parent of this file's directory to ``sys.path``.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import datetime
    import operator

    def _log(msg):
        # One stderr line; behaves the same on Python 2 and Python 3.
        sys.stderr.write(msg + "\n")

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        _log("calculated stepsize as: %i" % step)

    # list() so that .append also works where range() is lazy
    lags = list(range(1, dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk outward over the lags and stop at the first correlation < 0.04.
    # That weak lag is itself kept only while fewer than three lags have
    # been collected.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        weak = a[1][0] < 0.04
        if weak and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if weak:
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        # the argument line is followed by one blank line
        fh.write(" ".join(sys.argv[1:]) + "\n\n")
        fh.write("date: %s\n" % datetime.datetime.today())

    acf_path = prefix + ".acf.txt"
    with open(acf_path, "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        _log("wrote: %s" % fh.name)
    # echo the ACF table, closing the read handle afterwards
    with open(acf_path) as acf_in:
        _log("ACF:\n" + acf_in.read())

    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    _log("# original lambda: %.2f" % genomic_control(opvals))
    del opvals

    gc_lambda = genomic_control(spvals)
    _log("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda))

    if genome_control:
        adj = genome_control_adjust([d['p'] for d in bediter(prefix + ".slk.bed", -1)])
        # fhslk is rebound on purpose: the FDR step below reads fhslk.name
        with open(prefix + ".slk.gc.bed", "w") as fhslk:
            with open(prefix + ".slk.bed") as slk_in:
                for i, line in enumerate(slk_in):
                    fhslk.write("%s\t%.5g\n" % (line.rstrip("\r\n"), adj[i]))
        _log("wrote: %s" % fhslk.name)

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        _log("wrote: %s" % fh.name)

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed,
            step, fh, operator.le))
    with open(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    _log("wrote: %s (%i regions)" % (fregions, n_regions))

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, _sim_p in region_p.region_p(
                               prefix + ".slk.bed",
                               prefix + ".regions.bed", -2,
                               0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        _log("wrote: %s, (regions with corrected-p < 0.05: %i)"
             % (fh.name, N))

    regions_bed = fh.name

    # Read only the header line of the first input and close the handle.
    opener = gzip.open if bed_files[0].endswith(".gz") else open
    with opener(bed_files[0]) as header_in:
        first_line = header_in.readline()
    if not isinstance(first_line, str):
        # gzip.open yields bytes on Python 3
        first_line = first_line.decode()
    # Strip the newline so a required name in the last column still matches.
    header = first_line.rstrip("\r\n").split("\t")

    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                p_col_name=col_num)):
                if i == 0:
                    # first row is the header: mark it as a comment line
                    toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p: continue
                    if int(toks[4]) < region_filter_n: continue
                    N += 1
                fh.write("\t".join(toks) + "\n")
            _log(("wrote: %s, (regions with region-p "
                  "< %.3f and n-probes >= %i: %i)")
                 % (fh.name, region_filter_p, region_filter_n, N))

    # From here on fh is the regions-t file when it was written above,
    # otherwise still the regions-p file.
    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed", 3, prefix.rstrip(".") + ".manhattan.png",
                         False, ['#959899', '#484B4C'], "", False, None,
                         regions=regions, bonferonni=True)
    except ImportError:
        pass # they dont have matplotlib; the plot is optional

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        # annotate the most recently written regions file
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"), out=fh,
                    feature_strand=True, parallel=len(spvals) > 500)
        _log("wrote: %s annotated with %s" % (fh.name, db))