Exemplo n.º 1
0
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax",
                 default=None,
                 type="int",
                 help="Limit ymax [default: %default]")
    p.add_option(
        "--spans",
        default=False,
        action="store_true",
        help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    not_covered = []
    yy = .9
    for bedfile, c in zip(bedfiles, "rgbcky"):
        if not opts.spans:
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe(
                    [bedfile, "--span", "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend,
                                               thousands(notcoveredbases))
        not_covered.append(msg)
        print >> sys.stderr, msg
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    ylabel = "Average depth per {0}Kb".format(size / bins / 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 2
0
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    prefix, = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    bedpefile = prefix + ".bedpe"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((.1, .15), .8, .03, fc='k'))

    # basecoverage and matecoverage
    ax = fig.add_axes([.1, .45, .8, .45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    baseline, = ax.plot(x, y, 'g-')
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    mateline, = ax.plot(x, y, 'r-')
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = .8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (.1 + x * bpratio)
    ypos = .15 + .03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = .25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, .15), xend - xstart, .03, fc='w'))

    root.text(.5, .1, scf, color='b', ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(.5, .1, scf, color='b', ha="center")
    root.text(.5, .05, warn_msg, color='gray', ha="center")
    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
Exemplo n.º 3
0
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int",
                 help="Limit ymax [default: %default]")
    p.add_option("--spans", default=False, action="store_true",
                 help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    not_covered = []
    yy = .9
    for bedfile, c in zip(bedfiles, "rgbcky"):
        if not opts.spans:
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe([bedfile, "--span",
                    "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend, thousands(notcoveredbases))
        not_covered.append(msg)
        print >> sys.stderr, msg
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    ylabel = "Average depth per {0}Kb".format(size / bins / 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname ="{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 4
0
Arquivo: bed.py Projeto: radaniba/jcvi
def sample(args):
    """
    %prog sample bedfile sizesfile

    Sample bed file and remove high-coverage regions.

    When option --targetsize is used, this program uses a differnent mode. It
    first calculates the current total bases from all ranges and then compare to
    targetsize, if more, then sample down as close to targetsize as possible.
    """
    import random
    from jcvi.assembly.coverage import Coverage

    p = OptionParser(sample.__doc__)
    p.add_option("--max",
                 default=10,
                 type="int",
                 help="Max depth allowed [default: %default]")
    p.add_option(
        "--targetsize",
        type="int",
        help="Sample bed file to get target base number [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, sizesfile = args
    pf = bedfile.rsplit(".", 1)[0]

    targetsize = opts.targetsize
    if targetsize:
        bed = Bed(bedfile)
        samplebed = pf + ".sample.bed"
        fw = open(samplebed, "w")
        nfeats = len(bed)
        nbases = bed.sum(unique=False)
        targetfeats = int(round(nfeats * targetsize / nbases))
        sub_bed = random.sample(bed, targetfeats)
        for b in sub_bed:
            print >> fw, b

        logging.debug("File written to `{0}`.".format(samplebed))
        return

    c = Coverage(bedfile, sizesfile)
    coveragefile = c.filename
    samplecoveragefile = pf + ".sample.coverage"
    fw = open(samplecoveragefile, "w")
    fp = open(coveragefile)
    for row in fp:
        seqid, start, end, cov = row.split()
        cov = int(cov)
        if cov <= opts.max:
            fw.write(row)
    fw.close()

    samplebedfile = pf + ".sample.bed"
    cmd = "intersectBed -a {0} -b {1} -wa -u".format(bedfile,
                                                     samplecoveragefile)
    sh(cmd, outfile=samplebedfile)
    logging.debug("Sampled bedfile written to `{0}`.".format(samplebedfile))
Exemplo n.º 5
0
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    prefix, = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    bedpefile = prefix + ".bedpe"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((.1, .15), .8, .03, fc='k'))

    # basecoverage and matecoverage
    ax = fig.add_axes([.1, .45, .8, .45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    baseline, = ax.plot(x, y, 'g-')
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    mateline, = ax.plot(x, y, 'r-')
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = .8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (.1 + x * bpratio)
    ypos = .15 + .03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = .25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, .15), xend - xstart, .03, fc='w'))

    root.text(.5, .1, scf, color='b', ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(.5, .1, scf, color='b', ha="center")
    root.text(.5, .05, warn_msg, color='gray', ha="center")
    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)