Пример #1
0
def main():
    p = OptionParser(__doc__)
    p.add_option("--order",
                help="The order to plot the tracks, comma-separated")
    opts, args, iopts = p.set_image_options()

    if len(args) != 3:
        sys.exit(not p.print_help())

    chr, sizes, datadir = args
    order = opts.order
    hlsuffix = opts.hlsuffix
    if order:
        order = order.split(",")
    sizes = Sizes(sizes)
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    canvas = (.12, .35, .8, .35)
    chr_size = sizes.get_size(chr)
    c = Coverage(fig, root, canvas, chr, (0, chr_size), datadir,
                 order=order, hlsuffix=hlsuffix)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = chr + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Пример #2
0
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    p = OptionParser(agp.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())
    anchored = set()

    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        anchored |= set([x.contig_name for x in co])
        obj = op.basename(ofile).split('.')[0]
        co.write_agp(obj, sizes, fwagp)

    singletons = contigs - anchored
    logging.debug('Anchored: {}, Singletons: {}'.
                  format(len(anchored), len(singletons)))

    for s in natsorted(singletons):
        order_to_agp(s, [(s, "?")], sizes, fwagp)
Пример #3
0
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize : tsize + ksize]
        bcount = b.count()
        print >> fw, "\t".join(str(x) for x in (name, bcount))
        tsize += ksize
Пример #4
0
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print >> fw, "\t".join(str(x) for x in \
                               (bac, 0, min(flank, size), bac + "L"))
        print >> fw, "\t".join(str(x) for x in \
                               (bac, max(size - flank, 0), size, bac + "R"))
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
Пример #5
0
Файл: bed.py Проект: yangjl/jcvi
def longest(args):
    """
    %prog longest bedfile fastafile

    Select longest feature within overlapping piles.
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--maxsize", default=20000, type="int",
                 help="Limit max size")
    p.add_option("--minsize", default=60, type="int",
                 help="Limit min size")
    p.add_option("--precedence", default="Medtr",
                 help="Accessions with prefix take precedence")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    maxsize = opts.maxsize
    minsize = opts.minsize
    prec = opts.precedence
    mergedbed = mergeBed(bedfile, nms=True)
    sizes = Sizes(fastafile).mapping
    bed = Bed(mergedbed)

    pf = bedfile.rsplit(".", 1)[0]
    ids = set()
    for b in bed:
        accns = b.accn.split(";")
        prec_accns = [x for x in accns if x.startswith(prec)]
        if prec_accns:
            accns = prec_accns
        accn_sizes = [(sizes.get(x, 0), x) for x in accns]
        accn_sizes = [(size, x) for size, x in accn_sizes if size < maxsize]
        if not accn_sizes:
            continue
        max_size, max_accn = max(accn_sizes)
        if max_size < minsize:
            continue
        ids.add(max_accn)

    newids = remove_isoforms(ids)
    logging.debug("Remove isoforms: before={0} after={1}".\
                    format(len(ids), len(newids)))

    longestidsfile = pf + ".longest.ids"
    fw = open(longestidsfile, "w")
    print >> fw, "\n".join(newids)
    fw.close()
    logging.debug("A total of {0} records written to `{1}`.".\
                    format(len(newids), longestidsfile))

    longestbedfile = pf + ".longest.bed"
    some([bedfile, longestidsfile, "--outfile={0}".format(longestbedfile),
            "--no_strip_names"])
Пример #6
0
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
            help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
            help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        tmpname = scaffoldID + ".sizes"
        tmp = open(tmpname, "w")
        tmp.write("{0}\t{1}".format(scaffoldID, scafsize))
        tmp.close()

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
Пример #7
0
 def __init__(self, filename, fastafile):
     super(EvidenceFile, self).__init__(filename)
     sz = Sizes(fastafile)
     sizes = [None]  # tig-list starts at 1
     for name, size in sz.iter_sizes():
         sizes.append((name, size))
     self.sizes = sizes
     self.sz = sz.mapping
     self.scf = {}
Пример #8
0
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    agp = AGP(agpfile)
    scaffolds_seen = set(x.component_id for x in agp)
    sizes = Sizes(scaffolds).mapping
    fwagp = must_open(unplaced_agp, "w")
    for s in sorted(sizes.keys()):
        if s in scaffolds_seen:
            continue
        order_to_agp(s, [(s, "?")], sizes, fwagp)
    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
Пример #9
0
def graph_to_agp(g, blastfile, subjectfasta, exclude=[], verbose=False):

    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")
    #g.draw("graph.pdf")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = nscaffolded = nexcluded = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            nscaffolded += 1
            continue
        if ctg in exclude:
            nexcluded += 1
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("scaffolded={} excluded={} singletons={}".\
                    format(nscaffolded, nexcluded, nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Пример #10
0
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int", help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Пример #11
0
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes,
                          deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
Пример #12
0
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least certain depth.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("-o", dest="output", default="stdout",
            help="Output file name [default: %default]")
    p.add_option("--cutoff", dest="cutoff", default=10, type="int",
            help="Minimum read depth to report intervals [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    b = BinFile(binfile)
    ar = b.array

    fastasize, sizes, offsets = get_offsets(fastafile)
    s = Sizes(fastafile)
    for ctg, ctglen in s.iter_sizes():
        offset = offsets[ctg]
        subarray = ar[offset:offset + ctglen]
        key = lambda x: x[1] >= cutoff
        for tf, array_elements in groupby(enumerate(subarray), key=key):
            array_elements = list(array_elements)
            if not tf:
                continue

            # 0-based system => 1-based system
            start = array_elements[0][0] + 1
            end = array_elements[-1][0] + 1

            mean_depth = sum([x[1] for x in array_elements]) / \
                len(array_elements)
            mean_depth = int(mean_depth)

            name = "na"
            print >> fw, "\t".join(str(x) for x in (ctg, \
                    start - 1, end, name, mean_depth))
Пример #13
0
def posmap(args):
    """
    %prog posmap frgscf.sorted scf.fasta scfID

    Perform QC on the selected scfID, generate multiple BED files for plotting.
    """
    p = OptionParser(posmap.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(p.print_help())

    frgscffile, fastafile, scf = args

    # fasta
    cmd = "faOneRecord {0} {1}".format(fastafile, scf)
    scffastafile = scf + ".fasta"
    if not op.exists(scffastafile):
        sh(cmd, outfile=scffastafile)

    # sizes
    sizesfile = scffastafile + ".sizes"
    sizes = Sizes(scffastafile).mapping
    scfsize = sizes[scf]
    logging.debug("`{0}` has length of {1}.".format(scf, scfsize))

    # gaps.bed
    gapsbedfile = scf + ".gaps.bed"
    if not op.exists(gapsbedfile):
        args = [scffastafile, "--bed", "--mingap=100"]
        gaps(args)

    # reads frgscf posmap
    posmapfile = scf + ".posmap"
    if not op.exists(posmapfile):
        args = [frgscffile, scf]
        query(args)

    # reads bed
    bedfile = scf + ".bed"
    if not op.exists(bedfile):
        args = [posmapfile]
        bed(args)

    # reads bedpe
    bedpefile = scf + ".bedpe"
    pairsbedfile = scf + ".pairs.bed"
    if not (op.exists(bedpefile) and op.exists(pairsbedfile)):
        bed_to_bedpe(bedfile, bedpefile, pairsbedfile=pairsbedfile, ca=True)

    # base coverage
    basecoverage = Coverage(bedfile, sizesfile)
    pecoverage = Coverage(pairsbedfile, sizesfile)
Пример #14
0
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov",
                 default=.01,
                 type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([
        deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)
    ])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
Пример #15
0
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format",
        default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort", default=False, action="store_true", help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
Пример #16
0
def flanks(args):
    """
    %prog flanks gaps.bed fastafile

    Create sequences flanking the gaps.
    """
    p = OptionParser(flanks.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, fastafile = args
    Ext = opts.extend
    sizes = Sizes(fastafile).mapping

    bed = Bed(gapsbed)
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for i, b in enumerate(bed):
        seqid = b.seqid
        gapname = b.accn
        size = sizes[seqid]

        prev_b = bed[i - 1] if i > 0 else None
        next_b = bed[i + 1] if i + 1 < len(bed) else None
        if prev_b and prev_b.seqid != seqid:
            prev_b = None
        if next_b and next_b.seqid != seqid:
            next_b = None

        start = prev_b.end + 1 if prev_b else 1
        start, end = max(start, b.start - Ext), b.start - 1
        print("\t".join(
            str(x) for x in (b.seqid, start - 1, end, gapname + "L")),
              file=fw)

        end = next_b.start - 1 if next_b else size
        start, end = b.end + 1, min(end, b.end + Ext)
        print("\t".join(
            str(x) for x in (b.seqid, start - 1, end, gapname + "R")),
              file=fw)
    fw.close()

    extfasta = fastaFromBed(extbed, fastafile, name=True)
    return extbed, extfasta
Пример #17
0
def coordQ(args):
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        qnames = p.qName.split("-")
        if len(qnames) == 3:
            x = p.qName
            p.qName, qosStart, qosEnd = qnames[0], int(qnames[1]), int(
                qnames[2])
            assert qosEnd - qosStart + 1 == p.qSize
            cSize = sizes.get_size(p.qName)
            p.qStart += qosStart - 1
            p.qEnd += qosStart - 1
            if p.qstrand == "-":
                p.qStarts = [x + cSize - qosEnd for x in p.qStarts]
            else:
                p.qStarts = [x + qosStart - 1 for x in p.qStarts]
            p.qSize = cSize
        print(str(p))
Пример #18
0
def bed2chain(args):
    from maize.formats.sizes import Sizes
    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)
    
    firstline = True
    cid0, tName0, qName0, srd0, locs = '', '', '', '', []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        tStart, tEnd, qStart, qEnd = int(tStart), int(tEnd), int(qStart), int(qEnd)
        if firstline:
            cid0, tName0, qName0, srd0 = cid, tName, qName, srd
            locs.append([tStart, tEnd, qStart, qEnd])
            firstline = False
        elif cid0 == cid:
            assert tName == tName0 and qName == qName0 and srd == srd0, "inconsistent info in chain"
            locs.append([tStart, tEnd, qStart, qEnd])
        else:
            print_chain(cid0, tName0, qName0, srd0, tdic.get_size(tName0), qdic.get_size(qName0), locs)
            cid0, tName0, qName0, srd0 = cid, tName, qName, srd
            locs = [[tStart, tEnd, qStart, qEnd]]
    print_chain(cid0, tName0, qName0, srd0, tdic.get_size(tName0), qdic.get_size(qName0), locs)
Пример #19
0
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs lenght. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=100000, type="int", help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov[ctg]
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))
    sns.jointplot(x, y, kind="kde")

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Пример #20
0
    def __init__(self, bedfile, sizesfile):

        bedfile = sort([bedfile])
        coveragefile = bedfile + ".coverage"
        if need_update(bedfile, coveragefile):
            cmd = "genomeCoverageBed"
            cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
            sh(cmd, outfile=coveragefile)

        self.sizes = Sizes(sizesfile).mapping

        filename = coveragefile
        assert filename.endswith(".coverage")
        super(Coverage, self).__init__(filename)
Пример #21
0
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary
    r = {}
    maps = []

    fw = must_open(opts.outfile, "w")
    print >> fw, "*** Summary for each individual map ***"
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
        maps.append(ms)
    print >> fw, tabulate(r, sep=sep, align=align)

    r = {}
    agp = AGP(chr_agp)
    print >> fw, "*** Summary for consensus map ***"
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                        ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print >> fw, tabulate(r, sep=sep, align=align)
Пример #22
0
def location(args):
    """
    %prog location bedfile fastafile

    Given SNP locations, summarize the locations in the sequences. For example,
    find out if there are more 3`-SNPs than 5`-SNPs.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(location.__doc__)
    p.add_option(
        "--dist",
        default=100,
        type="int",
        help="Distance cutoff to call 5` and 3` [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    dist = opts.dist
    sizes = Sizes(fastafile).mapping
    fp = open(bedfile)
    fiveprime = threeprime = total = 0
    percentages = []
    for row in fp:
        b = BedLine(row)
        pos = b.start
        size = sizes[b.seqid]
        if pos < dist:
            fiveprime += 1
        if size - pos < dist:
            threeprime += 1
        total += 1
        percentages.append(100 * pos / size)

    m = "Five prime (within {0}bp of start codon): {1}\n".format(
        dist, fiveprime)
    m += "Three prime (within {0}bp of stop codon): {1}\n".format(
        dist, threeprime)
    m += "Total: {0}".format(total)
    print(m, file=sys.stderr)

    bins = 10
    title = "Locations within the gene [0=Five-prime, 100=Three-prime]"
    stem_leaf_plot(percentages, 0, 100, bins, title=title)
Пример #23
0
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option(
        "--flank",
        default=5000,
        type="int",
        help="Get the seq of size on two ends",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta, ) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print("\t".join(str(x) for x in (bac, 0, min(flank, size), bac + "L")),
              file=fw)
        print(
            "\t".join(
                str(x) for x in (bac, max(size - flank, 0), size, bac + "R")),
            file=fw,
        )
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
Пример #24
0
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        ov = Overlap(b, asize, bsize)
        print "{0}\t{1}".format(b, Overlap_types[ov.otype])
Пример #25
0
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """

    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping
    gene_register = {}
    for g in gff:
        if g.type != "mRNA":
            continue
        aed = float(g.attributes["_AED"][0])
        gene_register[g.parent] = (1 - aed) * sizes[g.accn]

    allgenes = import_feats(gffile)
    g = get_piles(allgenes)

    bestids = set()
    for group in g:
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn) for x in group
        ]
        selected_chain, score = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    fw = open("removed.ids", "w")
    print("\n".join(sorted(removed)), file=fw)
    fw.close()
    populate_children(opts.outfile, bestids, gffile, "gene")
Пример #26
0
def agp(args):
    fp = open("assembly-order.txt")
    next(fp)
    sizes = Sizes("SCAFFOLD-SPLIT.fasta").mapping
    for row in fp:
        atoms = row.split()
        assert len(atoms) in (4, 5)
        if len(atoms) == 4:
            atoms.append('?')
        scaf, tag, linkage, no, strand = atoms
        strand = strand.lower()
        strand = {'f': '+', 'r': '-', '?': '?'}[strand]
        scaf = "scaffold_" + scaf
        scaf_size = sizes[scaf]
        linkage = "LG{0:02d}".format(ord(linkage.lower()) - ord('a') + 1)
        print("\t".join(str(x) for x in \
                    (scaf, 0, scaf_size, linkage, 1000, strand)))
Пример #27
0
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    p = OptionParser(closest.__doc__)
    p.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping
    bed = Bed(candidates)
    ranges = []
    for b in bed:
        r = range_parse(b.accn) if opts.om else b
        ranges.append([r.seqid, r.start, r.end])

    gapsbed = Bed(gapsbed)
    granges = [(x.seqid, x.start, x.end) for x in gapsbed]

    ranges = range_merge(ranges)
    for r in ranges:
        a = range_closest(granges, r)
        b = range_closest(granges, r, left=False)
        seqid = r[0]

        if a is not None and a[0] != seqid:
            a = None
        if b is not None and b[0] != seqid:
            b = None

        mmin = 1 if a is None else a[1]
        mmax = sizes[seqid] if b is None else b[2]

        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
Пример #28
0
    def __init__(self, bedfile, sizesfile):
        from jcvi.formats.bed import sort

        sortedbedfile = bedfile.rsplit(".", 1)[0] + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile])
        bedfile = sortedbedfile

        coveragefile = bedfile + ".coverage"
        if need_update(bedfile, coveragefile):
            cmd = "genomeCoverageBed"
            cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
            sh(cmd, outfile=coveragefile)

        self.sizes = Sizes(sizesfile).mapping

        filename = coveragefile
        assert filename.endswith(".coverage")
        super(Coverage, self).__init__(filename)
Пример #29
0
def fasta(args):
    """
    %prog fasta map.out scaffolds.fasta

    Extract marker sequences based on map.
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--extend",
        default=1000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapout, sfasta = args
    Flank = opts.extend
    pf = mapout.split(".")[0]
    mapbed = pf + ".bed"
    bm = BinMap(mapout)
    bm.print_to_bed(mapbed)

    bed = Bed(mapbed, sorted=False)
    markersbed = pf + ".markers.bed"
    fw = open(markersbed, "w")
    sizes = Sizes(sfasta).mapping
    for b in bed:
        accn = b.accn
        scf, pos = accn.split(".")
        pos = int(pos)
        start = max(0, pos - Flank)
        end = min(pos + Flank, sizes[scf])
        print("\t".join(str(x) for x in (scf, start, end, accn)), file=fw)

    fw.close()

    fastaFromBed(markersbed, sfasta, name=True)
Пример #30
0
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file must be sorted.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option("--format",
                 default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
                    format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    for seqid, cov in gcf.iter_coverage_seqid():
        print "\t".join((seqid, "{0:.1f}".format(cov)))
Пример #31
0
def longest(args):
    """
    %prog longest pile.txt cds.fasta

    Pick the longest model per group. `pile.txt` can be generated by
    formats.bed.pile().
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--samesize", default=False, action="store_true",
                 help="Only report where the group has same size "\
                      "[default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pilefile, cdsfasta = args
    sizes = Sizes(cdsfasta).mapping

    fp = open(pilefile)
    fw = open("Problems.ids", "w")
    for row in fp:
        models, = row.split()
        all_models = models.split("|")
        all_lengths = [(x, sizes[x]) for x in all_models]
        max_model = max(all_lengths, key=lambda x: x[-1])[0]

        if opts.samesize:
            mms, lengths = zip(*all_lengths)
            if len(set(lengths)) != 1:
                continue

        modelmsg = "|".join("{0}({1})".format(a, b) for a, b in all_lengths)
        print "\t".join((max_model, modelmsg))

        problems = [x for x in all_models if x != max_model]
        print >> fw, "\n".join(problems)

    fw.close()
Пример #32
0
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    p = OptionParser(fill.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    gapdist = 2 * Ext + 1  # This is to prevent to replacement ranges intersect
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for b in bed:
        gapname = b.accn
        start, end = max(0, b.start - Ext - 1), b.start - 1
        print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "L")),
              file=fw)
        start, end = b.end, min(sizes[b.seqid], b.end + Ext)
        print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "R")),
              file=fw)
    fw.close()

    fastaFromBed(extbed, badfasta, name=True)
Пример #33
0
def fasta(args):
    """
    %prog fasta bedfile scf.fasta pseudomolecules.fasta

    Use OM bed to scaffold and create pseudomolecules. bedfile can be generated
    by running jcvi.assembly.opticalmap bed --blockonly
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.agp import OO, build

    p = OptionParser(fasta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, scffasta, pmolfasta = args
    pf = bedfile.rsplit(".", 1)[0]
    bed = Bed(bedfile)
    selected = select_bed(bed)
    oo = OO()
    seen = set()
    sizes = Sizes(scffasta).mapping
    agpfile = pf + ".agp"
    agp = open(agpfile, "w")
    for b in selected:
        scf = range_parse(b.accn).seqid
        chr = b.seqid
        cs = (chr, scf)
        if cs not in seen:
            oo.add(chr, scf, sizes[scf], b.strand)
            seen.add(cs)
        else:
            logging.debug("Seen {0}, ignored.".format(cs))

    oo.write_AGP(agp, gaptype="contig")
    agp.close()
    build([agpfile, scffasta, pmolfasta])
Пример #34
0
def eject(args):
    """
    %prog eject candidates.bed chr.fasta

    Eject scaffolds from assembly, using the range identified by closest().
    """
    p = OptionParser(eject.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    candidates, chrfasta = args
    sizesfile = Sizes(chrfasta).filename
    cbedfile = complementBed(candidates, sizesfile)

    cbed = Bed(cbedfile)
    for b in cbed:
        b.accn = b.seqid
        b.score = 1000
        b.strand = "+"

    cbed.print_to_file()
Пример #35
0
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    sizes = dict((k, v) for (k, v) in sizes.iteritems() if k[:2] == "LG")
    maxsize = max(sizes.values())
    ratio = .5 / maxsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    from jcvi.graphics.base import set2
    a, b, c, d, e, f, g = set2[:7]
    set2 = (c, g, b, e, d, a, f)

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    coords = {}
    for i, k in enumerate(regions.karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        coords['a'] = (x - 1.5 * mgap , y)
        coords['b'] = (x - .5 * mgap , y)
        coords['c'] = (x + .5 * mgap , y)
        coords['d'] = (x + 1.5 * mgap , y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            g = kd.group
            gx, gy = coords[g]
            gsize = ratio * kd.span
            gy -= gsize
            p = Rectangle((gx - gwidth / 2, gy),
                           gwidth, gsize, lw=0, color=set2[i])
            root.add_patch(p)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            coords[g] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    chr_number = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    xinterval = (xend - xstart - gwidth) / (chr_number - 1)
    chrpos = {}
    for a, (chr, clen) in enumerate(sorted(sizes.items())):
        chr = get_number(chr)
        xx = xstart + a * xinterval + gwidth / 2
        chrpos[chr] = xx
        root.text(xx, ystart + .01, chr, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        p = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                      color=set2[int(r.karyotype) - 1], lw=0)
        root.add_patch(p)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Пример #36
0
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == "-":
                b.extra[1] = b.strand = "-" if b.strand == "+" else "+"

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
Пример #37
0
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
Пример #38
0
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    fp = open(refinedbed)
    gap_to_scf = defaultdict(list)
    seen = set()
    for row in fp:
        atoms = row.split()
        if len(atoms) <= 6:
            continue
        unplaced = atoms[3]
        strand = atoms[5]
        gapid = atoms[9]
        if gapid not in seen:
            seen.add(gapid)
            gi, gb = gorder[gapid]
            gpbed.append(gb)
            gappositions[(gb.seqid, gb.start, gb.end)] = gapid
        gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
Пример #39
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            this_covered += abs(b.qstart - b.qstop + 1)
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    print >> sys.stderr, "Identity: {0} mismatches, {1} gaps, {2} alignlen".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    print >> sys.stderr, "Total mapped: {0} ({1:.1f}% of {2})".\
            format(mapped_count, mapped_count * 100. / total, total)
    print >> sys.stderr, "Total valid {0}: {1} ({2:.1f}% of {3})".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    print >> sys.stderr, "Average id = {0:.2f}%".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sum(sizes[x] for x in queries)
    print >> sys.stderr, "Coverage: {0} covered, {1} total".\
            format(covered, queries_combined)
    print >> sys.stderr, "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fp = open(blastfile)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print >> fw, b
Пример #40
0
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == '-':
                b.extra[1] = b.strand = ('-' if b.strand == '+' else '+')

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
Пример #41
0
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print >> log
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print >> log, a
            print >> log, b

            flip_b = (astrand == bstrand)
            fbstrand = '-' if flip_b else '+'
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ('+', '-')
            if astrand == '+':
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print >> log, "*" + "\t".join(str(x) for x in start_range)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print >> log, alldepths

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print >> log, "Insufficient links ({0} < {1})".format(maxdepth, minlinks)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        msg = "Multiple conflicting candidates found"
        if nseqids != 1:
            print >> log, msg
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])
        if (mmax - mmin) > maxdist:
            print >> log, msg
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == '+':
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = '+' if nplus >= nminus else '-'

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print >> log, "Plus: {0}, Minus: {1}".format(nplus, nminus)
        print >> log, candidate

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".\
                    format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
Пример #42
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
            help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
            help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query", choices=allowed_iterby,
            help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
Пример #43
0
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov",
                 default=.01,
                 type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color",
                 default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout",
                 default=False,
                 action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([
        deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)
    ])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r],
                                        qsizes,
                                        rsizes,
                                        deltafile,
                                        refcov,
                                        prefix=prefix,
                                        color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs,
                          qsizes,
                          rsizes,
                          deltafile,
                          refcov,
                          prefix=prefix,
                          color=color,
                          layout=layout)
Пример #44
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix",
                 default=False,
                 action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand)
                                       for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print >> sys.stderr, msg
        print >> fwphase, "\t".join((bname, str(phase)))

        order_to_agp(bname, ctgorder, sizes, fwagp)

    fwagp.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
Пример #45
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    from jcvi.formats.agp import AGP, bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)
    seqnames = sorted(sizes.keys())

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print >> sys.stderr, msg
        print >> fwphase, "\t".join((bname, str(phase)))

        order_to_agp(bname, ctgorder, sizes, fwagp)

    fwagp.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
Пример #46
0
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    sizes = dict((k, v) for (k, v) in sizes.iteritems() if k[:2] == "LG")
    maxsize = max(sizes.values())
    ratio = .5 / maxsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    from jcvi.graphics.base import set2
    a, b, c, d, e, f, g = set2[:7]
    set2 = (c, g, b, e, d, a, f)

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    coords = {}
    for i, k in enumerate(regions.karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        coords['a'] = (x - 1.5 * mgap , y)
        coords['b'] = (x - .5 * mgap , y)
        coords['c'] = (x + .5 * mgap , y)
        coords['d'] = (x + 1.5 * mgap , y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            g = kd.group
            gx, gy = coords[g]
            gsize = ratio * kd.span
            gy -= gsize
            p = Rectangle((gx - gwidth / 2, gy),
                           gwidth, gsize, lw=0, color=set2[i])
            root.add_patch(p)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            coords[g] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    chr_number = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    xinterval = (xend - xstart - gwidth) / (chr_number - 1)
    chrpos = {}
    for a, (chr, clen) in enumerate(sorted(sizes.items())):
        chr = get_number(chr)
        xx = xstart + a * xinterval + gwidth / 2
        chrpos[chr] = xx
        root.text(xx, ystart + .01, chr, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        p = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                      color=set2[int(r.karyotype) - 1], lw=0)
        root.add_patch(p)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Пример #47
0
    qbed, sbed = opts.qbed, opts.sbed
    proportional = opts.proportional

    if len(args) != 1:
        sys.exit(not p.print_help())

    if qbed:
        qsizes = qsizes or sizes([qbed])
        qbed = Bed(qbed)
    if sbed:
        ssizes = ssizes or sizes([sbed])
        sbed = Bed(sbed)

    assert qsizes and ssizes, "You must specify at least one of --sizes of --bed"

    qsizes = Sizes(qsizes, select=opts.qselect)
    ssizes = Sizes(ssizes, select=opts.sselect)

    (blastfile, ) = args

    image_name = op.splitext(blastfile)[0] + "." + opts.format
    plt.rcParams["xtick.major.pad"] = 16
    plt.rcParams["ytick.major.pad"] = 16

    # Fix the width
    xsize, ysize = qsizes.totalsize, ssizes.totalsize

    # get highlight beds
    qh, sh = opts.qh, opts.sh
    qh = Bed(qh) if qh else None
    sh = Bed(sh) if sh else None
Пример #48
0
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"
    fwlog = open(logfile, "w")

    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"
    fwagp = open(agpfile, "w")

    clinks = []
    g = nx.MultiGraph()  # use this to get connected components

    fp = open(linksfile)
    for row in fp:
        c = LinkLine(row)
        distance = max(c.distance, 50)

        g.add_edge(c.aseqid, c.bseqid,
                orientation=c.orientation, distance=distance)

    def get_bname(sname, prefix=False):
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    scaffoldbuckets = defaultdict(list)
    seqnames = sorted(sizes.keys())

    for h in nx.connected_component_subgraphs(g):
        partialorder = solve_component(h, sizes, fwlog)
        name = partialorder[0][0]
        bname = get_bname(name, prefix=opts.prefix)
        scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in seqnames:
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    for bname, ctgs in sorted(ctgbuckets.items()):
        scaffolds = scaffoldbuckets[bname]
        scaffolded = set()
        ctgorder = []
        for scafID, scaf in enumerate(scaffolds):
            ctgorder = []
            for node, start, end, orientation in scaf:
                ctgorder.append((node, orientation))
                scaffolded.add(node)
            scaf = scafname.format(bname, scafID)
            order_to_agp(scaf, ctgorder, sizes, fwagp)
        singletons = sorted(ctgbuckets[bname] - scaffolded)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)

        msg = "{0}: Scaffolds={1} Singletons={2}".\
            format(bname, nscaffolds, nsingletons)
        print >> sys.stderr, msg

        for singleton in singletons:
            ctgorder = [(singleton, "+")]
            order_to_agp(singleton, ctgorder, sizes, fwagp)

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Пример #49
0
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic sequences, and repetitive
    sequences along the chromosomes.
    """
    p = OptionParser(stack.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Draw the first N chromosomes [default: %default]")
    p.add_option("--stacks",
                 default="Exons,Introns,DNA_transposons,Retrotransposons",
                 help="Features to plot in stackplot [default: %default]")
    p.add_option("--switch",
                 help="Change chr names based on two-column file [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    top = opts.top
    window, shift, subtract = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles, fastafile, shift, subtract=subtract)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = .08
    inner = .02   # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome
    yinterval = (1 - 2 * margin) / (top + 1)
    xx = margin
    yy = 1 - margin
    for chr, clen in s:
        yy -= yinterval
        xlen = clen / ratio
        cc = chr
        if "_" in chr:
            ca, cb = chr.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray))
        ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

        nbins = clen / shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, chr, window, shift)
        root.text(xx - .04, yy + .5 * (yinterval - inner), cc, ha="center", va="center")

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends
    yy -= yinterval
    xx = margin
    for b, p in zip(bedfiles, palette):
        b = b.rsplit(".", 1)[0].replace("_", " ")
        b = Registration.get(b, b)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=p, lw=0))
        xx += 2 * inner
        root.text(xx, yy, b, size=13)
        xx += len(b) * .012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Пример #50
0
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    p = OptionParser(connect.__doc__)
    p.add_option(
        "--clip",
        default=2000,
        type="int",
        help="Only consider end of contigs",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping
    blast = Blast(blastfile)
    blasts = []
    for b in blast:
        seqid = b.subject
        size = sizes[seqid]
        start, end = b.sstart, b.sstop
        cstart, cend = min(size, clip), max(0, size - clip)
        if start > cstart and end < cend:
            continue
        blasts.append(b)

    key = lambda x: x.query
    blasts.sort(key=key)
    g = BiGraph()
    for query, bb in groupby(blasts, key=key):
        bb = sorted(bb, key=lambda x: x.qstart)
        nsubjects = len(set(x.subject for x in bb))
        if nsubjects == 1:
            continue
        print("\n".join(str(x) for x in bb))
        for a, b in pairwise(bb):
            astart, astop = a.qstart, a.qstop
            bstart, bstop = b.qstart, b.qstop
            if a.subject == b.subject:
                continue

            arange = astart, astop
            brange = bstart, bstop
            ov = range_intersect(arange, brange)
            alen = astop - astart + 1
            blen = bstop - bstart + 1
            if ov:
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print(ov, alen, blen)
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            asub = a.subject
            bsub = b.subject
            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(asub, bsub, atag, btag)

    graph_to_agp(g, blastfile, fastafile, verbose=False)
Пример #51
0
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    p.add_option("--novalidate",
                 default=False,
                 action="store_true",
                 help="Do not validate AGP")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile, validate=(not opts.novalidate))
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    id = 0
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        id += 1
        size = a.object_span
        headerline = "\t".join(
            str(x) for x in (chain, score, tName, tSize, tStrand, tStart, tEnd,
                             qName, qSize, qStrand, qStart, qEnd, id))
        alignmentline = size
        print >> fw, headerline
        print >> fw, alignmentline
        print >> fw

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
Пример #52
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Пример #53
0
def get_offsets(fastafile):
    s = Sizes(fastafile)
    fastasize = s.totalsize
    sizes = s.mapping
    offsets = s.cumsizes_mapping
    return fastasize, sizes, offsets
Пример #54
0
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    fp = open(glm)
    for row in fp:
        if row[0] == '#':
            continue
        x, y, z = row.split()
        if x == 'X':
            continue
        M[int(x), int(y)] = int(z)

    fwtour = open("tour", "w")

    def callback(tour, gen, oo):
        fitness = tour.fitness if hasattr(tour, "fitness") else None
        label = "GA-{0}".format(gen)
        if fitness:
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
            label += "-" + fitness
        print_tour(fwtour, tour, label, contig_names, oo)
        return tour

    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        for x in co:
            contig_id = contig_ids[x.contig_name]
            oo.append(contig_id)
        pf = op.basename(ofile).split(".")[0]
        print pf
        print oo

        tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
        # Store INIT tour
        print_tour(fwtour, tour, "INIT", contig_names, oo)

        # Faster Cython version for evaluation
        from .chic import score_evaluate_M
        callbacki = partial(callback, oo=oo)
        toolbox = GA_setup(tour)
        toolbox.register("evaluate", score_evaluate_M,
                         tour_sizes=tour_sizes, tour_M=tour_M)
        tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                    callback=callbacki)
        print tour, tour.fitness
        break

    fwtour.close()