Example #1
File: kmer.py  Project: arvin580/jcvi
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize : tsize + ksize]
        bcount = b.count()
        print >> fw, "\t".join(str(x) for x in (name, bcount))
        tsize += ksize
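
A minimal, self-contained sketch of the slicing logic in bincount() above. It assumes Sizes(fastafile).iter_sizes() yields (name, length) pairs in FASTA order, which is how the loop consumes them; the contig names, lengths and 0/1 flags below are hypothetical, and a plain list stands in for the bitarray.

K = 3
sizes = [("ctg1", 5), ("ctg2", 4)]        # hypothetical (name, length) pairs
flags = [1, 0, 1, 1, 0]                   # 3 k-mer flags for ctg1, 2 for ctg2

tsize = 0
for name, seqlen in sizes:
    ksize = seqlen - K + 1                # number of K-mer windows in this contig
    window = flags[tsize:tsize + ksize]
    print("{0}\t{1}".format(name, sum(window)))   # ctg1 2, then ctg2 1
    tsize += ksize
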
Example #2
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option(
        "--flank",
        default=5000,
        type="int",
        help="Get the seq of size on two ends",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta,) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print("\t".join(str(x) for x in (bac, 0, min(flank, size), bac + "L")), file=fw)
        print(
            "\t".join(str(x) for x in (bac, max(size - flank, 0), size, bac + "R")),
            file=fw,
        )
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
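
To make the output of pasteprepare() above concrete, here is a small sketch of the two BED records it writes per sequence: a left flank and a right flank of up to --flank bp each. The (bac, size) pairs below are hypothetical stand-ins for Sizes(goodfasta).iter_sizes().

flank = 5000
for bac, size in [("bac1", 12000), ("bac2", 3000)]:   # hypothetical sizes
    left = (bac, 0, min(flank, size), bac + "L")
    right = (bac, max(size - flank, 0), size, bac + "R")
    print("\t".join(str(x) for x in left))
    print("\t".join(str(x) for x in right))
# bac2 is shorter than the flank, so both of its intervals span the whole
# sequence (0..3000), mirroring the min()/max() clamping above.
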
Example #3
File: patch.py  Project: JinfengChen/jcvi
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print >> fw, "\t".join(str(x) for x in \
                               (bac, 0, min(flank, size), bac + "L"))
        print >> fw, "\t".join(str(x) for x in \
                               (bac, max(size - flank, 0), size, bac + "R"))
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
Example #4
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize:tsize + ksize]
        bcount = b.count()
        print("\t".join(str(x) for x in (name, bcount)), file=fw)
        tsize += ksize
Example #5
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least a certain depth.
    """
    p = OptionParser(bed.__doc__)
    p.add_option(
        "-o",
        dest="output",
        default="stdout",
        help="Output file name",
    )
    p.add_option(
        "--cutoff",
        dest="cutoff",
        default=10,
        type="int",
        help="Minimum read depth to report intervals",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    b = BinFile(binfile)
    ar = b.array

    fastasize, sizes, offsets = get_offsets(fastafile)
    s = Sizes(fastafile)
    for ctg, ctglen in s.iter_sizes():
        offset = offsets[ctg]
        subarray = ar[offset:offset + ctglen]
        key = lambda x: x[1] >= cutoff
        for tf, array_elements in groupby(enumerate(subarray), key=key):
            array_elements = list(array_elements)
            if not tf:
                continue

            # 0-based system => 1-based system
            start = array_elements[0][0] + 1
            end = array_elements[-1][0] + 1

            mean_depth = sum([x[1]
                              for x in array_elements]) / len(array_elements)
            mean_depth = int(mean_depth)

            name = "na"
            print(
                "\t".join(
                    str(x) for x in (ctg, start - 1, end, name, mean_depth)),
                file=fw,
            )
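
A toy version of the interval extraction in bed() above: consecutive positions whose depth passes the cutoff are grouped with itertools.groupby, and each run is reported as a BED interval with its integer mean depth. The depth list is hypothetical.

from itertools import groupby

depth = [0, 5, 12, 14, 9, 11, 30, 2]      # hypothetical per-base depth for one contig
cutoff = 10

for passing, run in groupby(enumerate(depth), key=lambda x: x[1] >= cutoff):
    run = list(run)
    if not passing:
        continue
    start = run[0][0]                     # 0-based BED start
    end = run[-1][0] + 1                  # half-open BED end
    mean_depth = sum(d for _, d in run) // len(run)
    print("\t".join(str(x) for x in ("ctg", start, end, "na", mean_depth)))
# Prints "ctg 2 4 na 13" and "ctg 5 7 na 20".
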
Example #6
File: sspace.py  Project: arvin580/jcvi
 def __init__(self, filename, fastafile):
     super(EvidenceFile, self).__init__(filename)
     sz = Sizes(fastafile)
     sizes = [None]  # tig-list starts at 1
     for name, size in sz.iter_sizes():
         sizes.append((name, size))
     self.sizes = sizes
     self.sz = sz.mapping
     self.scf = {}
Example #7
File: sspace.py  Project: biologyguy/jcvi
 def __init__(self, filename, fastafile):
     super(EvidenceFile, self).__init__(filename)
     sz = Sizes(fastafile)
     sizes = [None]  # tig-list starts at 1
     for name, size in sz.iter_sizes():
         sizes.append((name, size))
     self.sizes = sizes
     self.sz = sz.mapping
     self.scf = {}
Example #8
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As an evaluation of scaffolding, visualize external lines of evidence:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted: the blastfile gives the matches
    between the evidences and the scaffolds, and the evidence sizes and evidence
    bed are used to draw the dot plots.

    This script plots a dot at each corresponding location in the dot plot;
    the plots are one contig/scaffold per plot.
    """
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
            help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
            help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        tmpname = scaffoldID + ".sizes"
        tmp = open(tmpname, "w")
        tmp.write("{0}\t{1}".format(scaffoldID, scafsize))
        tmp.close()

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
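
The scaffold() docstring above expects the positional arguments after the scaffold FASTA to come in trios of (blast, sizes, bed). Here is a short sketch of how those trios are formed; grouper() below is the standard itertools recipe, and jcvi.utils.iter.grouper is assumed to behave the same way for this call.

from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

argv = ["scaffold.fasta",
        "synteny.blast", "synteny.sizes", "synteny.bed",
        "physicalmap.blast", "physicalmap.sizes", "physicalmap.bed"]
trios = list(grouper(3, argv[1:]))
print(trios)
# [('synteny.blast', 'synteny.sizes', 'synteny.bed'),
#  ('physicalmap.blast', 'physicalmap.sizes', 'physicalmap.bed')]
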
Example #9
def graph_to_agp(g, blastfile, subjectfasta, exclude=[], verbose=False):

    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")
    # g.draw("graph.pdf")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug(
        "Graph decomposed to {0} paths with {1} components.".format(npaths, ntigs)
    )

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = nscaffolded = nexcluded = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            nscaffolded += 1
            continue
        if ctg in exclude:
            nexcluded += 1
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug(
        "scaffolded={} excluded={} singletons={}".format(
            nscaffolded, nexcluded, nsingletons
        )
    )

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Example #10
File: syntenypath.py  Project: ascendo/jcvi
def graph_to_agp(g, blastfile, subjectfasta, exclude=[], verbose=False):

    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")
    #g.draw("graph.pdf")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = nscaffolded = nexcluded = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            nscaffolded += 1
            continue
        if ctg in exclude:
            nexcluded += 1
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("scaffolded={} excluded={} singletons={}".\
                    format(nscaffolded, nexcluded, nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Example #11
File: assembly.py  Project: Hensonmw/jcvi
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int", help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Example #12
File: depth.py  Project: Hensonmw/jcvi
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least a certain depth.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("-o", dest="output", default="stdout",
            help="Output file name [default: %default]")
    p.add_option("--cutoff", dest="cutoff", default=10, type="int",
            help="Minimum read depth to report intervals [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    b = BinFile(binfile)
    ar = b.array

    fastasize, sizes, offsets = get_offsets(fastafile)
    s = Sizes(fastafile)
    for ctg, ctglen in s.iter_sizes():
        offset = offsets[ctg]
        subarray = ar[offset:offset + ctglen]
        key = lambda x: x[1] >= cutoff
        for tf, array_elements in groupby(enumerate(subarray), key=key):
            array_elements = list(array_elements)
            if not tf:
                continue

            # 0-based system => 1-based system
            start = array_elements[0][0] + 1
            end = array_elements[-1][0] + 1

            mean_depth = sum([x[1] for x in array_elements]) / \
                len(array_elements)
            mean_depth = int(mean_depth)

            name = "na"
            print >> fw, "\t".join(str(x) for x in (ctg, \
                    start - 1, end, name, mean_depth))
Example #13
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=100000, type="int", help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov[ctg]
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))
    sns.jointplot(x, y, kind="kde")

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Example #14
File: patch.py  Project: JinfengChen/jcvi
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == '-':
                b.extra[1] = b.strand = ('-' if b.strand == '+' else '+')

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
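
The key step in shuffle_twobeds() above is the interleave of the inter-patch ranges (abeds, with None marking empty ranges) and the patch records (bbeds). roundrobin() below is the standard itertools recipe; the jcvi.utils.iter version is assumed to be equivalent, and the string values are hypothetical placeholders for BedLine objects.

from itertools import cycle, islice

def roundrobin(*iterables):
    num_active = len(iterables)
    nexts = cycle(iter(it).__next__ for it in iterables)
    while num_active:
        try:
            for nxt in nexts:
                yield nxt()
        except StopIteration:
            num_active -= 1
            nexts = cycle(islice(nexts, num_active))

abeds = ["gapA", None, "gapB"]            # hypothetical inter-patch ranges
bbeds = ["patch1", "patch2"]              # hypothetical patch records
beds = [x for x in roundrobin(abeds, bbeds) if x]
print(beds)                                # ['gapA', 'patch1', 'patch2', 'gapB']
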
Example #15
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    key2 = (lambda x: x.query[:-rclip]) if rclip else key1  # strip the L/R suffix only when rclip is set
    data = BlastSlow(blastfile)

    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                    (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                    (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)
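
A small sketch of the L/R pairing at the top of install() above: patcher names end in "L" or "R", and BLAST hits are grouped by the query name with the last --rclip characters stripped, so only complete flank pairs are kept. The (query, subject) tuples below are hypothetical stand-ins for BlastSlow records.

from itertools import groupby

rclip = 1
hits = [("bac7L", "chr1"), ("bac7R", "chr1"), ("bac9L", "chr2")]
keyfn = (lambda x: x[0][:-rclip]) if rclip else (lambda x: x[0])

for pe, lines in groupby(hits, key=keyfn):
    lines = list(lines)
    if len(lines) != 2:                   # only complete L/R pairs are usable
        continue
    print(pe, [q for q, s in lines])      # bac7 ['bac7L', 'bac7R']
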
Example #16
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic and repetitive
    sequences along the chromosomes.
    """
    p = OptionParser(stack.__doc__)
    p.add_option("--top",
                 default=10,
                 type="int",
                 help="Draw the first N chromosomes")
    p.add_option(
        "--stacks",
        default="Exons,Introns,DNA_transposons,Retrotransposons",
        help="Features to plot in stackplot",
    )
    p.add_option("--switch", help="Change chr names based on two-column file")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    top = opts.top
    window, shift, subtract, merge = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles,
                            fastafile,
                            shift,
                            subtract=subtract,
                            merge=merge)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = 0.08
    inner = 0.02  # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome
    yinterval = (1 - 2 * margin) / (top + 1)
    xx = margin
    yy = 1 - margin
    for chr, clen in s:
        yy -= yinterval
        xlen = clen / ratio
        cc = chr
        if "_" in chr:
            ca, cb = chr.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner,
                                 color=gray))
        ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

        nbins = clen / shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, chr, window, shift)
        root.text(xx - 0.04,
                  yy + 0.5 * (yinterval - inner),
                  cc,
                  ha="center",
                  va="center")

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends
    yy -= yinterval
    xx = margin
    for b, p in zip(bedfiles, palette):
        b = b.rsplit(".", 1)[0].replace("_", " ")
        b = Registration.get(b, b)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=p, lw=0))
        xx += 2 * inner
        root.text(xx, yy, b, size=13)
        xx += len(b) * 0.012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
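
One detail of stack() above worth spelling out: the number of bins per chromosome is a ceiling division of the chromosome length by the shift. The snippet computes it with / plus a remainder check, which only stays an integer under Python 2; a version-independent equivalent is the usual integer idiom below (n_bins is a hypothetical helper name).

def n_bins(clen, shift):
    return (clen + shift - 1) // shift    # ceil(clen / shift) for positive ints

print(n_bins(1000, 100))   # 10
print(n_bins(1001, 100))   # 11
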
Example #17
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize",
                 default=1000000,
                 type="int",
                 help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind",
                 default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab,
                  ylab,
                  kind=opts.kind,
                  data=df,
                  xlim=(0, maxsize),
                  ylim=(0, maxcov),
                  stat_func=None,
                  edgecolor="w",
                  color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Example #18
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            if a.strand == "-":
                b.extra[1] = b.strand = "-" if b.strand == "+" else "+"

            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
Example #19
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose",
                 default=False,
                 action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Example #20
File: landscape.py  Project: arvin580/jcvi
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic and repetitive
    sequences along the chromosomes.
    """
    p = OptionParser(stack.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Draw the first N chromosomes [default: %default]")
    p.add_option("--stacks",
                 default="Exons,Introns,DNA_transposons,Retrotransposons",
                 help="Features to plot in stackplot [default: %default]")
    p.add_option("--switch",
                 help="Change chr names based on two-column file [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    top = opts.top
    window, shift, subtract = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles, fastafile, shift, subtract=subtract)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = .08
    inner = .02   # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome
    yinterval = (1 - 2 * margin) / (top + 1)
    xx = margin
    yy = 1 - margin
    for chr, clen in s:
        yy -= yinterval
        xlen = clen / ratio
        cc = chr
        if "_" in chr:
            ca, cb = chr.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray))
        ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

        nbins = clen / shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, chr, window, shift)
        root.text(xx - .04, yy + .5 * (yinterval - inner), cc, ha="center", va="center")

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends
    yy -= yinterval
    xx = margin
    for b, p in zip(bedfiles, palette):
        b = b.rsplit(".", 1)[0].replace("_", " ")
        b = Registration.get(b, b)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=p, lw=0))
        xx += 2 * inner
        root.text(xx, yy, b, size=13)
        xx += len(b) * .012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #21
File: blast.py  Project: ascendo/jcvi
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
            help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
            help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query", choices=allowed_iterby,
            help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
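
A toy equivalent of the coverage union used by covfilter() above. The ranges are (seqid, start, stop) with 1-based inclusive coordinates, matching the stop - start + 1 accounting in the loop; jcvi.utils.range.range_union is assumed to return the same total number of covered bases, and union_length() is a hypothetical local helper.

def union_length(ranges):
    total = 0
    cur_start = cur_stop = None
    for _, start, stop in sorted(ranges, key=lambda r: (r[1], r[2])):
        if cur_stop is None or start > cur_stop + 1:
            if cur_stop is not None:
                total += cur_stop - cur_start + 1
            cur_start, cur_stop = start, stop
        else:
            cur_stop = max(cur_stop, stop)
    if cur_stop is not None:
        total += cur_stop - cur_start + 1
    return total

print(union_length([("1", 1, 100), ("1", 50, 120), ("1", 200, 210)]))   # 131
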
Example #22
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
Example #23
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))