Пример #1
0
# Build a bidirectional contig graph from an overlap listing.
#
# args: list of command-line tokens. After option parsing exactly two
#       positionals must remain: the overlap file and the contigs FASTA file.
#       Otherwise the usage text is printed and the process exits.
#
# Side effects: passes the graph, the overlap file name and the FASTA file
# name to graph_to_agp(), then writes the IDs of contained contigs (one per
# line) to "contained.ids" in the current working directory.
def graph(args):
    """
    %prog graph all.ov contigs.fasta

    Load overlaps to create bidirectional graph.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept in its terse "%prog ..." form.
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(graph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    allov, contigsfasta = args
    g = BiGraph()
    contained = set()

    # Each record is a single line of the form:
    #   ctg7180000045865 - ctg7180000086408: a ~ b Overlap: 5108 Identity: 99.02% Orientation: +
    # Splitting on ":" yields the contig pair first, the relation tag
    # (followed by the word "Overlap") second, and the orientation sign last.
    with open(allov) as fp:  # closed even if a malformed row raises
        for row in fp:
            row = row.strip()
            if not row:
                # Blank lines carry no record; unpacking the pair would fail.
                continue
            atoms = row.split(":")
            pair = atoms[0]
            a, b = pair.split(" - ")
            orientation = atoms[-1].strip()
            tag = atoms[1].replace("Overlap", "").strip()
            # The first contig is always forward; the second one is flipped
            # only when the record reports a '-' orientation.
            oa = ">"
            ob = "<" if orientation == '-' else ">"
            if tag == "a ~ b":
                g.add_edge(BiEdge(a, b, oa, ob))
            elif tag == "b ~ a":
                g.add_edge(BiEdge(b, a, ob, oa))
            elif tag == "a in b":
                contained.add(a)
            elif tag == "b in a":
                contained.add(b)
            # Any other tag is ignored.

    graph_to_agp(g, allov, contigsfasta)

    containedfile = "contained.ids"
    with open(containedfile, "w") as fw:
        # Same bytes as the former `print >> fw, ...`: the joined IDs
        # followed by one trailing newline (a lone newline when empty).
        fw.write("\n".join(contained) + "\n")
Пример #2
0
# Link contigs that are bridged by the same long read.
#
# args: list of command-line tokens. After option parsing exactly two
#       positionals must remain: the assembly FASTA file and the BLAST file.
#       Otherwise the usage text is printed and the process exits.
#
# Progress/diagnostic lines are printed to stdout while edges are collected;
# the finished graph is handed to graph_to_agp().
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.blast import Blast
    from jcvi.utils.iter import pairwise
    from jcvi.utils.range import range_intersect
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    parser = OptionParser(connect.__doc__)
    parser.add_option(
        "--clip", default=2000, type="int",
        help="Only consider end of contigs [default: %default]",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    def touches_contig_end(hit):
        # A hit is kept unless it lies strictly inside the contig interior,
        # i.e. it starts after the first `clip` bases and stops before the
        # last `clip` bases of its subject.
        size = sizes[hit.subject]
        head_limit = min(size, clip)
        tail_limit = max(0, size - clip)
        return hit.sstart <= head_limit or hit.sstop >= tail_limit

    def query_of(hit):
        return hit.query

    def arrow(hit):
        # Edge tag: ">" for a "+" hit, "<" for anything else.
        if hit.orientation == "+":
            return ">"
        return "<"

    blasts = [hit for hit in Blast(blastfile) if touches_contig_end(hit)]
    blasts.sort(key=query_of)  # groupby below needs hits of a query adjacent

    g = BiGraph()
    for _, group in groupby(blasts, key=query_of):
        hits = sorted(group, key=lambda h: h.qstart)
        # A read that only hits a single subject cannot bridge two contigs.
        if len(set(h.subject for h in hits)) == 1:
            continue
        print("\n".join(str(h) for h in hits))

        for left, right in pairwise(hits):
            if left.subject == right.subject:
                continue

            left_span = left.qstart, left.qstop
            right_span = right.qstart, right.qstop
            left_len = left_span[1] - left_span[0] + 1
            right_len = right_span[1] - right_span[0] + 1

            # A truthy intersection is a (start, stop) pair; turn it into a
            # length. A falsy result is printed and tested unchanged.
            ov = range_intersect(left_span, right_span)
            if ov:
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print("{0} {1} {2}".format(ov, left_len, right_len))
            # Reject the pair when the overlap on the read exceeds half of
            # either hit.
            if ov and (ov > left_len / 2 or ov > right_len / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            e = BiEdge(left.subject, right.subject, arrow(left), arrow(right))
            g.add_edge(e)
            print("{0} {1}".format("=" * 5, e))

    graph_to_agp(g, blastfile, fastafile, verbose=False)
Пример #3
0
# Use long-read BLAST hits to join contigs into a bidirectional graph.
#
# args: list of command-line tokens. Two positionals are required after
#       option parsing (assembly FASTA file, BLAST file); with any other
#       count the usage text is printed and the process exits.
#
# Diagnostic lines go to stdout as candidate pairs are examined; the graph
# is finally passed to graph_to_agp().
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.blast import Blast
    from jcvi.utils.iter import pairwise
    from jcvi.utils.range import range_intersect
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(connect.__doc__)
    p.add_option("--clip", default=2000, type="int",
                 help="Only consider end of contigs [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    # Keep only hits that reach into the first or last `clip` bases of the
    # subject; hits strictly inside the interior are dropped.
    kept = []
    for hit in Blast(blastfile):
        contig_len = sizes[hit.subject]
        head_end = min(contig_len, clip)
        tail_start = max(0, contig_len - clip)
        if hit.sstart <= head_end or hit.sstop >= tail_start:
            kept.append(hit)

    # Sort by query so that groupby sees each read's hits as one run.
    kept.sort(key=lambda h: h.query)

    g = BiGraph()
    for _query, members in groupby(kept, key=lambda h: h.query):
        ordered = sorted(members, key=lambda h: h.qstart)
        subjects = set(h.subject for h in ordered)
        if len(subjects) == 1:
            # All hits land on one subject: nothing to connect.
            continue
        print("\n".join(map(str, ordered)))

        for first, second in pairwise(ordered):
            if first.subject == second.subject:
                continue

            first_len = first.qstop - first.qstart + 1
            second_len = second.qstop - second.qstart + 1
            shared = range_intersect(
                (first.qstart, first.qstop),
                (second.qstart, second.qstop),
            )
            if shared:
                # Collapse the (start, stop) intersection into its length.
                lo, hi = shared
                shared = hi - lo + 1

            print("{0} {1} {2}".format(shared, first_len, second_len))
            too_much = shared and (
                shared > first_len / 2 or shared > second_len / 2
            )
            if too_much:
                print("Too much overlap ({0})".format(shared))
                continue

            # ">" marks a "+" hit, "<" any other orientation.
            first_tag = ">" if first.orientation == "+" else "<"
            second_tag = ">" if second.orientation == "+" else "<"
            edge = BiEdge(first.subject, second.subject, first_tag, second_tag)
            g.add_edge(edge)
            print("{0} {1}".format("=" * 5, edge))

    graph_to_agp(g, blastfile, fastafile, verbose=False)