示例#1
0
class OVL (LineFile):
    """
    Overlap file in which every row is parsed into an OVLLine record.

    After construction:
      - the instance itself holds all parsed records (via append);
      - `contained` is the set of tig names tagged "a in b" / "b in a";
      - `graph` is a BiGraph with one edge per "a->b" / "b->a" record whose
        two tigs are both absent from `contained`.
    """

    def __init__(self, filename):
        super(OVL, self).__init__(filename)
        fp = must_open(filename)
        contained = set()
        for row in fp:
            o = OVLLine(row)
            self.append(o)
            if o.tag == "a in b":
                contained.add(o.a)
            elif o.tag == "b in a":
                contained.add(o.b)
        logging.debug("Imported {} links. Contained tigs: {}".\
                        format(len(self), len(contained)))
        self.contained = contained

        self.graph = BiGraph()
        for o in self:
            if o.tag == "a->b":
                a, b = o.a, o.b
            elif o.tag == "b->a":
                a, b = o.b, o.a
            else:
                # Not a directed link (e.g. a containment record). Without
                # this guard `a`/`b` would be left over from the previous
                # row, or unbound if this is the very first row.
                continue
            if a in contained or b in contained:
                continue
            # The first tig is always entered as '>'; only the second tig's
            # orientation depends on the record's strand.
            bstrand = '<' if o.bstrand == '-' else '>'
            self.graph.add_edge(a, b, '>', bstrand, length=o.score)
示例#2
0
def partition(args):
    """
    %prog partition happy.txt synteny.graph

    Select edges from another graph and merge it with the certain edges built
    from the HAPPY mapping data.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    allowed_format = ("png", "ps")
    p = OptionParser(partition.__doc__)
    p.add_option("--prefix", help="Add prefix to the name [default: %default]")
    p.add_option("--namestart", default=0, type="int",
                 help="Use a shorter name, starting index [default: %default]")
    p.add_option("--format", default="png", choices=allowed_format,
            help="Generate image of format [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    happyfile, graphfile = args
    bg = BiGraph()
    bg.read(graphfile, color="red")
    prefix = opts.prefix
    # The context manager closes the HAPPY file even when a row fails to draw.
    with open(happyfile) as fp:
        # One output image per row of the HAPPY file.
        for i, row in enumerate(fp):
            nns = happy_nodes(row, prefix=prefix)
            nodes = set(nns)
            edges = happy_edges(row, prefix=prefix)

            small_graph = BiGraph()
            for e, is_uncertain in edges:
                if is_uncertain:
                    e.color = "gray"
                small_graph.add_edge(e)

            for (u, v), e in bg.edges.items():
                # Grab edge if both vertices are on the same line
                if u in nodes and v in nodes:
                    uv = (str(u), str(v))
                    if uv in small_graph.edges:
                        e = small_graph.edges[uv]
                        e.color = "blue"  # supported by both evidences
                    else:
                        small_graph.add_edge(e)

            # Explicit write: same output as `print >> sys.stderr, ...` but
            # also valid on Python 3.
            sys.stderr.write(str(small_graph) + "\n")

            pngfile = "A{0:02d}.{1}".format(i + 1, opts.format)
            # First and last node of the row are highlighted in the drawing.
            telomeres = (nns[0], nns[-1])
            small_graph.draw(pngfile, namestart=opts.namestart,
                             nodehighlight=telomeres, dpi=72)

    legend = [
        "Edge colors:",
        "[BLUE] Experimental + Synteny",
        "[BLACK] Experimental certain",
        "[GRAY] Experimental uncertain",
        "[RED] Synteny only",
        "Rectangle nodes are telomeres.",
    ]
    sys.stderr.write("\n".join(legend) + "\n")
示例#3
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance")
    p.set_verbose(help="Print verbose reports to stdout")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args

    def arrow(hit):
        # Graph orientation tag derived from the hit's strand
        return ">" if hit.orientation == "+" else "<"

    # groupby() only merges adjacent rows, so sort on the query column first
    sort([blastfile, "--query"])
    hits = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for _, group in groupby(hits, key=lambda x: x.query):
        group = list(group)
        # --clique pairs every two hits on a query; default pairs neighbours
        if opts.clique:
            pairs = combinations(group, 2)
        else:
            pairs = pairwise(group)

        for left, right in pairs:
            if left.subject == right.subject:
                continue

            dist, _ = range_distance(
                (left.query, left.qstart, left.qstop, "+"),
                (right.query, right.qstart, right.qstop, "+"),
                distmode="ee",
            )
            if dist > opts.maxdist:
                continue

            g.add_edge(left.subject, right.subject, arrow(left), arrow(right))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
示例#4
0
    def graph(self):
        """
        Build and return a BiGraph from the scaffolds yielded by
        self.iter_scaffold(); also records each scaffold's tig names in
        self.scf[scaffold].
        """
        bigraph = BiGraph()
        for name, members in self.iter_scaffold():
            self.scf[name] = [member.tig for member in members]

            if len(members) == 1:
                # Singleton scaffold: nothing to pair up, keep the tig as a node
                bigraph.add_node(members[0].tig)
                continue

            # Link each line to its successor; the edge length comes from
            # the first line of the pair.
            for left, right in pairwise(members):
                bigraph.add_edge(left.tig, right.tig, left.o, right.o,
                                 length=left.gaps)

        return bigraph
示例#5
0
文件: sspace.py 项目: tanghaibao/jcvi
    def graph(self):
        """
        Return a BiGraph with an edge between every two consecutive lines of
        each scaffold. Side effect: self.scf[scaffold] is set to the list of
        tig names of that scaffold.
        """
        result = BiGraph()
        for scaffold_id, entries in self.iter_scaffold():
            tig_names = [entry.tig for entry in entries]
            self.scf[scaffold_id] = tig_names

            for prev, nxt in pairwise(entries):
                result.add_edge(
                    prev.tig, nxt.tig, prev.o, nxt.o, length=prev.gaps
                )

            # A one-line scaffold produces no pair above, so add its tig
            # explicitly as a node.
            if len(entries) == 1:
                result.add_node(entries[0].tig)

        return result
示例#6
0
文件: syntenypath.py 项目: rrane/jcvi
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]",
    )
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]",
    )
    p.add_option(
        "--verbose",
        default=False,
        action="store_true",
        help="Print verbose reports to stdout [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    use_clique, limit = opts.clique, opts.maxdist
    sort([blastfile, "--query"])
    records = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for _, same_query in groupby(records, key=lambda x: x.query):
        same_query = list(same_query)
        # Either all pairs of hits on this query, or consecutive hits only
        if use_clique:
            candidates = combinations(same_query, 2)
        else:
            candidates = pairwise(same_query)

        for first, second in candidates:
            sub1, sub2 = first.subject, second.subject
            if sub1 == sub2:
                continue

            span1 = (first.query, first.qstart, first.qstop, "+")
            span2 = (second.query, second.qstart, second.qstop, "+")
            dist, _ = range_distance(span1, span2, distmode="ee")
            if dist > limit:
                continue

            if first.orientation == "+":
                tag1 = ">"
            else:
                tag1 = "<"
            if second.orientation == "+":
                tag2 = ">"
            else:
                tag2 = "<"
            g.add_edge(BiEdge(sub1, sub2, tag1, tag2))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
示例#7
0
def graph(args):
    """
    %prog graph all.ov contigs.fasta

    Load overlaps to create bidirectional graph.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(graph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    allov, contigsfasta = args
    g = BiGraph()
    contained = set()
    # Sample row (a single line in the file, wrapped here):
    # ctg7180000045865 - ctg7180000086408: a ~ b Overlap: 5108 Identity: 99.02%
    # Orientation: +
    # The context manager closes the overlap file as soon as parsing is done.
    with open(allov) as fp:
        for row in fp:
            atoms = row.strip().split(":")
            pair = atoms[0]
            a, b = pair.split(" - ")
            # Text after the last colon is the orientation sign
            orientation = atoms[-1].strip()
            # Second field reads "<tag> Overlap"; keep only the tag
            tag = atoms[1].replace("Overlap", "").strip()
            oa = ">"
            ob = "<" if orientation == '-' else ">"
            if tag == "a ~ b":
                g.add_edge(BiEdge(a, b, oa, ob))
            elif tag == "b ~ a":
                g.add_edge(BiEdge(b, a, ob, oa))
            elif tag == "a in b":
                contained.add(a)
            elif tag == "b in a":
                contained.add(b)
    graph_to_agp(g, allov, contigsfasta)

    # Contained contig ids go to a side file, one id per line.
    containedfile = "contained.ids"
    with open(containedfile, "w") as fw:
        # Same output as `print >> fw, ...`, but also valid on Python 3.
        fw.write("\n".join(contained) + "\n")
0
class OVL(LineFile):
    """
    Overlap file parsed row by row into OVLLine records.

    Exposes `contained` (tigs tagged "a in b" / "b in a") and `graph`, a
    BiGraph built from the records flagged `best` whose tigs are not contained.
    """

    def __init__(self, filename):
        super(OVL, self).__init__(filename)
        contained = set()
        # Directed links bucketed by tig `a` plus an end label chosen by tag
        # (presumably the 3'/5' end of the tig -- confirm against OVLLine).
        alledges = defaultdict(list)
        for row in must_open(filename):
            link = OVLLine(row)
            self.append(link)
            tag = link.tag
            if tag == "a in b":
                contained.add(link.a)
            elif tag == "b in a":
                contained.add(link.b)
            elif tag == "a->b":
                alledges[link.a + "-3`"].append(link)
            elif tag == "b->a":
                alledges[link.a + "-5`"].append(link)
        logging.debug(
            "Imported {} links. Contained tigs: {}".format(len(self), len(contained))
        )
        self.contained = contained

        logging.debug("Pruning edges to keep the mutual best")
        # Flag the highest-scoring link of every bucket
        for bucket in alledges.values():
            max(bucket, key=lambda x: x.score).best = True

        self.graph = BiGraph()
        for link in self:
            if not link.best:
                continue
            if link.tag == "a->b":
                a, b = link.a, link.b
            elif link.tag == "b->a":
                a, b = link.b, link.a
            if not (a not in contained and b not in contained):
                continue
            if link.bstrand == "-":
                bstrand = "<"
            else:
                bstrand = ">"
            self.graph.add_edge(a, b, ">", bstrand, length=link.score)
0
class OVL (LineFile):
    """
    Container of OVLLine records read from an overlap file.

    `contained` collects the tigs tagged "a in b" / "b in a"; `graph` is a
    BiGraph holding one edge per record marked `best`, skipping any record
    that involves a contained tig.
    """

    def __init__(self, filename):
        super(OVL, self).__init__(filename)
        handle = must_open(filename)
        inner = set()
        per_end = defaultdict(list)
        # Suffix appended to tig `a` to form the bucket key for each link tag
        end_label = {"a->b": "-3`", "b->a": "-5`"}
        for line in handle:
            rec = OVLLine(line)
            self.append(rec)
            if rec.tag == "a in b":
                inner.add(rec.a)
            elif rec.tag == "b in a":
                inner.add(rec.b)
            label = end_label.get(rec.tag)
            if label is not None:
                per_end[rec.a + label].append(rec)
        logging.debug(
            "Imported {} links. Contained tigs: {}".format(len(self), len(inner)))
        self.contained = inner

        logging.debug("Pruning edges to keep the mutual best")
        # Within each bucket only the top-scoring record is marked as best
        for records in per_end.values():
            top = max(records, key=lambda x: x.score)
            top.best = True

        self.graph = BiGraph()
        for rec in self:
            if not rec.best:
                continue
            if rec.tag == "a->b":
                a, b = rec.a, rec.b
            elif rec.tag == "b->a":
                a, b = rec.b, rec.a
            if a in inner or b in inner:
                continue
            strand_b = '<' if rec.bstrand == '-' else '>'
            self.graph.add_edge(a, b, '>', strand_b, length=rec.score)
示例#10
0
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    p = OptionParser(connect.__doc__)
    p.add_option("--clip", default=2000, type="int",
                 help="Only consider end of contigs")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    def by_query(hit):
        return hit.query

    def touches_end(hit):
        # Keep a hit unless it lies strictly inside the contig, i.e. more
        # than `clip` away from both ends.
        size = sizes[hit.subject]
        head, tail = min(size, clip), max(0, size - clip)
        return hit.sstart <= head or hit.sstop >= tail

    end_hits = sorted(
        (hit for hit in Blast(blastfile) if touches_end(hit)), key=by_query
    )

    g = BiGraph()
    for _, group in groupby(end_hits, key=by_query):
        hits = sorted(group, key=lambda x: x.qstart)
        # A read that only hits one contig cannot connect anything
        if len(set(x.subject for x in hits)) == 1:
            continue
        print("\n".join(str(x) for x in hits))
        for first, second in pairwise(hits):
            if first.subject == second.subject:
                continue

            ov = range_intersect(
                (first.qstart, first.qstop), (second.qstart, second.qstop)
            )
            alen = first.qstop - first.qstart + 1
            blen = second.qstop - second.qstart + 1
            if ov:
                # Turn the overlapping interval into its length
                lo, hi = ov
                ov = hi - lo + 1

            print(ov, alen, blen)
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            tag1 = ">" if first.orientation == "+" else "<"
            tag2 = ">" if second.orientation == "+" else "<"
            g.add_edge(first.subject, second.subject, tag1, tag2)

    graph_to_agp(g, blastfile, fastafile, verbose=False)
示例#11
0
def partition(args):
    """
    %prog partition happy.txt synteny.graph

    Select edges from another graph and merge it with the certain edges built
    from the HAPPY mapping data.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    allowed_format = ("png", "ps")
    p = OptionParser(partition.__doc__)
    p.add_option("--prefix", help="Add prefix to the name")
    p.add_option(
        "--namestart",
        default=0,
        type="int",
        help="Use a shorter name, starting index",
    )
    p.add_option(
        "--format",
        default="png",
        choices=allowed_format,
        help="Generate image of format",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    happyfile, graphfile = args
    bg = BiGraph()
    bg.read(graphfile, color="red")
    prefix = opts.prefix
    # The context manager closes the HAPPY file even when a row fails to draw.
    with open(happyfile) as fp:
        # One output image per row of the HAPPY file.
        for i, row in enumerate(fp):
            nns = happy_nodes(row, prefix=prefix)
            nodes = set(nns)
            edges = happy_edges(row, prefix=prefix)

            small_graph = BiGraph()
            for (a, b, oa, ob), is_uncertain in edges:
                color = "gray" if is_uncertain else "black"
                small_graph.add_edge(a, b, oa, ob, color=color)

            for (u, v), e in bg.edges.items():
                # Grab edge if both vertices are on the same line
                if u in nodes and v in nodes:
                    uv = (str(u), str(v))
                    if uv in small_graph.edges:
                        e = small_graph.edges[uv]
                        e.color = "blue"  # supported by both evidences
                    else:
                        # NOTE(review): add_edge receives a single edge object
                        # here but (a, b, oa, ob) above -- confirm that
                        # BiGraph.add_edge accepts both call forms.
                        small_graph.add_edge(e)

            print(small_graph, file=sys.stderr)

            pngfile = "A{0:02d}.{1}".format(i + 1, opts.format)
            # First and last node of the row are highlighted in the drawing.
            telomeres = (nns[0], nns[-1])
            small_graph.draw(
                pngfile, namestart=opts.namestart, nodehighlight=telomeres, dpi=72
            )

    legend = [
        "Edge colors:",
        "[BLUE] Experimental + Synteny",
        "[BLACK] Experimental certain",
        "[GRAY] Experimental uncertain",
        "[RED] Synteny only",
        "Rectangle nodes are telomeres.",
    ]
    print("\n".join(legend), file=sys.stderr)
示例#12
0
文件: scaffold.py 项目: rrane/jcvi
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.blast import Blast
    from jcvi.utils.iter import pairwise
    from jcvi.utils.range import range_intersect
    from jcvi.algorithms.graph import BiGraph, BiEdge
    from jcvi.assembly.syntenypath import graph_to_agp

    p = OptionParser(connect.__doc__)
    p.add_option("--clip", default=2000, type="int",
            help="Only consider end of contigs [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping
    blast = Blast(blastfile)
    blasts = []
    for b in blast:
        seqid = b.subject
        size = sizes[seqid]
        start, end = b.sstart, b.sstop
        cstart, cend = min(size, clip), max(0, size - clip)
        # Drop hits lying strictly inside the contig, i.e. more than `clip`
        # away from both ends.
        if start > cstart and end < cend:
            continue
        blasts.append(b)

    key = lambda x: x.query
    # groupby() only merges adjacent items, hence the sort on the same key
    blasts.sort(key=key)
    g = BiGraph()
    for query, bb in groupby(blasts, key=key):
        bb = sorted(bb, key=lambda x: x.qstart)
        nsubjects = len(set(x.subject for x in bb))
        # A read that only hits one contig cannot connect anything
        if nsubjects == 1:
            continue
        # The print calls below take one pre-formatted string so that they
        # emit the same text on Python 2 and Python 3.
        print("\n".join(str(x) for x in bb))
        for a, b in pairwise(bb):
            astart, astop = a.qstart, a.qstop
            bstart, bstop = b.qstart, b.qstop
            if a.subject == b.subject:
                continue

            arange = astart, astop
            brange = bstart, bstop
            ov = range_intersect(arange, brange)
            alen = astop - astart + 1
            blen = bstop - bstart + 1
            if ov:
                # Turn the overlapping interval into its length
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print("{0} {1} {2}".format(ov, alen, blen))
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            asub = a.subject
            bsub = b.subject
            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            e = BiEdge(asub, bsub, atag, btag)
            g.add_edge(e)
            print("{0} {1}".format("=" * 5, e))

    graph_to_agp(g, blastfile, fastafile, verbose=False)
示例#13
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose",
                 default=False,
                 action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    # groupby() only merges adjacent rows, so sort on the query column first
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # --clique pairs every two hits on a query; default pairs neighbours
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print() emits the same text on Python 2 and 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    # The context manager closes the AGP file even if writing fails midway.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) \
                         for ctg, strand in oo]
            scaffolded |= set(ctg for ctg, strand in ctgorder)
            # `objname` instead of `object` so the builtin is not shadowed
            objname = "pmol_{0:04d}".format(i)
            order_to_agp(objname, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        nsingletons = 0
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            ctgorder = [(ctg, "+")]
            order_to_agp(ctg, ctgorder, sizes.mapping, fwagp)
            nsingletons += 1
        logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    logging.debug("AGP file written to `{0}`.".format(agpfile))
示例#14
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    # groupby() only merges adjacent rows, so sort on the query column first
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # --clique pairs every two hits on a query; default pairs neighbours
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print() emits the same text on Python 2 and 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    # try/finally guarantees the AGP handle is released if a write fails.
    try:
        scaffolded = set()
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) \
                         for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            # Renamed from `object` so the builtin is not shadowed
            agp_object = "pmol_{0:04d}".format(i)
            order_to_agp(agp_object, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        nsingletons = 0
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            ctgorder = [(ctg, "+")]
            agp_object = ctg
            order_to_agp(agp_object, ctgorder, sizes.mapping, fwagp)
            nsingletons += 1
        logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    finally:
        fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))