class OVL (LineFile): def __init__(self, filename): super(OVL, self).__init__(filename) fp = must_open(filename) contained = set() for row in fp: o = OVLLine(row) self.append(o) if o.tag == "a in b": contained.add(o.a) elif o.tag == "b in a": contained.add(o.b) logging.debug("Imported {} links. Contained tigs: {}".\ format(len(self), len(contained))) self.contained = contained self.graph = BiGraph() for o in self: if o.tag == "a->b": a, b = o.a, o.b elif o.tag == "b->a": a, b = o.b, o.a if a in contained or b in contained: continue bstrand = '<' if o.bstrand == '-' else '>' self.graph.add_edge(a, b, '>', bstrand, length=o.score)
def partition(args): """ %prog partition happy.txt synteny.graph Select edges from another graph and merge it with the certain edges built from the HAPPY mapping data. """ allowed_format = ("png", "ps") p = OptionParser(partition.__doc__) p.add_option("--prefix", help="Add prefix to the name [default: %default]") p.add_option("--namestart", default=0, type="int", help="Use a shorter name, starting index [default: %default]") p.add_option("--format", default="png", choices=allowed_format, help="Generate image of format [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) happyfile, graphfile = args bg = BiGraph() bg.read(graphfile, color="red") prefix = opts.prefix fp = open(happyfile) for i, row in enumerate(fp): nns = happy_nodes(row, prefix=prefix) nodes = set(nns) edges = happy_edges(row, prefix=prefix) small_graph = BiGraph() for e, is_uncertain in edges: if is_uncertain: e.color = "gray" small_graph.add_edge(e) for (u, v), e in bg.edges.items(): # Grab edge if both vertices are on the same line if u in nodes and v in nodes: uv = (str(u), str(v)) if uv in small_graph.edges: e = small_graph.edges[uv] e.color = "blue" # supported by both evidences else: small_graph.add_edge(e) print >> sys.stderr, small_graph pngfile = "A{0:02d}.{1}".format(i + 1, opts.format) telomeres = (nns[0], nns[-1]) small_graph.draw(pngfile, namestart=opts.namestart, nodehighlight=telomeres, dpi=72) legend = ["Edge colors:"] legend.append("[BLUE] Experimental + Synteny") legend.append("[BLACK] Experimental certain") legend.append("[GRAY] Experimental uncertain") legend.append("[RED] Synteny only") legend.append("Rectangle nodes are telomeres.") print >> sys.stderr, "\n".join(legend)
def fromblast(args): """ %prog fromblast blastfile subject.fasta Generate path from BLAST file. If multiple subjects map to the same query, an edge is constructed between them (with the link provided by the query). The BLAST file MUST be filtered, chained, supermapped. """ from jcvi.formats.blast import sort from jcvi.utils.range import range_distance p = OptionParser(fromblast.__doc__) p.add_option( "--clique", default=False, action="store_true", help="Populate clique instead of linear path", ) p.add_option( "--maxdist", default=100000, type="int", help="Create edge within certain distance", ) p.set_verbose(help="Print verbose reports to stdout") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, subjectfasta = args clique = opts.clique maxdist = opts.maxdist sort([blastfile, "--query"]) blast = BlastSlow(blastfile, sorted=True) g = BiGraph() for query, blines in groupby(blast, key=lambda x: x.query): blines = list(blines) iterator = combinations(blines, 2) if clique else pairwise(blines) for a, b in iterator: asub, bsub = a.subject, b.subject if asub == bsub: continue arange = (a.query, a.qstart, a.qstop, "+") brange = (b.query, b.qstart, b.qstop, "+") dist, oo = range_distance(arange, brange, distmode="ee") if dist > maxdist: continue atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" g.add_edge(asub, bsub, atag, btag) graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
def graph(self): g = BiGraph() for scaffold, lines in self.iter_scaffold(): self.scf[scaffold] = [x.tig for x in lines] for a, b in pairwise(lines): g.add_edge(a.tig, b.tig, a.o, b.o, length=a.gaps) if len(lines) == 1: # Singleton scaffold a = lines[0] g.add_node(a.tig) return g
def fromblast(args): """ %prog fromblast blastfile subject.fasta Generate path from BLAST file. If multiple subjects map to the same query, an edge is constructed between them (with the link provided by the query). The BLAST file MUST be filtered, chained, supermapped. """ from jcvi.formats.blast import sort from jcvi.utils.range import range_distance p = OptionParser(fromblast.__doc__) p.add_option("--clique", default=False, action="store_true", help="Populate clique instead of linear path [default: %default]") p.add_option("--maxdist", default=100000, type="int", help="Create edge within certain distance [default: %default]") p.add_option("--verbose", default=False, action="store_true", help="Print verbose reports to stdout [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, subjectfasta = args clique = opts.clique maxdist = opts.maxdist sort([blastfile, "--query"]) blast = BlastSlow(blastfile, sorted=True) g = BiGraph() for query, blines in groupby(blast, key=lambda x: x.query): blines = list(blines) iterator = combinations(blines, 2) if clique else pairwise(blines) for a, b in iterator: asub, bsub = a.subject, b.subject if asub == bsub: continue arange = (a.query, a.qstart, a.qstop, "+") brange = (b.query, b.qstart, b.qstop, "+") dist, oo = range_distance(arange, brange, distmode="ee") if dist > maxdist: continue atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" g.add_edge(BiEdge(asub, bsub, atag, btag)) graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
def graph(args): """ %prog graph all.ov contigs.fasta Load overlaps to create bidirectional graph. """ from jcvi.algorithms.graph import BiGraph, BiEdge from jcvi.assembly.syntenypath import graph_to_agp p = OptionParser(graph.__doc__) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) allov, contigsfasta = args g = BiGraph() fp = open(allov) # ctg7180000045865 - ctg7180000086408: a ~ b Overlap: 5108 Identity: 99.02% # Orientation: + contained = set() for row in fp: atoms = row.strip().split(":") pair = atoms[0] a, b = pair.split(" - ") orientation = atoms[-1].strip() tag = atoms[1].replace("Overlap", "").strip() oa = ">" ob = "<" if orientation == '-' else ">" if tag == "a ~ b": g.add_edge(BiEdge(a, b, oa, ob)) elif tag == "b ~ a": g.add_edge(BiEdge(b, a, ob, oa)) elif tag == "a in b": contained.add(a) elif tag == "b in a": contained.add(b) graph_to_agp(g, allov, contigsfasta) containedfile = "contained.ids" fw = open(containedfile, "w") print >> fw, "\n".join(contained) fw.close()
class OVL(LineFile): def __init__(self, filename): super(OVL, self).__init__(filename) fp = must_open(filename) contained = set() alledges = defaultdict(list) for row in fp: o = OVLLine(row) self.append(o) if o.tag == "a in b": contained.add(o.a) elif o.tag == "b in a": contained.add(o.b) if o.tag == "a->b": alledges[o.a + "-3`"].append(o) elif o.tag == "b->a": alledges[o.a + "-5`"].append(o) logging.debug( "Imported {} links. Contained tigs: {}".format(len(self), len(contained)) ) self.contained = contained logging.debug("Pruning edges to keep the mutual best") for k, v in alledges.items(): bo = max(v, key=lambda x: x.score) bo.best = True self.graph = BiGraph() for o in self: if not o.best: continue if o.tag == "a->b": a, b = o.a, o.b elif o.tag == "b->a": a, b = o.b, o.a if a in contained or b in contained: continue bstrand = "<" if o.bstrand == "-" else ">" self.graph.add_edge(a, b, ">", bstrand, length=o.score)
class OVL (LineFile): def __init__(self, filename): super(OVL, self).__init__(filename) fp = must_open(filename) contained = set() alledges = defaultdict(list) for row in fp: o = OVLLine(row) self.append(o) if o.tag == "a in b": contained.add(o.a) elif o.tag == "b in a": contained.add(o.b) if o.tag == "a->b": alledges[o.a + "-3`"].append(o) elif o.tag == "b->a": alledges[o.a + "-5`"].append(o) logging.debug("Imported {} links. Contained tigs: {}".\ format(len(self), len(contained))) self.contained = contained logging.debug("Pruning edges to keep the mutual best") for k, v in alledges.items(): bo = max(v, key=lambda x: x.score) bo.best = True self.graph = BiGraph() for o in self: if not o.best: continue if o.tag == "a->b": a, b = o.a, o.b elif o.tag == "b->a": a, b = o.b, o.a if a in contained or b in contained: continue bstrand = '<' if o.bstrand == '-' else '>' self.graph.add_edge(a, b, '>', bstrand, length=o.score)
def connect(args): """ %prog connect assembly.fasta read_mapping.blast Connect contigs using long reads. """ p = OptionParser(connect.__doc__) p.add_option( "--clip", default=2000, type="int", help="Only consider end of contigs", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) fastafile, blastfile = args clip = opts.clip sizes = Sizes(fastafile).mapping blast = Blast(blastfile) blasts = [] for b in blast: seqid = b.subject size = sizes[seqid] start, end = b.sstart, b.sstop cstart, cend = min(size, clip), max(0, size - clip) if start > cstart and end < cend: continue blasts.append(b) key = lambda x: x.query blasts.sort(key=key) g = BiGraph() for query, bb in groupby(blasts, key=key): bb = sorted(bb, key=lambda x: x.qstart) nsubjects = len(set(x.subject for x in bb)) if nsubjects == 1: continue print("\n".join(str(x) for x in bb)) for a, b in pairwise(bb): astart, astop = a.qstart, a.qstop bstart, bstop = b.qstart, b.qstop if a.subject == b.subject: continue arange = astart, astop brange = bstart, bstop ov = range_intersect(arange, brange) alen = astop - astart + 1 blen = bstop - bstart + 1 if ov: ostart, ostop = ov ov = ostop - ostart + 1 print(ov, alen, blen) if ov and (ov > alen / 2 or ov > blen / 2): print("Too much overlap ({0})".format(ov)) continue asub = a.subject bsub = b.subject atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" g.add_edge(asub, bsub, atag, btag) graph_to_agp(g, blastfile, fastafile, verbose=False)
def partition(args): """ %prog partition happy.txt synteny.graph Select edges from another graph and merge it with the certain edges built from the HAPPY mapping data. """ allowed_format = ("png", "ps") p = OptionParser(partition.__doc__) p.add_option("--prefix", help="Add prefix to the name") p.add_option( "--namestart", default=0, type="int", help="Use a shorter name, starting index", ) p.add_option( "--format", default="png", choices=allowed_format, help="Generate image of format", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) happyfile, graphfile = args bg = BiGraph() bg.read(graphfile, color="red") prefix = opts.prefix fp = open(happyfile) for i, row in enumerate(fp): nns = happy_nodes(row, prefix=prefix) nodes = set(nns) edges = happy_edges(row, prefix=prefix) small_graph = BiGraph() for (a, b, oa, ob), is_uncertain in edges: color = "gray" if is_uncertain else "black" small_graph.add_edge(a, b, oa, ob, color=color) for (u, v), e in bg.edges.items(): # Grab edge if both vertices are on the same line if u in nodes and v in nodes: uv = (str(u), str(v)) if uv in small_graph.edges: e = small_graph.edges[uv] e.color = "blue" # supported by both evidences else: small_graph.add_edge(e) print(small_graph, file=sys.stderr) pngfile = "A{0:02d}.{1}".format(i + 1, opts.format) telomeres = (nns[0], nns[-1]) small_graph.draw( pngfile, namestart=opts.namestart, nodehighlight=telomeres, dpi=72 ) legend = [ "Edge colors:", "[BLUE] Experimental + Synteny", "[BLACK] Experimental certain", "[GRAY] Experimental uncertain", "[RED] Synteny only", "Rectangle nodes are telomeres.", ] print("\n".join(legend), file=sys.stderr)
def connect(args): """ %prog connect assembly.fasta read_mapping.blast Connect contigs using long reads. """ from jcvi.formats.sizes import Sizes from jcvi.formats.blast import Blast from jcvi.utils.iter import pairwise from jcvi.utils.range import range_intersect from jcvi.algorithms.graph import BiGraph, BiEdge from jcvi.assembly.syntenypath import graph_to_agp p = OptionParser(connect.__doc__) p.add_option("--clip", default=2000, type="int", help="Only consider end of contigs [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) fastafile, blastfile = args clip = opts.clip sizes = Sizes(fastafile).mapping blast = Blast(blastfile) blasts = [] for b in blast: seqid = b.subject size = sizes[seqid] start, end = b.sstart, b.sstop cstart, cend = min(size, clip), max(0, size - clip) if start > cstart and end < cend: continue blasts.append(b) key = lambda x: x.query blasts.sort(key=key) g = BiGraph() for query, bb in groupby(blasts, key=key): bb = sorted(bb, key=lambda x: x.qstart) nsubjects = len(set(x.subject for x in bb)) if nsubjects == 1: continue print "\n".join(str(x) for x in bb) for a, b in pairwise(bb): astart, astop = a.qstart, a.qstop bstart, bstop = b.qstart, b.qstop if a.subject == b.subject: continue arange = astart, astop brange = bstart, bstop ov = range_intersect(arange, brange) alen = astop - astart + 1 blen = bstop - bstart + 1 if ov: ostart, ostop = ov ov = ostop - ostart + 1 print ov, alen, blen if ov and (ov > alen / 2 or ov > blen / 2): print "Too much overlap ({0})".format(ov) continue asub = a.subject bsub = b.subject atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" e = BiEdge(asub, bsub, atag, btag) g.add_edge(e) print "=" * 5, e graph_to_agp(g, blastfile, fastafile, verbose=False)
def fromblast(args): """ %prog fromblast blastfile subject.fasta Generate path from BLAST file. If multiple subjects map to the same query, an edge is constructed between them (with the link provided by the query). The BLAST file MUST be filtered, chained, supermapped. """ from jcvi.formats.blast import sort from jcvi.utils.range import range_distance p = OptionParser(fromblast.__doc__) p.add_option( "--clique", default=False, action="store_true", help="Populate clique instead of linear path [default: %default]") p.add_option( "--maxdist", default=100000, type="int", help="Create edge within certain distance [default: %default]") p.add_option("--verbose", default=False, action="store_true", help="Print verbose reports to stdout [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, subjectfasta = args clique = opts.clique maxdist = opts.maxdist sort([blastfile, "--query"]) blast = BlastSlow(blastfile, sorted=True) g = BiGraph() for query, blines in groupby(blast, key=lambda x: x.query): blines = list(blines) iterator = combinations(blines, 2) if clique else pairwise(blines) for a, b in iterator: asub, bsub = a.subject, b.subject if asub == bsub: continue arange = (a.query, a.qstart, a.qstop, "+") brange = (b.query, b.qstart, b.qstop, "+") dist, oo = range_distance(arange, brange, distmode="ee") if dist > maxdist: continue atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" g.add_edge(BiEdge(asub, bsub, atag, btag)) g.write("graph.txt") #g.draw("graph.pdf") logging.debug(str(g)) paths = [] for path in g.iter_paths(): m, oo = g.path(path) if len(oo) == 1: # Singleton path continue paths.append(oo) if opts.verbose: print m print oo npaths = len(paths) ntigs = sum(len(x) for x in paths) logging.debug("Graph decomposed to {0} paths with {1} components.".\ format(npaths, ntigs)) agpfile = blastfile + ".agp" sizes = Sizes(subjectfasta) fwagp = open(agpfile, "w") scaffolded = set() for i, oo in enumerate(paths): ctgorder = [(str(ctg), ("+" if strand else "-")) \ for ctg, strand in oo] scaffolded |= set(ctg for ctg, strand in ctgorder) object = "pmol_{0:04d}".format(i) order_to_agp(object, ctgorder, sizes.mapping, fwagp) # Get the singletons as well nsingletons = 0 for ctg, size in sizes.iter_sizes(): if ctg in scaffolded: continue ctgorder = [(ctg, "+")] object = ctg order_to_agp(object, ctgorder, sizes.mapping, fwagp) nsingletons += 1 logging.debug("Written {0} unscaffolded singletons.".format(nsingletons)) fwagp.close() logging.debug("AGP file written to `{0}`.".format(agpfile))
def fromblast(args): """ %prog fromblast blastfile subject.fasta Generate path from BLAST file. If multiple subjects map to the same query, an edge is constructed between them (with the link provided by the query). The BLAST file MUST be filtered, chained, supermapped. """ from jcvi.formats.blast import sort from jcvi.utils.range import range_distance p = OptionParser(fromblast.__doc__) p.add_option("--clique", default=False, action="store_true", help="Populate clique instead of linear path [default: %default]") p.add_option("--maxdist", default=100000, type="int", help="Create edge within certain distance [default: %default]") p.add_option("--verbose", default=False, action="store_true", help="Print verbose reports to stdout [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) blastfile, subjectfasta = args clique = opts.clique maxdist = opts.maxdist sort([blastfile, "--query"]) blast = BlastSlow(blastfile, sorted=True) g = BiGraph() for query, blines in groupby(blast, key=lambda x: x.query): blines = list(blines) iterator = combinations(blines, 2) if clique else pairwise(blines) for a, b in iterator: asub, bsub = a.subject, b.subject if asub == bsub: continue arange = (a.query, a.qstart, a.qstop, "+") brange = (b.query, b.qstart, b.qstop, "+") dist, oo = range_distance(arange, brange, distmode="ee") if dist > maxdist: continue atag = ">" if a.orientation == "+" else "<" btag = ">" if b.orientation == "+" else "<" g.add_edge(BiEdge(asub, bsub, atag, btag)) g.write("graph.txt") #g.draw("graph.pdf") logging.debug(str(g)) paths = [] for path in g.iter_paths(): m, oo = g.path(path) if len(oo) == 1: # Singleton path continue paths.append(oo) if opts.verbose: print m print oo npaths = len(paths) ntigs = sum(len(x) for x in paths) logging.debug("Graph decomposed to {0} paths with {1} components.".\ format(npaths, ntigs)) agpfile = blastfile + ".agp" sizes = Sizes(subjectfasta) fwagp = open(agpfile, "w") scaffolded = set() for i, oo in enumerate(paths): ctgorder = [(str(ctg), ("+" if strand else "-")) \ for ctg, strand in oo] scaffolded |= set(ctg for ctg, strand in ctgorder) object = "pmol_{0:04d}".format(i) order_to_agp(object, ctgorder, sizes.mapping, fwagp) # Get the singletons as well nsingletons = 0 for ctg, size in sizes.iter_sizes(): if ctg in scaffolded: continue ctgorder = [(ctg, "+")] object = ctg order_to_agp(object, ctgorder, sizes.mapping, fwagp) nsingletons += 1 logging.debug("Written {0} unscaffolded singletons.".format(nsingletons)) fwagp.close() logging.debug("AGP file written to `{0}`.".format(agpfile))