def path(self, path):
    """
    Describe a walk through the graph.

    Returns a 2-tuple: a printable label (the traversed edges joined by "|",
    or "Singleton <node>" for a one-node walk) and a list of
    (node id, is_forward) pairs, is_forward being True when the orientation
    tag of the node is ">".
    """
    from jcvi.utils.iter import pairwise

    if len(path) == 1:
        only = path[0]
        label = "Singleton {0}".format(only)
        return label, [(only.v, True)]

    oo = []
    labels = []
    for a, b in pairwise(path):
        lo, hi = a.v, b.v
        # Edges are looked up under the (smaller, larger) key; when the walk
        # crosses one the other way round it is flipped for the duration.
        flip = lo > hi
        if flip:
            lo, hi = hi, lo
        e = self.edges[(lo, hi)]
        if flip:
            e.flip()

        head = (e.v1.v, e.o1 == ">")
        if not oo:  # First edge imports two nodes
            oo.append(head)
        # Consecutive edges have to agree on the node they share
        assert oo[-1] == head
        oo.append((e.v2.v, e.o2 == ">"))

        labels.append(str(e))
        if flip:
            e.flip()  # flip back once the label has been taken

    return "|".join(labels), oo
def range_interleave(ranges):
    """
    Returns the ranges in between the given ranges.

    The input is merged first, then the gaps between neighbouring merged
    ranges of the same sequence are reported; empty gaps are dropped.

    >>> ranges = [("1", 30, 40), ("1", 45, 50), ("1", 10, 30)]
    >>> range_interleave(ranges)
    [('1', 41, 44)]
    >>> ranges = [("1", 30, 40), ("1", 42, 50)]
    >>> range_interleave(ranges)
    [('1', 41, 41)]
    """
    from jcvi.utils.iter import pairwise

    merged = range_merge(ranges)
    interleaved_ranges = []
    for _, cranges in groupby(merged, key=lambda x: x[0]):
        for (_, _, aend), (ch, bstart, _) in pairwise(cranges):
            istart, iend = aend + 1, bstart - 1
            if istart <= iend:
                interleaved_ranges.append((ch, istart, iend))

    return interleaved_ranges
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
                 help="Distance mode between paired reads. ss is outer distance, "
                      "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    # Context manager: the sorted bed file is closed even if a line fails to parse
    with open(sortedbedfile) as fp:
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, oo = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:
                # Single-argument call form prints the same on Python 2 and 3
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".
                  format(percentage(valid, total)))
def path(self, path, flip=False): oo = [] if len(path) == 1: m = "Singleton {0}".format(path[0]) oo.append((path[0].v, True)) return m, oo edges = [] for a, b in pairwise(path): av, bv = a.v, b.v e = self.get_edge(av, bv) if not oo: # First edge imports two nodes oo.append((e.v1.v, e.o1 == ">")) last = oo[-1] assert last == (e.v1.v, e.o1 == ">") oo.append((e.v2.v, e.o2 == ">")) if flip: se = str(e) e.flip() else: se = str(e) edges.append(se) return "|".join(edges), oo
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains
        pairs of lines. The first line in each pair contains an identifier,
        this contig length in bp, and the number of restriction sites, separated
        by white space. The second line contains a white space delimited list
        of the restriction site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of
        the fragment and the standard deviation (both in kb), separated by
        white space. The standard deviation is ignored.
    """
    p = OptionParser(silicosoma.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    silicofile, = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    # Only the first pair of lines is read. The next() builtin is used because
    # the .next() method does not exist on Python 3 file objects.
    next(fp)  # identifier line
    positions = [int(x) for x in next(fp).split()]
    for a, b in pairwise(positions):
        assert a <= b
        fragsize = int(round((b - a) / 1000.))  # kb
        if fragsize:
            # The second column (standard deviation) is always written as 0
            fw.write("{0} 0\n".format(fragsize))
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    p = OptionParser(chimera.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    selected = select_bed(bed)

    mapped = defaultdict(set)  # scaffold => chr
    members = defaultdict(list)  # scaffold => [(bed line, parsed range)]
    # One pass collects both tables, so the bed lines need not be re-scanned
    # for every chimeric scaffold further down.
    for b in selected:
        rr = range_parse(b.accn)
        mapped[rr.seqid].add(b.seqid)
        members[rr.seqid].append((b, rr))

    chimerabed = "chimera.bed"
    nchimera = 0
    with open(chimerabed, "w") as fw:
        for s, chrs in sorted(mapped.items()):
            if len(chrs) == 1:
                continue

            sys.stderr.write("=" * 80 + "\n")
            sys.stderr.write("{0} mapped to multiple locations: {1}\n".
                             format(s, ",".join(sorted(chrs))))
            ranges = []
            for b, rr in members[s]:
                sys.stderr.write(str(b) + "\n")
                ranges.append(rr)

            # Identify breakpoints
            ranges.sort(key=lambda x: (x.seqid, x.start, x.end))
            for a, b in pairwise(ranges):
                seqid = a.seqid
                if seqid != b.seqid:
                    continue

                # The junction lies between the end of one mapped range and
                # the start of the next; overlapping ranges are swapped so
                # that start <= end.
                start, end = a.end, b.start
                if start > end:
                    start, end = end, start

                chimeraline = "\t".join(str(x) for x in (seqid, start, end))
                fw.write(chimeraline + "\n")
                sys.stderr.write(chimeraline + "\n")
                nchimera += 1

    logging.debug("A total of {0} junctions written to `{1}`.".
                  format(nchimera, chimerabed))
def min_feedback_arc_set(edges, remove=False, maxcycles=20000):
    """
    A directed graph may contain directed cycles, when such cycles are
    undesirable, we wish to eliminate them and obtain a directed acyclic graph
    (DAG). A feedback arc set has the property that it has at least one edge
    of every cycle in the graph. A minimum feedback arc set is the set that
    minimizes the total weight of the removed edges; or alternatively maximize
    the remaining edges. See: <http://en.wikipedia.org/wiki/Feedback_arc_set>.

    The MIP formulation proceeds as follows: use 0/1 indicator variable to
    select whether an edge is in the set, subject to constraint that each cycle
    must pick at least one such edge.

    At most `maxcycles` cycles are turned into constraints. With `remove` set,
    the edges that are NOT selected are returned instead of the selected ones.

    >>> g = [(1, 2, 2), (2, 3, 2), (3, 4, 2)] + [(1, 3, 1), (3, 2, 1), (2, 4, 1)]
    >>> min_feedback_arc_set(g)
    ([(3, 2, 1)], 1)
    >>> min_feedback_arc_set(g, remove=True)  # Return DAG
    ([(1, 2, 2), (2, 3, 2), (3, 4, 2), (1, 3, 1), (2, 4, 1)], 1)
    """
    G = nx.DiGraph()
    edge_to_index = {}
    for i, (a, b, w) in enumerate(edges):
        G.add_edge(a, b)
        edge_to_index[a, b] = i

    nedges = len(edges)
    L = LPInstance()
    L.add_objective(edges, objective=MINIMIZE)

    constraints = []
    ncycles = 0
    for c in nx.simple_cycles(G):
        closed = c + [c[0]]  # close the cycle back onto its first node
        cycle_edges = [edge_to_index[a, b] for a, b in pairwise(closed)]
        constraints.append("{0} >= 1".format(summation(cycle_edges)))
        ncycles += 1
        if ncycles == maxcycles:
            break
    logging.debug("A total of {0} cycles found.".format(ncycles))

    L.constraints = constraints
    L.add_vars(nedges)

    selected, obj_val = L.lpsolve(clean=False)
    if not selected:
        results = None
    elif remove:
        results = [x for i, x in enumerate(edges) if i not in selected]
    else:
        results = [x for i, x in enumerate(edges) if i in selected]

    return results, obj_val
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]",
    )
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]",
    )
    p.set_verbose(help="Print verbose reports to stdout")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)

    def orient(hit):
        # ">" for a forward hit, "<" otherwise
        return ">" if hit.orientation == "+" else "<"

    g = BiGraph()
    for _, hits in groupby(blast, key=lambda x: x.query):
        hits = list(hits)
        # Either every pair of hits on the query, or only neighbouring hits
        pairs = combinations(hits, 2) if opts.clique else pairwise(hits)
        for a, b in pairs:
            if a.subject == b.subject:
                continue

            dist, _ = range_distance(
                (a.query, a.qstart, a.qstop, "+"),
                (b.query, b.qstart, b.qstop, "+"),
                distmode="ee",
            )
            if dist > opts.maxdist:
                continue

            g.add_edge(a.subject, b.subject, orient(a), orient(b))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
def plot_data(x, y, tour, M):
    """
    Plot the points (x, y) as red dots, join consecutive stops of `tour` with
    red segments, put the tour score in the title and save to demo.pdf.
    """
    from jcvi.graphics.base import plt, savefig

    plt.plot(x, y, "ro")
    for src, dst in pairwise(tour):
        xs = (x[src], x[dst])
        ys = (y[src], y[dst])
        plt.plot(xs, ys, "r-")

    plt.title("Score={0:.2f}".format(evaluate(tour, M)))
    savefig("demo.pdf")
def validate_one(self, object, lines): object_beg = lines[0].object_beg assert object_beg == 1, \ "object %s must start at 1 (instead of %d)" % \ (object, object_beg) for a, b in pairwise(lines): assert b.object_beg - a.object_end == 1, \ "lines not continuous coords between:\n%s\n%s" % \ (a, b)
def compute_all_gaps(self, minsize=100, maxsize=500000, verbose=False): self.gapsizes = [] for (a, b), gappos in zip(pairwise(self.scaffolds), self.pp): gapsize = self.compute_one_gap(a, b, gappos, minsize, maxsize, verbose=verbose) self.gapsizes.append(gapsize)
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names for adjacent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    p.add_option("-r", dest="rclip", default=1, type="int",
                 help="pair ID is derived from rstrip N chars [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    N = opts.rclip

    def strip_name(x):
        # With N == 0 the name is compared as is (x[:-0] would be empty)
        return x[:-N] if N else x

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")
    try:
        fh_iter = iter_fastq(fastqfile, key=strip_name)
        a = None  # stays None when the input holds no record at all
        skipflag = False  # controls the iterator skip
        for a, b in pairwise(fh_iter):
            if b is None:  # hit the eof
                break

            if skipflag:
                # `a` was already written as the second read of a pair
                skipflag = False
                continue

            if a.id == b.id:
                pairsfw.write(str(a) + "\n")
                pairsfw.write(str(b) + "\n")
                skipflag = True
            else:
                fragsfw.write(str(a) + "\n")

        # don't forget the last one, when b is None
        if a is not None and not skipflag:
            fragsfw.write(str(a) + "\n")
    finally:
        # Closing matters for the .gz outputs, which are finalized on close
        fragsfw.close()
        pairsfw.close()

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
def sequence_to_graph(G, seq, color='black'): """ Automatically construct graph given a sequence of characters. """ for x in seq: if x.endswith("_1"): # Mutation G.node(x, color=color, width="0.1", shape="circle", label="") else: G.node(x, color=color) for a, b in pairwise(seq): G.edge(a, b, color=color)
# draw(): walks the consecutive pairs (a, b) of self.ar together with one entry
# of self.wiggles and one of self.colors; when the color is truthy a Rectangle
# of width b - a and height self.height is appended at x = a + pads,
# y = self.ystart + w / self.wiggle. self.add_patches() is called at the end.
# NOTE(review): the line structure of this block was lost, so it cannot be told
# from here whether `pads += pad` runs for every segment or only for colored
# ones -- confirm against the original file before reformatting.
def draw(self): ar = self.ar pad = self.pad pads = 0 for (a, b), w, color in zip(pairwise(ar), self.wiggles, self.colors): yf = self.ystart + w * 1. / self.wiggle if color: p = Rectangle((a + pads, yf), b - a, self.height, color=color) self.append(p) pads += pad self.add_patches()
def from_silico(self, filename="Ecoli.silico", nfrags=25): fp = open(filename) next(fp) ar = [0] + [int(x) for x in fp.next().split()] sizes = [] # Only retain frags beyond certain size for a, b in pairwise(ar): size = b - a if size < max(ar[:nfrags]) / 100: continue sizes.append(size) sizes = [choice(sizes) for x in xrange(nfrags)] return sizes
def graph(self):
    """
    Build a BiGraph from the scaffolds: neighbouring lines become an edge
    (carrying both orientations and the gap length of the left line), a
    scaffold with a single line becomes a lone node. self.scf is updated with
    the list of tigs of every scaffold along the way.
    """
    g = BiGraph()
    for scaffold, lines in self.iter_scaffold():
        self.scf[scaffold] = [x.tig for x in lines]

        if len(lines) == 1:  # Singleton scaffold
            g.add_node(lines[0].tig)
            continue

        for left, right in pairwise(lines):
            g.add_edge(left.tig, right.tig, left.o, right.o, length=left.gaps)

    return g
def make_paths(paths, weights=None):
    """
    Zip together paths. Called by merge_paths().

    Every consecutive pair of every path is passed to update_weight() on a
    DiGraph, with the weight of its path (1 when no weights are given).
    """
    npaths = len(paths)
    if not weights:
        weights = [1] * npaths
    assert npaths == len(weights)

    G = nx.DiGraph()
    for route, weight in zip(paths, weights):
        for src, dst in pairwise(route):
            update_weight(G, src, dst, weight)
    return G
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)

    def orient(hit):
        # ">" for a forward hit, "<" otherwise
        return ">" if hit.orientation == "+" else "<"

    g = BiGraph()
    for _, hits in groupby(blast, key=lambda x: x.query):
        hits = list(hits)
        # Either every pair of hits on the query, or only neighbouring hits
        pairs = combinations(hits, 2) if opts.clique else pairwise(hits)
        for a, b in pairs:
            if a.subject == b.subject:
                continue

            dist, _ = range_distance(
                (a.query, a.qstart, a.qstop, "+"),
                (b.query, b.qstart, b.qstop, "+"),
                distmode="ee",
            )
            if dist > opts.maxdist:
                continue

            g.add_edge(BiEdge(a.subject, b.subject, orient(a), orient(b)))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
def merge_paths(paths):
    """
    Zip together sorted lists.

    The consecutive pairs of all paths form a directed graph, whose
    topological order is returned.

    >>> paths = [[1, 2, 3], [1, 3, 4], [2, 4, 5]]
    >>> merge_paths(paths)
    [1, 2, 3, 4, 5]
    """
    from jcvi.utils.iter import pairwise

    edges = [pair for route in paths for pair in pairwise(route)]
    return topological_sort(nx.DiGraph(edges))
def subtourelim(model, where):
    """
    Solver callback: when called with where == GRB.callback.MIPSOL, look for
    a subtour among the selected edges and, unless the tour already covers all
    n nodes, add a lazy constraint that forbids it.
    """
    if where != GRB.callback.MIPSOL:
        return

    # make a list of edges selected in the solution
    sol = model.cbGetSolution([model._vars[i] for i in range(nedges)])
    chosen = (edges[i] for i, x in enumerate(sol) if x > .5)
    selected = [(idx[a], idx[b]) for a, b, w in chosen]

    # find the shortest cycle in the selected edge list
    tour = subtour(selected)
    if len(tour) == n:
        return

    # add a subtour elimination constraint
    closed = tour + [tour[0]]
    incident = [edge_store[a, b] for a, b in pairwise(closed)]
    model.cbLazy(quicksum(model._vars[x] for x in incident) <= len(tour) - 1)
def path_to_agp(g, path, object, sizes, status):
    """
    Turn a path of (node, orientation) pairs into AGP lines: one component
    line per node, with a gap line (length taken from the connecting edge)
    between neighbours.
    """
    lines = []
    for (a, ao), (b, _) in pairwise(path):
        orientation = get_orientation(ao, status)
        edge = g.get_edge(a.v, b.v)
        lines += [
            AGPLine.cline(object, a.v, sizes, orientation),
            AGPLine.gline(object, edge.length),
        ]

    # Do not forget the last one
    z, zo = path[-1]
    lines.append(AGPLine.cline(object, z.v, sizes, get_orientation(zo, status)))

    return lines
def breakpoint(args):
    """
    %prog breakpoint mstmap.input > breakpoints.bed

    Find scaffold breakpoints using genetic map. Use variation.vcf.mstmap() to
    generate the input for this routine.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--diff", default=0.1, type="float",
                 help="Maximum ratio of differences allowed")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (mstmap,) = args
    diff = opts.diff
    data = MSTMap(mstmap)

    # Remove singleton markers (avoid double cross-over)
    good = []
    nsingletons = 0
    for i in range(1, len(data) - 1):
        a = data[i]
        left_label, _ = check_markers(data[i - 1], a, diff)
        right_label, _ = check_markers(a, data[i + 1], diff)

        # A marker that breaks with both neighbours is a singleton
        is_singleton = left_label == BREAK and right_label == BREAK
        if is_singleton:
            nsingletons += 1
        else:
            good.append(a)

    logging.debug("A total of {0} singleton markers removed.".format(nsingletons))

    for a, b in pairwise(good):
        label, rr = check_markers(a, b, diff)
        if label == BREAK:
            print("\t".join(str(x) for x in rr))
def range_depth(ranges, size, verbose=True):
    """
    Overlay ranges on [start, end], and summarize the ploidy of the intervals.

    ranges: iterable of (start, end) pairs, all within [0, size)
    size: total length that the ranges are laid on
    verbose: when True, print the share of every depth to stderr

    Returns (depthstore, depthdetails): depthstore maps depth => number of
    positions at that depth, depthdetails lists (start, end, depth) segments
    from 0 to size.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.utils.cbook import percentage

    # Make endpoints
    endpoints = []
    for a, b in ranges:
        endpoints.append((a, LEFT))
        endpoints.append((b, RIGHT))
    endpoints.sort()
    # The list is sorted, so both extremes are at its ends
    vstart, vend = endpoints[0][0], endpoints[-1][0]

    assert 0 <= vstart < size
    assert 0 <= vend < size

    depth = 0
    depthstore = defaultdict(int)
    depthstore[depth] += vstart
    depthdetails = [(0, vstart, depth)]

    for (a, atag), (b, btag) in pairwise(endpoints):
        if atag == LEFT:
            depth += 1
        elif atag == RIGHT:
            depth -= 1
        depthstore[depth] += b - a
        depthdetails.append((a, b, depth))

    # The very last endpoint closes the last open range
    assert btag == RIGHT
    depth -= 1

    assert depth == 0
    depthstore[depth] += size - vend
    depthdetails.append((vend, size, depth))

    assert sum(depthstore.values()) == size
    if verbose:
        for depth, count in sorted(depthstore.items()):
            # write() rather than `print >>`, which is a syntax error on Python 3
            sys.stderr.write("Depth {0}: {1}\n".
                             format(depth, percentage(count, size)))

    return depthstore, depthdetails
def range_interleave(ranges, sizes=None, empty=False):
    """
    Returns the ranges in between the given ranges.

    sizes: optional mapping of sequence id => length; for sequences listed
        there, the gap before the first range and after the last range is
        reported as well
    empty: when True, None is appended in place of every gap that is empty

    >>> ranges = [("1", 30, 40), ("1", 45, 50), ("1", 10, 30)]
    >>> range_interleave(ranges)
    [('1', 41, 44)]
    >>> ranges = [("1", 30, 40), ("1", 42, 50)]
    >>> range_interleave(ranges)
    [('1', 41, 41)]
    >>> range_interleave(ranges, sizes={"1": 70})
    [('1', 1, 29), ('1', 41, 41), ('1', 51, 70)]
    """
    from jcvi.utils.iter import pairwise

    # None as default instead of a dict literal, which would be one object
    # shared by every call
    if sizes is None:
        sizes = {}

    ranges = range_merge(ranges)
    interleaved_ranges = []

    for ch, cranges in groupby(ranges, key=lambda x: x[0]):
        cranges = list(cranges)
        size = sizes.get(ch, None)

        if size:
            # Gap between the start of the sequence and the first range
            ch, astart, aend = cranges[0]
            if astart > 1:
                interleaved_ranges.append((ch, 1, astart - 1))
            elif empty:
                interleaved_ranges.append(None)

        for a, b in pairwise(cranges):
            ch, astart, aend = a
            ch, bstart, bend = b
            istart, iend = aend + 1, bstart - 1
            if istart <= iend:
                interleaved_ranges.append((ch, istart, iend))
            elif empty:
                interleaved_ranges.append(None)

        if size:
            # Gap between the last range and the end of the sequence
            ch, astart, aend = cranges[-1]
            if aend < size:
                interleaved_ranges.append((ch, aend + 1, size))
            elif empty:
                interleaved_ranges.append(None)

    return interleaved_ranges
def write_agp(self, filename):
    """
    Write all scaffolds to `filename` in AGP format, reindex the file in place
    and return its name.

    Within a scaffold every line but the last yields a component line followed
    by a gap line (sized by the gaps of that line); the last line yields a
    component line only.
    """
    sizes = self.sz
    agp = []
    for scaffold, lines in self.iter_scaffold():
        for a, b in pairwise(lines):
            cline = AGPLine.cline(scaffold, a.tig, sizes, a.oo)
            gline = AGPLine.gline(scaffold, a.gaps)
            agp.append(cline)
            agp.append(gline)
        # The last component has no gap after it
        a = lines[-1]
        cline = AGPLine.cline(scaffold, a.tig, sizes, a.oo)
        agp.append(cline)

    # Context manager: the handle is closed even if writing a line fails;
    # write() works on Python 2 and 3 alike
    with open(filename, "w") as fw:
        for a in agp:
            fw.write(str(a) + "\n")

    reindex([filename, "--inplace"])
    return filename
def breakpoint(args):
    """
    %prog breakpoint mstmap.input > breakpoints.bed

    Find scaffold breakpoints using genetic map. Use variation.vcf.mstmap() to
    generate the input for this routine.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--diff", default=.1, type="float",
                 help="Maximum ratio of differences allowed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    diff = opts.diff
    data = MSTMap(mstmap)

    # Remove singleton markers (avoid double cross-over)
    good = []
    nsingletons = 0
    # range() instead of xrange(), which does not exist on Python 3
    for i in range(1, len(data) - 1):
        a = data[i]
        left_label, left_rr = check_markers(data[i - 1], a, diff)
        right_label, right_rr = check_markers(a, data[i + 1], diff)

        # A marker that breaks with both neighbours is a singleton
        if left_label == BREAK and right_label == BREAK:
            nsingletons += 1
            continue

        good.append(a)

    logging.debug("A total of {0} singleton markers removed.".format(nsingletons))

    for a, b in pairwise(good):
        label, rr = check_markers(a, b, diff)
        if label == BREAK:
            # Single-argument call form prints the same on Python 2 and 3
            print("\t".join(str(x) for x in rr))
def write_PAD_bed(bedfile, prefix, pads, bed): fw = open(bedfile, "w") padnames = ["{0}:{1:05d}-{2:05d}".format(prefix, a, b) for a, b in pads] for a, b in pairwise(padnames): assert a != b, a j = 0 # Assign all genes to new partitions for i, x in enumerate(bed): a, b = pads[j] if i > b: j += 1 a, b = pads[j] print("\t".join((padnames[j], str(i), str(i + 1), x.accn)), file=fw) fw.close() npads = len(pads) logging.debug("{0} partition written in `{1}`.".format(npads, bedfile)) return npads, padnames
def happy_edges(row, prefix=None): """ Convert a row in HAPPY file and yield edges. """ trans = maketrans("[](){}", " ") row = row.strip().strip("+") row = row.translate(trans) scfs = [x.strip("+") for x in row.split(":")] for a, b in pairwise(scfs): oa = '<' if a.strip()[0] == '-' else '>' ob = '<' if b.strip()[0] == '-' else '>' is_uncertain = a[-1] == ' ' or b[0] == ' ' a = a.strip().strip('-') b = b.strip().strip('-') if prefix: a = prefix + a b = prefix + b yield (a, b, oa, ob), is_uncertain
def happy_edges(row, prefix=None): """ Convert a row in HAPPY file and yield edges. """ trans = str.maketrans("[](){}", " ") row = row.strip().strip("+") row = row.translate(trans) scfs = [x.strip("+") for x in row.split(":")] for a, b in pairwise(scfs): oa = "<" if a.strip()[0] == "-" else ">" ob = "<" if b.strip()[0] == "-" else ">" is_uncertain = a[-1] == " " or b[0] == " " a = a.strip().strip("-") b = b.strip().strip("-") if prefix: a = prefix + a b = prefix + b yield (a, b, oa, ob), is_uncertain
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        # The accn field holds a second range, compared the same way
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        if om_dist <= 0 and ch_dist <= 0:
            continue

        # Single-argument print calls behave the same on Python 2 and 3
        print(a)
        print(b)
        print(" ".join(str(x) for x in (om_dist, ch_dist)))
def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct adjacency graph for graphviz. The file may look like sample below.
    The lines with numbers are chromosomes with gene order information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    import pygraphviz as pgv
    from jcvi.utils.iter import pairwise
    from jcvi.formats.base import SetFile

    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    subgraph = SetFile(subgraph)
    subgraph = set(x.strip("-") for x in subgraph)

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    palette = ("green", "magenta", "tomato", "peachpuff")

    genome_id = -1
    key = 0
    # Context manager so the input is closed once it has been read
    with open(infile) as fp:
        for row in fp:
            if row.strip() == "":
                continue

            atoms = row.split()
            tag = atoms[0]
            if tag in ("ChrNumber", "chr"):
                continue

            if tag == "genome":
                # One palette entry per genome, in order of appearance
                genome_id += 1
                gcolor = palette[genome_id]
                continue

            # Every gene contributes its two ends, in strand order
            nodeseq = []
            for atom in atoms:
                gene = atom.strip("-")
                nodeL, nodeR = gene + "L", gene + "R"
                if atom[0] == "-":  # negative strand
                    nodeseq += [nodeR, nodeL]
                else:
                    nodeseq += [nodeL, nodeR]

            for a, b in pairwise(nodeseq):
                G.add_edge(a, b, key, color=gcolor)
                key += 1

                na, nb = a[:-1], b[:-1]
                if na not in subgraph and nb not in subgraph:
                    continue

                SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    with open("graph.dot", "w") as fw:
        G.write(fw)

    with open("subgraph.dot", "w") as fw:
        SG.write(fw)
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile. Use --shrink to control
    the exact boundaries where you cut.
    """
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args

    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")

    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        _, line = simple_agp[component]
        object = line.object
        component_span = line.component_span
        orientation = line.orientation

        assert line.component_beg, line.component_end
        cuts = set()
        for iv in intervals:
            start, end = iv.start, iv.end
            end -= 1

            assert start <= end
            cuts.add(start)
            cuts.add(end)

        # Both ends of the component are cut points as well
        cuts.add(0)
        cuts.add(component_span)
        cuts = sorted(cuts)

        sum_of_spans = 0
        for i, (beg, end) in enumerate(pairwise(cuts)):
            oid = object + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = end - beg
            aline += ['D', component, beg + 1, end, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        # The pieces must add up to the component that was cut
        assert component_span == sum_of_spans

    # Finally write the masked agp. write() works on Python 2 and 3 alike,
    # and the context manager closes the handle even on error.
    with open(newagpfile, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write(str(a) + "\n")

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def tsp(edges, constraint_generation=False):
    """
    Calculates shortest cycle that traverses each node exactly once. Also known
    as the Traveling Salesman Problem (TSP).

    With `constraint_generation` the relaxed problem is solved repeatedly and
    the subtours found are added as constraints; otherwise ordering
    constraints on per-node step variables are written up front.

    Returns the sorted list of selected edges, or None when the solver selects
    nothing.
    """
    edges = populate_edge_weights(edges)
    incoming, outgoing, nodes = node_to_edge(edges)

    nedges, nnodes = len(edges), len(nodes)
    L = LPInstance()

    L.add_objective(edges, objective=MINIMIZE)
    balance = []
    # For each node, select exactly 1 incoming and 1 outgoing edge
    for v in nodes:
        incoming_edges = incoming[v]
        outgoing_edges = outgoing[v]
        icc = summation(incoming_edges)
        occ = summation(outgoing_edges)
        balance.append("{0} = 1".format(icc))
        balance.append("{0} = 1".format(occ))

    # Subtour elimination - Miller-Tucker-Zemlin (MTZ) formulation
    # <http://en.wikipedia.org/wiki/Travelling_salesman_problem>
    # Desrochers and laporte, 1991 (DFJ) has a stronger constraint
    # See also:
    # G. Laporte / The traveling salesman problem: Overview of algorithms
    start_step = nedges + 1
    u0 = nodes[0]
    nodes_to_steps = dict((n, start_step + i) for i, n in enumerate(nodes[1:]))
    edge_store = dict((e[:2], i) for i, e in enumerate(edges))
    mtz = []
    for i, e in enumerate(edges):
        a, b = e[:2]
        if u0 in (a, b):
            continue
        na, nb = nodes_to_steps[a], nodes_to_steps[b]
        con_ab = " x{0} - x{1} + {2}x{3}".format(na, nb, nnodes - 1, i + 1)
        if (b, a) in edge_store:  # This extra term is the stronger DFJ formulation
            j = edge_store[(b, a)]
            con_ab += " + {0}x{1}".format(nnodes - 3, j + 1)
        con_ab += " <= {0}".format(nnodes - 2)
        mtz.append(con_ab)

    # Step variables u_i bound between 1 and n, as additional variables
    # (range() instead of xrange(), which does not exist on Python 3)
    bounds = []
    for i in range(start_step, nedges + nnodes):
        bounds.append(" 1 <= x{0} <= {1}".format(i, nnodes - 1))

    L.add_vars(nedges)

    # Constraint generation seek to find 'cuts' in the LP problem, by solving
    # the relaxed form. The subtours were then incrementally added to the
    # constraints.
    if constraint_generation:
        L.constraints = balance
        subtours = []
        while True:
            selected, obj_val = L.lpsolve()
            results = sorted(x for i, x in enumerate(edges) if i in selected) \
                if selected else None
            if not results:
                break
            G = edges_to_graph(results)
            cycles = list(nx.simple_cycles(G))
            if len(cycles) == 1:
                # A single cycle is a full tour: done
                break
            for c in cycles:
                incident = [edge_store[a, b] for a, b in pairwise(c + [c[0]])]
                icc = summation(incident)
                subtours.append("{0} <= {1}".format(icc, len(incident) - 1))
            L.constraints = balance + subtours
    else:
        L.constraints = balance + mtz
        L.add_vars(nnodes - 1, offset=start_step, binary=False)
        L.bounds = bounds
        selected, obj_val = L.lpsolve()
        results = sorted(x for i, x in enumerate(edges) if i in selected) \
            if selected else None

    return results
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Same exit status on wrong usage as the sibling commands
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fwlog = open(logfile, "w") if opts.log else None

    def log(text=""):
        # One line to the .masklog file; a no-op unless --log was given
        if fwlog is not None:
            fwlog.write(str(text) + "\n")

    try:
        for component, intervals in bed.sub_beds():
            log("\n".join(str(x) for x in intervals))
            _, line = simple_agp[component]
            object = line.object
            component_span = line.component_span
            orientation = line.orientation
            log(line)

            assert line.component_beg, line.component_end
            arange = line.component_beg, line.component_end

            # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
            ivs = []
            for iv in intervals:
                iv = range_intersect(arange, (iv.start, iv.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` as well as the arange
            arange = line.component_beg - 1, line.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))
            # reverse if component on negative strand
            if orientation == '-':
                endpoints.reverse()
            reverse = orientation == '-'
            # Any orientation other than '+' / '-' is written out as '+'
            if orientation not in ('+', '-'):
                orientation = '+'

            sum_of_spans = 0
            # assign complements as sequence components
            for i, (beg, end) in enumerate(pairwise(endpoints)):
                if reverse:
                    beg, end = end, beg

                # `//` keeps the piece number an integer on Python 3 as well
                oid = object + "_{0}".format(i // 2) if opts.split else object
                aline = [oid, 0, 0, 0]
                if i % 2 == 0:
                    # Even slots lie between masked ranges: sequence
                    cspan = end - beg - 1
                    aline += ['D', component, beg + 1, end - 1, orientation]
                    is_gap = False
                else:
                    # Odd slots are the masked ranges themselves: gaps
                    cspan = end - beg + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan
                aline = "\t".join(str(x) for x in aline)
                # With --split the pieces become separate objects, so the gaps
                # between them are left out
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                log(aline)

            # Pieces and gaps together must cover the whole component
            assert component_span == sum_of_spans
            log()
    finally:
        if fwlog is not None:
            fwlog.close()

    # Finally write the masked agp
    with open(newagpfile, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write(str(a) + "\n")

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Either every pair of hits on the query, or only neighbouring hits
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    logging.debug(str(g))

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print calls behave the same on Python 2 and 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    # Context manager: the AGP handle is closed even if a path fails to convert
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded |= set(ctg for ctg, strand in ctgorder)
            object = "pmol_{0:04d}".format(i)
            order_to_agp(object, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        nsingletons = 0
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            ctgorder = [(ctg, "+")]
            object = ctg
            order_to_agp(object, ctgorder, sizes.mapping, fwagp)
            nsingletons += 1
        logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    logging.debug("AGP file written to `{0}`.".format(agpfile))
def links(self): r = [] for s, sb in self.sub_beds(): for a, b in pairwise(sb): r.append(((a.accn, a.strand), (b.accn, b.strand))) return r
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so maintainer documentation lives in comments, not in the docstring.
    #
    # args: list of command-line tokens -- bed file, mates file, FASTA file
    #       and any of --prefix / --minlinks.
    # Side effects: writes "<mates prefix>.log" (per-scaffold trace) and
    #       "<mates prefix>.candidate.bed" (one line per placed scaffold).
    #       Prints the help and exits through sys.exit() unless exactly three
    #       positional arguments are given.
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    prefix = opts.prefix
    minlinks = opts.minlinks

    # The context manager closes the log even if a later step raises.
    with open(logfile, "w") as log:

        def note(item=""):
            # One log line: str() of the item followed by a newline. This is
            # valid on both Python 2 and Python 3.
            log.write(str(item) + "\n")

        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        is_unplaced = lambda x: x.startswith(prefix)
        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)

        # Keep adjacent bed lines that are mates of each other and where
        # exactly one of the two sits on an unplaced scaffold.
        for a, b in pairwise(bed):
            aname, bname = a.accn, b.accn
            aseqid, bseqid = a.seqid, b.seqid

            if aname not in mf:
                continue

            pa, _ = mf[aname]
            if pa != bname:
                continue

            ia = is_unplaced(aseqid)
            ib = is_unplaced(bseqid)
            if ia == ib:
                continue

            # Normalise so `a` is the placed end and `b` the unplaced end.
            if ia:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)
        candidatebed = Bed()
        cbeds = []
        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            note()
            ranges = []
            # Length of the unplaced scaffold; the same for every pair below.
            scf_len = sizes.get_size(scf)
            for a, b in pairs:
                aname, astrand = a.accn, a.strand
                bstrand = b.strand
                aseqid = a.seqid
                _, lib = mf[aname]

                # Log both lines BEFORE b is possibly reverse-complemented.
                note(a)
                note(b)

                flip_b = (astrand == bstrand)
                fbstrand = '-' if flip_b else '+'
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                assert astrand in ('+', '-')
                if astrand == '+':
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + scf_len
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                note("*" + "\t".join(str(x) for x in start_range))
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            note(alldepths)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                note("Insufficient links ({0} < {1})".format(maxdepth, minlinks))
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            msg = "Multiple conflicting candidates found"
            if nseqids != 1:
                note(msg)
                continue

            seqid, mmin, mmax, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            # Best-supported intervals spread wider than the largest library
            # separation are treated as conflicting.
            if (mmax - mmin) > maxdist:
                note(msg)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _sf, _sc, strand in ranges:
                brange = (sid, start, end)
                if range_overlap(arange, brange):
                    if strand == '+':
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus strand.
            fbstrand = '+' if nplus >= nminus else '-'

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            bedline = BedLine("\t".join((str(x) for x in candidate)))
            cbeds.append(bedline)
            note("Plus: {0}, Minus: {1}".format(nplus, nminus))
            note(candidate)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".
                  format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)