def bed2chain(args):
    """Group alignment blocks by chain id and pass each group to ``print_chain``.

    Every non-empty line of ``args.fi`` is whitespace-split and its first
    eight fields are read as::

        tName tStart tEnd strand qName qStart qEnd chainId

    Consecutive lines sharing a chain id are collected into one list of
    ``[tStart, tEnd, qStart, qEnd]`` blocks. Whenever the id changes, and once
    more at the end of the input, the collected blocks are handed to
    ``print_chain`` together with the target and query sequence sizes looked
    up in the size tables built from ``args.tsize`` and ``args.qsize``.

    Input without any record produces no output at all (previously the final
    flush ran unconditionally with empty names).

    Raises:
        AssertionError: lines of one chain disagree on target name, query
            name or strand.
        ValueError: a line has fewer than eight fields or a coordinate is
            not an integer.
    """
    from maize.formats.sizes import Sizes

    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)

    def flush(header, blocks):
        # Emit one finished chain along with its target/query sequence sizes.
        cid, tName, qName, srd = header
        print_chain(cid, tName, qName, srd,
                    tdic.get_size(tName), qdic.get_size(qName), blocks)

    header = None  # (cid, tName, qName, srd) of the chain being collected
    locs = []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        block = [int(tStart), int(tEnd), int(qStart), int(qEnd)]

        if header is not None and cid == header[0]:
            # Same chain as the previous line: it must describe the same
            # target/query/strand before the block is accepted.
            assert (tName, qName, srd) == header[1:], "inconsistent info in chain"
            locs.append(block)
            continue

        # Chain id changed: flush the finished chain (if any), start a new one.
        if header is not None:
            flush(header, locs)
        header = (cid, tName, qName, srd)
        locs = [block]

    # Only flush the trailing chain when at least one record was read.
    if header is not None:
        flush(header, locs)
def bed2chain(args):
    """Group alignment blocks by chain id and pass each group to ``print_chain``.

    Every non-empty line of ``args.fi`` is whitespace-split and its first
    eight fields are read as::

        tName tStart tEnd strand qName qStart qEnd chainId

    Consecutive lines sharing a chain id are collected into one list of
    ``[tStart, tEnd, qStart, qEnd]`` blocks. Whenever the id changes, and once
    more at the end of the input, the collected blocks are handed to
    ``print_chain`` together with the target and query sequence sizes looked
    up in the size tables built from ``args.tsize`` and ``args.qsize``.

    Input without any record produces no output at all (previously the final
    flush ran unconditionally with empty names).

    Raises:
        AssertionError: lines of one chain disagree on target name, query
            name or strand.
        ValueError: a line has fewer than eight fields or a coordinate is
            not an integer.
    """
    from jcvi.formats.sizes import Sizes

    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)

    current = None  # (cid, tName, qName, srd) of the chain being collected
    locs = []

    def emit_current():
        # Hand the collected chain to print_chain with both sequence sizes.
        cid0, tName0, qName0, srd0 = current
        print_chain(cid0, tName0, qName0, srd0,
                    tdic.get_size(tName0), qdic.get_size(qName0), locs)

    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        tStart, tEnd, qStart, qEnd = int(tStart), int(tEnd), int(qStart), int(qEnd)

        if current is None:
            # Very first record: open the first chain.
            current = (cid, tName, qName, srd)
            locs = [[tStart, tEnd, qStart, qEnd]]
        elif cid == current[0]:
            # Continuation of the open chain; its descriptors must not change.
            assert (tName, qName, srd) == current[1:], "inconsistent info in chain"
            locs.append([tStart, tEnd, qStart, qEnd])
        else:
            # A new chain id closes the open chain.
            emit_current()
            current = (cid, tName, qName, srd)
            locs = [[tStart, tEnd, qStart, qEnd]]

    # Skip the trailing flush when the input contained no records.
    if current is not None:
        emit_current()
def main():
    """Command-line entry point: draw the coverage tracks of one sequence.

    Expects exactly three positional arguments (sequence name, sizes file,
    data directory); otherwise the help text is printed and the process
    exits. The figure is saved as ``<sequence name>.<format>`` using the image
    options returned by ``set_image_options``.
    """
    p = OptionParser(__doc__)
    p.add_option("--order", help="The order to plot the tracks, comma-separated")
    opts, args, iopts = p.set_image_options()

    if len(args) != 3:
        sys.exit(not p.print_help())

    # `seqid` rather than `chr`, which would shadow the builtin.
    seqid, sizes, datadir = args
    order = opts.order
    # NOTE(review): --hlsuffix is not registered in this function; presumably
    # set_image_options() or the parser provides it — confirm.
    hlsuffix = opts.hlsuffix
    if order:
        order = order.split(",")
    sizes = Sizes(sizes)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    canvas = (.12, .35, .8, .35)
    seq_size = sizes.get_size(seqid)
    # Coverage is instantiated for its effect on the figure; the instance
    # itself is not needed afterwards.
    Coverage(fig, root, canvas, seqid, (0, seq_size), datadir,
             order=order, hlsuffix=hlsuffix)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def main():
    """Parse the command line and render the coverage plot for one sequence.

    Three positional arguments are required: the sequence name, the sizes
    file and the data directory. With any other argument count the help text
    is shown and the process exits. The image is written to
    ``<sequence name>.<format>``.
    """
    parser = OptionParser(__doc__)
    parser.add_option("--order", help="The order to plot the tracks, comma-separated")
    opts, args, iopts = parser.set_image_options()

    if len(args) != 3:
        sys.exit(not parser.print_help())

    seq_name, sizes_path, datadir = args

    track_order = opts.order
    highlight_suffix = opts.hlsuffix
    if track_order:
        # Comma-separated option value -> list of track names
        track_order = track_order.split(",")

    size_table = Sizes(sizes_path)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    canvas = (.12, .35, .8, .35)
    extent = (0, size_table.get_size(seq_name))
    Coverage(
        fig,
        root,
        canvas,
        seq_name,
        extent,
        datadir,
        order=track_order,
        hlsuffix=highlight_suffix,
    )

    # The root axes only serves as a unit-square drawing surface.
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(seq_name + "." + iopts.format, dpi=iopts.dpi, iopts=iopts)
def coordT(args):
    """Shift target coordinates of records whose target is named ``name-start-end``.

    Lines of ``args.fi`` whose first character is not a digit are skipped;
    the others are parsed with ``PslLine``. When the target name splits on
    ``-`` into exactly three parts, the record is rewritten in place:

    * ``tName`` becomes the first part,
    * ``tStart``, ``tEnd`` and every entry of ``tStarts`` are shifted by
      ``start - 1``,
    * ``tSize`` becomes the size of the renamed sequence from ``args.fs``.

    Raises:
        AssertionError: ``end - start + 1`` differs from the record's
            ``tSize``.
        ValueError: the second or third name part is not an integer.
    """
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        # Data lines start with a digit; anything else is skipped.
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        tnames = p.tName.split("-")
        if len(tnames) == 3:
            p.tName, tosStart, tosEnd = tnames[0], int(tnames[1]), int(tnames[2])
            # The sub-interval named in the id must span the recorded size.
            assert tosEnd - tosStart + 1 == p.tSize
            cSize = sizes.get_size(p.tName)
            shift = tosStart - 1
            p.tStart += shift
            p.tEnd += shift
            p.tStarts = [start + shift for start in p.tStarts]
            p.tSize = cSize
        # NOTE(review): printed for every parsed record, converted or not —
        # confirm this matches the intended nesting.
        print(str(p))
def coordQ(args):
    """Shift query coordinates of records whose query is named ``name-start-end``.

    Lines of ``args.fi`` whose first character is not a digit are skipped;
    the others are parsed with ``PslLine``. When the query name splits on
    ``-`` into exactly three parts, the record is rewritten in place:

    * ``qName`` becomes the first part,
    * ``qStart`` and ``qEnd`` are shifted by ``start - 1``,
    * every entry of ``qStarts`` is shifted by ``size - end`` when
      ``qstrand`` is ``"-"`` and by ``start - 1`` otherwise, where ``size``
      is the renamed sequence's size from ``args.fs``,
    * ``qSize`` becomes that size.

    Raises:
        AssertionError: ``end - start + 1`` differs from the record's
            ``qSize``.
        ValueError: the second or third name part is not an integer.
    """
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        # Data lines start with a digit; anything else is skipped.
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        qnames = p.qName.split("-")
        if len(qnames) == 3:
            p.qName, qosStart, qosEnd = qnames[0], int(qnames[1]), int(qnames[2])
            # The sub-interval named in the id must span the recorded size.
            assert qosEnd - qosStart + 1 == p.qSize
            cSize = sizes.get_size(p.qName)
            p.qStart += qosStart - 1
            p.qEnd += qosStart - 1
            # Block starts use a strand-dependent offset.
            if p.qstrand == "-":
                block_shift = cSize - qosEnd
            else:
                block_shift = qosStart - 1
            p.qStarts = [start + block_shift for start in p.qStarts]
            p.qSize = cSize
        # NOTE(review): printed for every parsed record, converted or not —
        # confirm this matches the intended nesting.
        print(str(p))
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so
    # implementation notes live in comments instead.
    #
    # Outline:
    #   1. Scan adjacent BED records; keep pairs that are mates (per the mates
    #      file) where exactly one side sits on a sequence whose id starts
    #      with --prefix. Pairs are stored as (other side, prefixed side),
    #      keyed by the prefixed sequence id.
    #   2. For each such sequence, turn every pair into a candidate interval
    #      on the other sequence, pick the interval with the highest depth,
    #      reject it when support is below --minlinks, ambiguous, empty or
    #      wider than the largest library `max`, then vote on orientation.
    #   3. Write the accepted placements to <prefix-of-mates-file>.candidate.bed
    #      and a trace of all decisions to <prefix-of-mates-file>.log.
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"

    # The context manager guarantees the log is closed even when parsing or
    # placement raises part-way through.
    with open(logfile, "w") as log:
        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        prefix = opts.prefix
        minlinks = opts.minlinks

        is_unplaced = lambda x: x.startswith(prefix)
        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)

        for a, b in pairwise(bed):
            aname = a.accn
            if aname not in mf:
                continue

            mate, _ = mf[aname]
            if mate != b.accn:
                continue

            ia = is_unplaced(a.seqid)
            ib = is_unplaced(b.seqid)
            # Only pairs bridging a prefixed and a non-prefixed sequence help
            if ia == ib:
                continue

            # Normalise so that `b` is always the prefixed (unplaced) side
            if ia:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)
        candidatebed = Bed()
        cbeds = []
        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            print(file=log)
            ranges = []
            for a, b in pairs:
                aseqid, astrand = a.seqid, a.strand
                _, lib = mf[a.accn]

                print(a, file=log)
                print(b, file=log)

                flip_b = astrand == b.strand
                fbstrand = "-" if flip_b else "+"
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                L = sizes.get_size(scf)
                assert astrand in ("+", "-")
                if astrand == "+":
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + L
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                print("*" + "\t".join(str(x) for x in start_range), file=log)
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            print(alldepths, file=log)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                print(
                    "Insufficient links ({0} < {1})".format(maxdepth, minlinks),
                    file=log,
                )
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            if nseqids != 1:
                print("Multiple conflicting candidates found", file=log)
                continue

            seqid, _, _, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            if mmin >= mmax:
                print("Invalid (min, max) range", file=log)
                continue

            if (mmax - mmin) > maxdist:
                print("(min, max) distance greater than library maxdist", file=log)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _, _, vote in ranges:
                brange = (sid, start, end)
                if range_overlap(arange, brange):
                    if vote == "+":
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus strand
            fbstrand = "+" if nplus >= nminus else "-"

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            bedline = BedLine("\t".join((str(x) for x in candidate)))
            cbeds.append(bedline)
            print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
            print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so
    # implementation notes live in comments instead.
    #
    # Outline:
    #   1. Scan adjacent BED records; keep pairs that are mates (per the mates
    #      file) where exactly one side sits on a sequence whose id starts
    #      with --prefix. Pairs are stored as (other side, prefixed side),
    #      keyed by the prefixed sequence id.
    #   2. For each such sequence, turn every pair into a candidate interval
    #      on the other sequence, pick the interval with the highest depth,
    #      reject it when support is below --minlinks, ambiguous, empty or
    #      wider than the largest library `max`, then vote on orientation.
    #   3. Write the accepted placements to <prefix-of-mates-file>.candidate.bed
    #      and a trace of all decisions to <prefix-of-mates-file>.log.
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"

    # The context manager guarantees the log is closed even when parsing or
    # placement raises part-way through.
    with open(logfile, "w") as log:
        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        prefix = opts.prefix
        minlinks = opts.minlinks

        is_unplaced = lambda x: x.startswith(prefix)
        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)

        for a, b in pairwise(bed):
            aname = a.accn
            if aname not in mf:
                continue

            mate, _ = mf[aname]
            if mate != b.accn:
                continue

            ia = is_unplaced(a.seqid)
            ib = is_unplaced(b.seqid)
            # Only pairs bridging a prefixed and a non-prefixed sequence help
            if ia == ib:
                continue

            # Normalise so that `b` is always the prefixed (unplaced) side
            if ia:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)
        candidatebed = Bed()
        cbeds = []
        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            print(file=log)
            ranges = []
            for a, b in pairs:
                aseqid, astrand = a.seqid, a.strand
                _, lib = mf[a.accn]

                print(a, file=log)
                print(b, file=log)

                flip_b = (astrand == b.strand)
                fbstrand = '-' if flip_b else '+'
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                L = sizes.get_size(scf)
                assert astrand in ('+', '-')
                if astrand == '+':
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + L
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                print("*" + "\t".join(str(x) for x in start_range), file=log)
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            print(alldepths, file=log)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                print("Insufficient links ({0} < {1})".format(maxdepth, minlinks),
                      file=log)
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            if nseqids != 1:
                print("Multiple conflicting candidates found", file=log)
                continue

            seqid, _, _, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            # An empty or inverted interval cannot be a placement
            if mmin >= mmax:
                print("Invalid (min, max) range", file=log)
                continue

            # Each rejection gets its own message so the log states the
            # actual reason for skipping the scaffold.
            if (mmax - mmin) > maxdist:
                print("(min, max) distance greater than library maxdist", file=log)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _, _, vote in ranges:
                brange = (sid, start, end)
                if range_overlap(arange, brange):
                    if vote == '+':
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus strand
            fbstrand = '+' if nplus >= nminus else '-'

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            bedline = BedLine("\t".join((str(x) for x in candidate)))
            cbeds.append(bedline)
            print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
            print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)