def get_2D_overlap(chain, eclusters):
    """
    Group clusters whose x-ranges and y-ranges both overlap.

    Uses a sweep line over the x-axis instead of comparing all pairs:
    1. collect the left and right x-end of every cluster and sort them
    2. on a left end, add the cluster to the `active` set and test its
       y-range against every active cluster
    3. on a right end, drop the cluster from the `active` set

    `eclusters` is a sequence of (range_x, range_y, score) triples, where
    range_x unpacks as (seqid, left, right). `chain` is accepted but not used
    by this function. Returns the Grouper holding the merged cluster indices.
    """
    mergeables = Grouper()

    # One event per range end; the 0/1 flag marks left/right and makes a left
    # end sort ahead of a right end at the same position.
    endpoints = []
    for idx, (range_x, _range_y, _score) in enumerate(eclusters):
        seqid, left, right = range_x
        endpoints.append((seqid, left, 0, idx))
        endpoints.append((seqid, right, 1, idx))
    endpoints.sort()

    active = set()
    previous_seqid = ""
    for seqid, _pos, is_right_end, idx in endpoints:
        # Clusters on a different sequence can never overlap on x
        if seqid != previous_seqid:
            active.clear()
        previous_seqid = seqid

        if is_right_end:
            active.remove(idx)
            continue

        active.add(idx)
        for other in active:
            # check y-overlap
            if range_overlap(eclusters[other][1], eclusters[idx][1]):
                mergeables.join(other, idx)

    return mergeables
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # Outline:
    #   1. walk consecutive BED records and keep mate pairs where exactly one
    #      mate sits on a scaffold whose name starts with --prefix
    #   2. for each such scaffold, turn every pair into a candidate range on
    #      the other sequence using the library's (min, max) separation
    #   3. keep the deepest interval if it has at least --minlinks support,
    #      lies on a single sequence and is no wider than the largest library
    #   4. pick the orientation by majority vote of the supporting pairs
    # Decisions are logged to `<mates prefix>.log`; accepted placements are
    # written to `<mates prefix>.candidate.bed`.
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    prefix = opts.prefix
    minlinks = opts.minlinks

    cbeds = []
    # The context manager closes the log even if parsing or placement raises
    with open(logfile, "w") as log:
        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)
        for a, b in pairwise(bed):
            aname, bname = a.accn, b.accn
            if aname not in mf:
                continue

            pa, _ = mf[aname]
            if pa != bname:
                continue

            ia = a.seqid.startswith(prefix)
            ib = b.seqid.startswith(prefix)
            # Need exactly one mate on an unplaced scaffold
            if ia == ib:
                continue

            # Normalize so that `b` is always the mate on the unplaced scaffold
            if ia:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)

        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            print(file=log)
            scf_size = sizes.get_size(scf)
            ranges = []
            for a, b in pairs:
                aname, astrand = a.accn, a.strand
                bstrand = b.strand
                aseqid = a.seqid
                _, lib = mf[aname]

                print(a, file=log)
                print(b, file=log)

                # Mates on the same strand mean the scaffold must be flipped
                flip_b = astrand == bstrand
                fbstrand = "-" if flip_b else "+"
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max
                assert astrand in ("+", "-")
                if astrand == "+":
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + scf_size
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                print("*" + "\t".join(str(x) for x in start_range), file=log)
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            print(alldepths, file=log)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                print(
                    "Insufficient links ({0} < {1})".format(maxdepth, minlinks),
                    file=log,
                )
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            if nseqids != 1:
                print("Multiple conflicting candidates found", file=log)
                continue

            seqid, _, _, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            if mmin >= mmax:
                print("Invalid (min, max) range", file=log)
                continue

            if (mmax - mmin) > maxdist:
                print("(min, max) distance greater than library maxdist", file=log)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _sf, _sc, strand in ranges:
                if range_overlap(arange, (sid, start, end)):
                    if strand == "+":
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus orientation
            fbstrand = "+" if nplus >= nminus else "-"
            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            cbeds.append(BedLine("\t".join(str(x) for x in candidate)))
            print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
            print(candidate, file=log)

    candidatebed = Bed()
    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def _tag_best_hit(proxy, hits, as_range):
    """Return (tag, range) for the first hit overlapping `proxy`.

    `as_range` converts a hit into the tuple passed to range_overlap. If some
    hit overlaps, the tag is "S" and that hit's range is returned; otherwise
    the tag is "NS" and the range of the first hit is returned.
    """
    for hit in hits:
        hsp = as_range(hit)
        if range_overlap(proxy, hsp):
            return "S", hsp
    return "NS", as_range(hits[0])


def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # Outline:
    #   1. read the two-column blocks file (gene in a, match in b or '.')
    #   2. for every run of '.' rows flanked by matches on the same sequence
    #      and within --gdist genes, derive a proxy region padded by --bdist
    #   3. tag each such gene by its genomic hits: [S] when a hit overlaps
    #      the proxy region, [NS] when hits exist but none overlaps, [NF]
    #      when the gene has no hit at all
    #   4. print one line per gene of the query BED
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = len(args) == 1
    if emptyblast:
        # No genomic hits supplied: use an empty placeholder, removed at the end
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

    blocks = []
    genetrack = {}
    proxytrack = {}
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Collapse consecutive rows sharing the same match into runs
    data = [(key, list(rows))
            for key, rows in groupby(blocks, key=lambda x: x[-1])]

    # Only interior runs have both a preceding and a following neighbor
    for i in range(1, len(data) - 1):
        key, rows = data[i]
        if key != ".":
            continue

        before, _ = data[i - 1]
        after, _ = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, _ in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        hits_by_query = groupby(
            bed, key=lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        )
        as_range = lambda x: (x.seqid, x.start, x.end)
    else:
        blast = Blast(genomicblast)
        hits_by_query = ((gene_name(query), bb)
                         for query, bb in blast.iter_hits())
        as_range = lambda x: (x.subject, x.sstart, x.sstop)

    for query, bb in hits_by_query:
        bb = list(bb)
        if query not in proxytrack:
            continue

        # The proxy region is replaced by the range of the chosen hit
        tag, hsp = _tag_best_hit(proxytrack[query], bb, as_range)
        proxytrack[query] = hsp
        tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            ptag = "[{0}]".format(tags[accn]) if accn in tags else "[NF]"
            target_region = ptag + region_str(proxytrack[accn])

        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
def napus(args):
    """
    %prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation

    Extract napus gene loss vs diploid ancestors. We are looking specifically
    for anything that has the pattern:

        BR - BO    or     BR - BO
        |                       |
        AN                     CN

    Step 1: extract BR - BO syntenic pairs
    Step 2: get diploid gene retention patterns from BR or BO as query
    Step 3: look for if AN or CN is NS(non-syntenic) or NF(not found) and
    specifically with NS, the NS location is actually the homeologous site.
    Step 4: categorize gene losses into singleton, or segmental (defined as
    consecutive losses with a maximum skip of 1
    """
    # Writes two files in the working directory: `quartets` (one BR/BO/AN/CN
    # row per syntenic pair, with a fifth "<label>|<gene>" column on loss
    # candidates) and `quartets.summary` (the losses grouped into singletons
    # and segments).
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(napus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    napusbed, brbo, dpnp = args
    retention = {}
    with open(dpnp) as fp:
        for row in fp:
            seqid, query, hit = row.split()
            retention[query] = hit

    order = Bed(napusbed).order

    quartetsfile = "quartets"
    AL = "AN LOST"
    CL = "CN LOST"
    with open(brbo) as fp, open(quartetsfile, "w") as fw:
        for row in fp:
            br, bo = row.split()
            if "." in (br, bo):
                continue
            an, cn = retention[br], retention[bo]
            if "." in (an, cn):
                continue
            row = "\t".join((br, bo, an, cn))

            # label loss candidates
            antag, anrange = get_tag(an, order)
            cntag, cnrange = get_tag(cn, order)

            if range_overlap(anrange, cnrange):
                if (antag, cntag) == ("NS", None):
                    row = row + "\t{0}|{1}".format(AL, br)
                if (antag, cntag) == (None, "NS"):
                    row = row + "\t{0}|{1}".format(CL, bo)

            print(row, file=fw)

    logging.debug("Quartets and gene losses written to `{0}`.".format(quartetsfile))

    # Parse the quartets file to extract singletons vs.segmental losses
    with open(quartetsfile) as fp:
        data = [x.rstrip().split("\t") for x in fp]
    skip = 1  # max distance between losses

    g = Grouper()
    # A row carries a fifth column only when it was labelled as a loss above
    losses = [(len(x) == 5) for x in data]
    nrows = len(losses)
    for i, is_loss in enumerate(losses):
        if not is_loss:
            continue
        g.join(i, i)
        itag = data[i][-1].split("|")[0]
        # Clamp the look-ahead window so a loss on the last row does not index
        # past the end of `data`.
        for j in range(i + 1, min(i + skip + 1, nrows)):
            if losses[j] and itag == data[j][-1].split("|")[0]:
                g.join(i, j)

    losses = list(g)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm = len(singletons), len(segments)
    assert len(losses) == ns + nm

    def grab_tag(pool, tag):
        # Groups in which every member row is labelled with `tag`
        return [x for x in pool if all(data[z][-1].startswith(tag) for z in x)]

    an_loss_singletons = grab_tag(singletons, AL)
    cn_loss_singletons = grab_tag(singletons, CL)
    als, cls = len(an_loss_singletons), len(cn_loss_singletons)

    an_loss_segments = grab_tag(segments, AL)
    cn_loss_segments = grab_tag(segments, CL)
    alm, clm = len(an_loss_segments), len(cn_loss_segments)
    # Groups are only joined on equal labels, so none should mix AN and CN
    mixed = len(segments) - alm - clm
    assert mixed == 0

    logging.debug("Singletons: {0} (AN LOSS: {1}, CN LOSS: {2})".format(ns, als, cls))
    logging.debug("Segments: {0} (AN LOSS: {1}, CN LOSS: {2})".format(nm, alm, clm))
    print(SummaryStats([len(x) for x in losses]), file=sys.stderr)

    with open(quartetsfile + ".summary", "w") as fw:
        for x in singletons + segments:
            print("### LENGTH =", len(x), file=fw)
            for i in x:
                print("\t".join(data[i]), file=fw)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # Outline:
    #   1. walk consecutive BED records and keep mate pairs where exactly one
    #      mate sits on a scaffold whose name starts with --prefix
    #   2. for each such scaffold, turn every pair into a candidate range on
    #      the other sequence using the library's (min, max) separation
    #   3. keep the deepest interval if it has at least --minlinks support,
    #      lies on a single sequence and is no wider than the largest library
    #   4. pick the orientation by majority vote of the supporting pairs
    # Decisions are logged to `<mates prefix>.log`; accepted placements are
    # written to `<mates prefix>.candidate.bed`.
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    prefix = opts.prefix
    minlinks = opts.minlinks

    cbeds = []
    # The context manager closes the log even if parsing or placement raises
    with open(logfile, "w") as log:
        mf = MatesFile(matesfile)
        maxdist = max(x.max for x in mf.libraries.values())
        logging.debug("Max separation: {0}".format(maxdist))

        bed = Bed(bedfile, sorted=False)
        unplaced = defaultdict(list)
        for a, b in pairwise(bed):
            aname, bname = a.accn, b.accn
            if aname not in mf:
                continue

            pa, _ = mf[aname]
            if pa != bname:
                continue

            ia = a.seqid.startswith(prefix)
            ib = b.seqid.startswith(prefix)
            # Need exactly one mate on an unplaced scaffold
            if ia == ib:
                continue

            # Normalize so that `b` is always the mate on the unplaced scaffold
            if ia:
                a, b = b, a

            unplaced[b.seqid].append((a, b))

        sizes = Sizes(fastafile)

        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            print(file=log)
            scf_size = sizes.get_size(scf)
            ranges = []
            for a, b in pairs:
                aname, astrand = a.accn, a.strand
                bstrand = b.strand
                aseqid = a.seqid
                _, lib = mf[aname]

                print(a, file=log)
                print(b, file=log)

                # Mates on the same strand mean the scaffold must be flipped
                flip_b = astrand == bstrand
                fbstrand = "-" if flip_b else "+"
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max
                assert astrand in ("+", "-")
                if astrand == "+":
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + scf_size
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                print("*" + "\t".join(str(x) for x in start_range), file=log)
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for depth in rd:
                alldepths.extend(depth)
            print(alldepths, file=log)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                print("Insufficient links ({0} < {1})".format(maxdepth, minlinks),
                      file=log)
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            if nseqids != 1:
                print("Multiple conflicting candidates found", file=log)
                continue

            seqid, _, _, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])
            if (mmax - mmin) > maxdist:
                # Give this rejection its own message rather than repeating
                # the conflicting-candidates one, so the log names the cause.
                print("(min, max) distance greater than library maxdist", file=log)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, _sf, _sc, strand in ranges:
                if range_overlap(arange, (sid, start, end)):
                    if strand == "+":
                        nplus += 1
                    else:
                        nminus += 1

            # Ties go to the plus orientation
            fbstrand = "+" if nplus >= nminus else "-"
            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            cbeds.append(BedLine("\t".join(str(x) for x in candidate)))
            print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
            print(candidate, file=log)

    candidatebed = Bed()
    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def paint(self, a, b, color):
    """Apply `color` to this object and to r1/r2 when (a, b) overlaps it.

    The span tested is (start + 1, end - 1), i.e. this object's own range
    shrunk by one on each side.
    """
    interior = (0, self.start + 1, self.end - 1)
    target = (0, a, b)
    if not range_overlap(interior, target):
        return

    self.r1.color = color
    self.r2.color = color
    self.color = color
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # Per gene: the gene feature is trimmed to its CDS extent; each mRNA is
    # either reinstated from the reference transcript (when --refgff has a
    # matching mRNA with identical CDS children) or trimmed to its CDS extent,
    # optionally widened by the --trimrange coordinates. Children are then
    # trimmed the same way, except CDS (written unchanged) and features
    # replaced by reference UTRs/exons.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back "
        + "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    # With neither list given, both ends are trimmed for every feature
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                fields = tr.split("\t")
                assert len(fields) == 3, "Must specify (start, stop) coordinate range"
                fid, start, stop = fields
                trimrange[fid] = (int(start), int(stop))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(
        featuretype="gene", order_by=("seqid", "start"), level=1
    ):
        for c in feat:
            cid, ctype, cparent = (
                c.id,
                c.featuretype,
                c.attributes.get("Parent", [None])[0],
            )
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # An mRNA inherits the trim request of its parent gene
                if any(fid in trim5 for fid in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(fid in trim3 for fid in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                # `refc` stays None unless the reference transcript was
                # actually used, so every mRNA is either reinstated from the
                # reference or trimmed below -- never left untouched.
                refc = None
                if refgff:
                    try:
                        ref = refgff[cid]
                        refptype = refgff[ref.attributes["Parent"][0]].featuretype
                        if (
                            ref.featuretype == "mRNA"
                            and refptype == "gene"
                            and cmp_children(cid, gff, refgff, cftype="CDS")
                        ):
                            refc = ref
                            reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                            if t5:
                                utr_types.append("five_prime_UTR")
                            if t3:
                                utr_types.append("three_prime_UTR")
                            # Collect the reference UTRs and the reference
                            # exons of this transcript that lie in them
                            for utr_type in utr_types:
                                for utr in refgff.children(refc, featuretype=utr_type):
                                    extras.add(utr)
                                    for exon in refgff.region(
                                        region=utr, featuretype="exon"
                                    ):
                                        if exon.attributes["Parent"][0] == cid:
                                            extras.add(exon)
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Transcript or its parent is absent from the reference
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if refc is None:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                ref_exons = [to_range(x) for x in extras if x.featuretype == "exon"]
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Superseded by the UTRs taken from the reference
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon" and any(
                        range_overlap(to_range(cc), r) for r in ref_exons
                    ):
                        # A reference exon covering this one is written instead
                        continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()
def paint(self, a, b, color):
    """Apply `color` to this object and to r1/r2 when (a, b) overlaps it.

    The span tested is (start + 1, end - 1), i.e. this object's own range
    shrunk by one on each side.
    """
    interior = (0, self.start + 1, self.end - 1)
    target = (0, a, b)
    if not range_overlap(interior, target):
        return

    self.r1.color = color
    self.r2.color = color
    self.color = color
def test_range_overlap(a, b, ratio, expected):
    """Check that range_overlap(a, b, ratio) equals `expected`."""
    from jcvi.utils.range import range_overlap

    observed = range_overlap(a, b, ratio)
    assert observed == expected
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # Per gene: the gene feature is trimmed to its CDS extent; each mRNA is
    # either reinstated from the reference transcript (when --refgff has a
    # matching mRNA with identical CDS children) or trimmed to its CDS extent,
    # optionally widened by the --trimrange coordinates. Children are then
    # trimmed the same way, except CDS (written unchanged) and features
    # replaced by reference UTRs/exons.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back "
                 + "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # With neither list given, both ends are trimmed for every feature
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                fields = tr.split("\t")
                assert len(fields) == 3, \
                    "Must specify (start, stop) coordinate range"
                fid, start, stop = fields
                trimrange[fid] = (int(start), int(stop))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"), level=1):
        for c in feat:
            cid, ctype = c.id, c.featuretype
            cparent = c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # An mRNA inherits the trim request of its parent gene
                if any(fid in trim5 for fid in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(fid in trim3 for fid in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                # `refc` stays None unless the reference transcript was
                # actually used, so every mRNA is either reinstated from the
                # reference or trimmed below -- never left untouched.
                refc = None
                if refgff:
                    try:
                        ref = refgff[cid]
                        refptype = refgff[ref.attributes['Parent'][0]].featuretype
                        if ref.featuretype == "mRNA" and refptype == "gene" \
                                and cmp_children(cid, gff, refgff, cftype="CDS"):
                            refc = ref
                            reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                            if t5:
                                utr_types.append('five_prime_UTR')
                            if t3:
                                utr_types.append('three_prime_UTR')
                            # Collect the reference UTRs and the reference
                            # exons of this transcript that lie in them
                            for utr_type in utr_types:
                                for utr in refgff.children(refc, featuretype=utr_type):
                                    extras.add(utr)
                                    for exon in refgff.region(region=utr,
                                                              featuretype="exon"):
                                        if exon.attributes['Parent'][0] == cid:
                                            extras.add(exon)
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Transcript or its parent is absent from the reference
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if refc is None:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                ref_exons = [to_range(x) for x in extras
                             if x.featuretype == 'exon']
                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Superseded by the UTRs taken from the reference
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon" and \
                            any(range_overlap(to_range(cc), r) for r in ref_exons):
                        # A reference exon covering this one is written instead
                        continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()