def completeness(args):
    """
    %prog completeness blastfile query.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit
    in AllGroup.niaa, as an indicator for completeness of the gene model.
    """
    from jcvi.utils.range import range_minmax

    p = OptionParser(completeness.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    # Sequence lengths keyed by id; the lookup below uses the *subject* id, so
    # the FASTA file has to contain the subject sequences.
    sizes = Sizes(fastafile).mapping
    blast = BlastSlow(blastfile)

    # groupby only merges adjacent records, so lines of one query must be
    # contiguous in the BLAST file.
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # NOTE(review): ranges are collected from every line of the query, not
        # only from lines hitting the first subject -- presumably the input is
        # pre-filtered to the best hit; confirm with callers.
        ranges = [(x.sstart, x.sstop) for x in blines]
        best = blines[0]
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = sizes[subject]

        # Number of subject positions left uncovered on either end.  Both are
        # 0 when the alignment spans 1..subject_len.
        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        print("\t".join(str(x) for x in
                        (query, subject, nterminal_dist, cterminal_dist)))
def completeness(args):
    """
    %prog completeness blastfile query.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit
    in AllGroup.niaa, as an indicator for completeness of the gene model.
    """
    from jcvi.utils.range import range_minmax

    p = OptionParser(completeness.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    # Sequence lengths keyed by id; the lookup below uses the *subject* id, so
    # the FASTA file has to contain the subject sequences.
    sizes = Sizes(fastafile).mapping
    blast = BlastSlow(blastfile)

    # groupby only merges adjacent records, so lines of one query must be
    # contiguous in the BLAST file.
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # NOTE(review): ranges are collected from every line of the query, not
        # only from lines hitting the first subject -- presumably the input is
        # pre-filtered to the best hit; confirm with callers.
        ranges = [(x.sstart, x.sstop) for x in blines]
        best = blines[0]
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = sizes[subject]

        # Number of subject positions left uncovered on either end.  Both are
        # 0 when the alignment spans 1..subject_len.
        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        print("\t".join(str(x) for x in
                        (query, subject, nterminal_dist, cterminal_dist)))
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option("--ids",
                 help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids

    # Sequence lengths keyed by id; looked up with the subject id below.
    sizes = Sizes(fastafile).mapping
    blast = BlastSlow(blastfile)

    valid = []   # queries whose coverage exceeds the cutoff
    data = []    # one (nterminal_dist, cterminal_dist, covered) per query
    cutoff = 50
    # groupby only merges adjacent records, so lines of one query must be
    # contiguous in the BLAST file.
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        best = blines[0]
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = sizes[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(str(x) for x in
                        (query, subject, nterminal_dist, cterminal_dist, covered)))

    m = "Total: {0}, Coverage > {1}%: {2}\n".format(len(data), cutoff, len(valid))
    if data:
        nd, cd, cv = zip(*data)
        m += "N-terminal: {0}\n".format(SummaryStats(nd))
        m += "C-terminal: {0}\n".format(SummaryStats(cd))
        m += "Coverage: {0}".format(SummaryStats(cv))
    else:
        # zip(*[]) cannot be unpacked into three names; an empty BLAST file
        # gets the totals line only instead of a ValueError.
        m += "No alignments found, summary statistics skipped"
    sys.stderr.write(m + "\n")

    if idsfile:
        # Context manager guarantees the handle is released even on error.
        with open(idsfile, "w") as fw:
            fw.write("\n".join(valid) + "\n")
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".format(
            len(valid), cutoff, idsfile))
def gff(args):
    """
    %prog gff btabfile

    Convert btab file generated by AAT to gff3 format.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.formats.gff import valid_gff_parent_child, valid_gff_type

    p = OptionParser(gff.__doc__)
    source_help = (
        "Specify GFF source."
        " By default, it picks algorithm used to generate btab file."
        " [default: %default]"
    )
    p.add_option("--source", default=None, help=source_help)
    p.add_option(
        "--type",
        default="protein_match",
        choices=valid_gff_type,
        help="GFF feature type [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (btabfile,) = args
    chains = {}
    records = Btab(btabfile, aat_dialect=True)
    source = opts.source or "aat"
    parent_type = opts.type
    child_type = valid_gff_parent_child[parent_type]

    # First pass: bucket the btab lines by their key, remembering the parent
    # attributes once and accumulating coordinates plus rendered child lines.
    for line in records:
        nargs = line.nargs
        chain_id = line.query + "-" + parent_type + "{0:05d}".format(line.chainNum)
        key = line.key
        chain = chains.get(key)
        if chain is None:
            chain = {
                "id": chain_id,
                "method": line.method,
                "query": line.query,
                "subject": line.subject,
                "strand": line.qStrand,
                "sDesc": line.sDesc,
                "coords": [],
                "children": [],
            }
            chains[key] = chain

        chain["coords"].append((line.qStart, line.qStop))
        chain["children"].append(
            line.gffline(source=source, type=child_type, id=chain_id)
        )

    # Second pass: emit one synthetic parent feature per bucket, spanning all
    # of its members, followed by the member lines.
    for chain in chains.values():
        # Placeholder line with `nargs` zero-filled columns; the fields that
        # matter are overwritten right below.
        parent = BtabLine("\t".join(["0"] * nargs), aat_dialect=True)
        chain_id = chain["id"]
        parent.query = chain["query"]
        parent.method = chain["method"]
        parent.subject = chain["subject"]
        parent.qStrand = chain["strand"]
        parent.sDesc = chain["sDesc"]
        parent.qStart, parent.qStop = range_minmax(chain["coords"])

        print(parent.gffline(source=source, type=parent_type,
                             primary_tag="ID", id=chain_id))
        print("\n".join(chain["children"]))
def get_boundary_bases(start, end, order):
    """
    Return (seqid, startbase, endbase) for the span bounded by two entries.

    `order[start]` and `order[end]` must each unpack into a pair whose second
    element exposes `seqid`, `start` and `end`.  Both entries have to sit on
    the same seqid (checked with an assertion).  The base coordinates are the
    result of `range_minmax` over the two (start, end) intervals.
    """
    from jcvi.utils.range import range_minmax

    head, tail = order[start], order[end]
    _, first = head
    _, last = tail

    seqid = first.seqid
    assert seqid == last.seqid

    intervals = [(first.start, first.end), (last.start, last.end)]
    startbase, endbase = range_minmax(intervals)
    return seqid, startbase, endbase
def clr(args):
    """
    %prog clr blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    # Sequence id => length, pooled over all FASTA files.
    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():
        qsize = sizes[query]
        vectors = [(x.qstart, x.qstop) for x in hits]
        vmin, vmax = range_minmax(vectors)

        # Keep whichever flank of the vector-matching span is longer.
        left_size = vmin - 1
        right_size = qsize - vmax
        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        # Reported sequences are removed so the loop below only sees
        # sequences without any vector hit.
        del sizes[query]

    # Sequences with no vector hit are clear over their full length.
    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
def get_cds_minmax(g, cid, level=2):
    """
    Collect the (start, end) of every child of `cid` in `g` (queried with
    `g.children(cid, level)`) whose featuretype is "CDS", and return what
    `range_minmax` computes over those intervals.
    """
    cdsranges = [
        (child.start, child.end)
        for child in g.children(cid, level)
        if child.featuretype == "CDS"
    ]
    return range_minmax(cdsranges)
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes. For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is accepted for the command line contract but not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        # Consecutive genes are split into runs of "good" (coverage >= cutoff)
        # and "missing"; only the missing runs are processed.
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len({gene_name(x.accn) for x in beds})
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                # NOTE(review): gi - 1 wraps to the last record when gi == 0
                # and gi + 1 can run past the end of the bed -- confirm that
                # such genes cannot show up as singletons.
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                # Only a gap no wider than --maxsize is recorded as a patch.
                if 0 < span <= maxsize:
                    print(
                        "\t".join(
                            str(x) for x in (seqid, leftb.start, rightb.end, gb.accn)
                        ),
                        file=fwp,
                    )

                print(leftb, file=fwe)
                print(gb, file=fwe)
                print(rightb, file=fwe)
                print(
                    "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label
                    ),
                    file=fwe,
                )
                print(gb.accn, file=fw_ids)
                continue

            large += 1
            large_genes += blocksize

            # Whole cassette, padded by --flank on both sides.
            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print("\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)), file=fw)

    # Close every output before anything downstream reads it; paste.bed and
    # the .ids file were previously left open.
    fw.close()
    fwe.close()
    fwp.close()
    fw_ids.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    # Output files share the mates file name minus its last extension.
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; caps the width of an accepted placement.
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    # A sequence counts as unplaced when its id starts with --prefix.
    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    # Scan adjacent bed records, keeping pairs that are mates of each other
    # with exactly one end on an unplaced sequence.
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        # mf[name] unpacks into (mate name, library).
        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalize so that `a` is the placed end and `b` the unplaced end.
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            # Mates on the same strand mean the unplaced scaffold has to be
            # flipped; `b` is then reverse-complemented in place.
            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            # Window of scaffold positions on `aseqid` that is compatible with
            # the library's (min, max) bounds.
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        # Only (seqid, start, stop) goes into the depth computation.
        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        # The last field of each depth record is compared against --minlinks.
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        # All best-supported intervals must sit on a single sequence.
        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        # Merge the (start, stop) of all best intervals into one span.
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to the plus strand.
        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST supplied: use an empty placeholder file, removed
        # again at the end of the run.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

    blocks = []       # (query gene, target) rows in file order
    genetrack = {}    # query gene => target column as read from the blocks file
    proxytrack = {}   # query gene => (seqid, start, end) region to search
    # Context manager releases the handle that was previously left open.
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Collapse consecutive rows sharing the same target into runs.
    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # A run of "." rows needs a neighbor run on both sides.
        if i == 0 or i == imax:
            continue
        if key != ".":
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        # Flanking targets must be on one sequence and within --gdist genes.
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        # Region spanned by both flanking targets, padded by --bdist and
        # clamped at position 1.
        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    # Tag each proxied gene: "S" when one of its genomic hits overlaps the
    # proxy region, "NS" otherwise.  The stored region becomes the overlapping
    # hit, or the first hit when none overlaps.
    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            # "[NF]" marks a proxied gene without any genomic hit.
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes. For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is accepted for the command line contract but not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        # Consecutive genes are split into runs of "good" (coverage >= cutoff)
        # and "missing"; only the missing runs are processed.
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set(gene_name(x.accn) for x in beds))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                # NOTE(review): gi - 1 wraps to the last record when gi == 0
                # and gi + 1 can run past the end of the bed -- confirm that
                # such genes cannot show up as singletons.
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                # Only a gap no wider than --maxsize is recorded as a patch.
                if 0 < span <= maxsize:
                    fwp.write("\t".join(
                        str(x) for x in (seqid, leftb.start, rightb.end, gb.accn)
                    ) + "\n")

                fwe.write(str(leftb) + "\n")
                fwe.write(str(gb) + "\n")
                fwe.write(str(rightb) + "\n")
                fwe.write("L:{0} R:{1} [{2}]".format(
                    distance_to_left, distance_to_right, label) + "\n")
                fw_ids.write(str(gb.accn) + "\n")
                continue

            large += 1
            large_genes += blocksize

            # Whole cassette, padded by --flank on both sides.
            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            fw.write("\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)) + "\n")

    # Close every output before anything downstream reads it; paste.bed and
    # the .ids file were previously left open.
    fw.close()
    fwe.close()
    fwp.close()
    fw_ids.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    # Output files share the mates file name minus its last extension.
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    def note(msg=""):
        # One record per line in the placement log.
        log.write(str(msg) + "\n")

    mf = MatesFile(matesfile)
    # Largest `max` over all libraries; caps the width of an accepted placement.
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    # A sequence counts as unplaced when its id starts with --prefix.
    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    unplaced = defaultdict(list)

    # Scan adjacent bed records, keeping pairs that are mates of each other
    # with exactly one end on an unplaced sequence.
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        # mf[name] unpacks into (mate name, library).
        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Normalize so that `a` is the placed end and `b` the unplaced end.
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        note()
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bstrand = b.strand
            aseqid = a.seqid
            pa, lib = mf[aname]

            note(a)
            note(b)

            # Mates on the same strand mean the unplaced scaffold has to be
            # flipped; `b` is then reverse-complemented in place.
            flip_b = (astrand == bstrand)
            fbstrand = '-' if flip_b else '+'
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ('+', '-')
            # Window of scaffold positions on `aseqid` that is compatible with
            # the library's (min, max) bounds.
            if astrand == '+':
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            note("*" + "\t".join(str(x) for x in start_range))
            ranges.append(start_range)

        # Only (seqid, start, stop) goes into the depth computation.
        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        note(alldepths)

        # The last field of each depth record is compared against --minlinks.
        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            note("Insufficient links ({0} < {1})".format(maxdepth, minlinks))
            continue

        # All best-supported intervals must sit on a single sequence.
        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            note("Multiple conflicting candidates found")
            continue

        seqid, mmin, mmax, depth = candidates[0]
        # Merge the (start, stop) of all best intervals into one span.
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        # A degenerate span cannot be turned into a usable bed record.
        if mmin >= mmax:
            note("Invalid (min, max) range")
            continue

        # This rejection has its own message; it used to be logged as a
        # candidate conflict, which made the log misleading.
        if (mmax - mmin) > maxdist:
            note("(min, max) distance greater than library maxdist")
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == '+':
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to the plus strand.
        fbstrand = '+' if nplus >= nminus else '-'

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        note("Plus: {0}, Minus: {1}".format(nplus, nminus))
        note(candidate)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(
        len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def gff(args):
    """
    %prog gff btabfile

    Convert btab file generated by AAT to gff3 format.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.formats.gff import valid_gff_parent_child, valid_gff_type

    p = OptionParser(gff.__doc__)
    source_help = (
        "Specify GFF source."
        " By default, it picks algorithm used to generate btab file."
        " [default: %default]"
    )
    p.add_option("--source", default=None, help=source_help)
    p.add_option(
        "--type",
        default="protein_match",
        choices=valid_gff_type,
        help="GFF feature type [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (btabfile,) = args
    chains = {}
    records = Btab(btabfile, aat_dialect=True)
    source = opts.source or "aat"
    parent_type = opts.type
    child_type = valid_gff_parent_child[parent_type]

    # First pass: bucket the btab lines by their key, remembering the parent
    # attributes once and accumulating coordinates plus rendered child lines.
    for line in records:
        nargs = line.nargs
        chain_id = line.query + "-" + parent_type + "{0:05d}".format(line.chainNum)
        key = line.key
        chain = chains.get(key)
        if chain is None:
            chain = {
                "id": chain_id,
                "method": line.method,
                "query": line.query,
                "subject": line.subject,
                "strand": line.qStrand,
                "sDesc": line.sDesc,
                "coords": [],
                "children": [],
            }
            chains[key] = chain

        chain["coords"].append((line.qStart, line.qStop))
        chain["children"].append(
            line.gffline(source=source, type=child_type, id=chain_id)
        )

    # Second pass: emit one synthetic parent feature per bucket, spanning all
    # of its members, followed by the member lines.
    for chain in chains.values():
        # Placeholder line with `nargs` zero-filled columns; the fields that
        # matter are overwritten right below.
        parent = BtabLine("\t".join(["0"] * nargs), aat_dialect=True)
        chain_id = chain["id"]
        parent.query = chain["query"]
        parent.method = chain["method"]
        parent.subject = chain["subject"]
        parent.qStrand = chain["strand"]
        parent.sDesc = chain["sDesc"]
        parent.qStart, parent.qStop = range_minmax(chain["coords"])

        print(parent.gffline(source=source, type=parent_type,
                             primary_tag="ID", id=chain_id))
        print("\n".join(chain["children"]))
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option(
        "--ids",
        help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids

    # Sequence lengths keyed by id; looked up with the subject id below.
    sizes = Sizes(fastafile).mapping
    blast = BlastSlow(blastfile)

    valid = []   # queries whose coverage exceeds the cutoff
    data = []    # one (nterminal_dist, cterminal_dist, covered) per query
    cutoff = 50
    # groupby only merges adjacent records, so lines of one query must be
    # contiguous in the BLAST file.
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        best = blines[0]
        query, subject = best.query, best.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = sizes[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(
            str(x) for x in
            (query, subject, nterminal_dist, cterminal_dist, covered)))

    m = "Total: {0}, Coverage > {1}%: {2}\n".format(
        len(data), cutoff, len(valid))
    if data:
        nd, cd, cv = zip(*data)
        m += "N-terminal: {0}\n".format(SummaryStats(nd))
        m += "C-terminal: {0}\n".format(SummaryStats(cd))
        m += "Coverage: {0}".format(SummaryStats(cv))
    else:
        # zip(*[]) cannot be unpacked into three names; an empty BLAST file
        # gets the totals line only instead of a ValueError.
        m += "No alignments found, summary statistics skipped"
    sys.stderr.write(m + "\n")

    if idsfile:
        # Context manager guarantees the handle is released even on error.
        with open(idsfile, "w") as fw:
            fw.write("\n".join(valid) + "\n")
        logging.debug(
            "A total of {0} ids (cov > {1} %) written to `{2}`.".format(
                len(valid), cutoff, idsfile))
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    # The two help fragments need a separating space; without it the help
    # text read "trim backbased".
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back" +
                 " based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    gff = make_index(gffile)

    # Without an explicit gene list, both ends get trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Feature id => user-suggested (start, stop); one tab-separated
    # "id<TAB>start<TAB>stop" record per line.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        for tr in trf:
            fields = tr.split("\t")
            assert len(fields) == 3, \
                "Must specify (start, stop) coordinate range"
            fid, start, stop = fields
            trimrange[fid] = (int(start), int(stop))
        trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"), level=1):
        for c in feat:
            cid, ctype = c.id, c.featuretype
            cparent = c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # A transcript inherits the trim request of its parent gene;
                # its own id is then registered in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                # Try to reinstate UTRs from the reference transcript with the
                # same id.  `refc` stays None when the id is not in the
                # reference.
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append('five_prime_UTR')
                                if t3:
                                    utr_types.append('three_prime_UTR')
                                # Carry over the reference UTRs and those
                                # reference exons in each UTR's region whose
                                # Parent is this transcript.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                            refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                                region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                            else:
                                # CDS check failed: fall back to plain trimming.
                                refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    # Combine the suggested range with the CDS extent.
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    # Children of a reinstated UTR type are dropped; the
                    # reference copies in `extras` are written instead.
                    if _ctype in utr_types:
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        # Skip exons overlapping a reinstated reference exon.
                        eskip = [range_overlap(to_range(cc), to_range(x))
                                 for x in extras if x.featuretype == 'exon']
                        if any(eskip):
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    # The two help fragments need a separating space; without it the help
    # text read "trim backbased".
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back"
        + " based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args

    gff = make_index(gffile)

    # Without an explicit gene list, both ends get trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Feature id => user-suggested (start, stop); one tab-separated
    # "id<TAB>start<TAB>stop" record per line.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        for tr in trf:
            fields = tr.split("\t")
            assert len(fields) == 3, "Must specify (start, stop) coordinate range"
            fid, start, stop = fields
            trimrange[fid] = (int(start), int(stop))
        trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(
        featuretype="gene", order_by=("seqid", "start"), level=1
    ):
        for c in feat:
            cid, ctype = c.id, c.featuretype
            cparent = c.attributes.get("Parent", [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # A transcript inherits the trim request of its parent gene;
                # its own id is then registered in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                # Try to reinstate UTRs from the reference transcript with the
                # same id.  `refc` stays None when the id is not in the
                # reference.
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes["Parent"][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(
                                    c, refc, trim5=t5, trim3=t3, both=trim_both
                                )
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                # Carry over the reference UTRs and those
                                # reference exons in each UTR's region whose
                                # Parent is this transcript.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                        refc, featuretype=utr_type
                                    ):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                            region=utr, featuretype="exon"
                                        ):
                                            if exon.attributes["Parent"][0] == cid:
                                                extras.add(exon)
                            else:
                                # CDS check failed: fall back to plain trimming.
                                refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    # Combine the suggested range with the CDS extent.
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    # Children of a reinstated UTR type are dropped; the
                    # reference copies in `extras` are written instead.
                    if _ctype in utr_types:
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        # Skip exons overlapping a reinstated reference exon.
                        eskip = [
                            range_overlap(to_range(cc), to_range(x))
                            for x in extras
                            if x.featuretype == "exon"
                        ]
                        if any(eskip):
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()
def test_range_minmax(ranges, expected):
    """Check that `range_minmax(ranges)` equals the `expected` value."""
    from jcvi.utils.range import range_minmax

    observed = range_minmax(ranges)
    assert observed == expected