def longest(args):
    """
    %prog longest bedfile fastafile

    Select longest feature within overlapping piles.
    """
    # The docstring above is handed to OptionParser as the usage/help text,
    # so implementation notes are kept in comments rather than added to it.
    #
    # args: list of command-line tokens. After option parsing exactly two
    #       positionals must remain: bedfile and fastafile.
    # Side effects: writes `<pf>.longest.ids` and asks some() to write
    #       `<pf>.longest.bed`, where pf is bedfile minus its last extension.
    # NOTE(review): another definition of `longest` appears later in this
    # file and will shadow this one -- confirm which one is intended.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--maxsize", default=20000, type="int",
                 help="Limit max size")
    p.add_option("--minsize", default=60, type="int",
                 help="Limit min size")
    p.add_option("--precedence", default="Medtr",
                 help="Accessions with prefix take precedence")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Wrong number of positionals: show usage and exit.
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    maxsize = opts.maxsize
    minsize = opts.minsize
    prec = opts.precedence
    # Presumably collapses overlapping features into piles whose names are
    # joined with ";" (accn is split on ";" below) -- verify against mergeBed.
    mergedbed = mergeBed(bedfile, nms=True)
    sizes = Sizes(fastafile).mapping
    bed = Bed(mergedbed)

    pf = bedfile.rsplit(".", 1)[0]
    ids = set()
    for b in bed:
        accns = b.accn.split(";")
        # If any accession carries the precedence prefix, only those compete.
        prec_accns = [x for x in accns if x.startswith(prec)]
        if prec_accns:
            accns = prec_accns
        # Accessions missing from the size mapping are treated as size 0.
        accn_sizes = [(sizes.get(x, 0), x) for x in accns]
        accn_sizes = [(size, x) for size, x in accn_sizes if size < maxsize]
        if not accn_sizes:
            continue
        # Tuple comparison: largest size wins, ties broken by accession name.
        max_size, max_accn = max(accn_sizes)
        if max_size < minsize:
            continue
        ids.add(max_accn)

    newids = remove_isoforms(ids)
    logging.debug("Remove isoforms: before={0} after={1}".format(
        len(ids), len(newids)))

    longestidsfile = pf + ".longest.ids"
    # A context manager closes the handle even if the write fails, and
    # write() (unlike the `print >> fw` chevron) behaves the same on
    # Python 2 and 3. The trailing newline matches what print emitted.
    with open(longestidsfile, "w") as fw:
        fw.write("\n".join(newids) + "\n")
    logging.debug("A total of {0} records written to `{1}`.".format(
        len(newids), longestidsfile))

    longestbedfile = pf + ".longest.bed"
    # Hand the original BED and the id list to some(), directing its output
    # to longestbedfile.
    some([
        bedfile,
        longestidsfile,
        "--outfile={0}".format(longestbedfile),
        "--no_strip_names",
    ])
def longest(args):
    """
    %prog longest bedfile fastafile

    Select longest feature within overlapping piles.
    """
    # The docstring above is handed to OptionParser as the usage/help text,
    # so implementation notes are kept in comments rather than added to it.
    #
    # args: list of command-line tokens. After option parsing exactly two
    #       positionals must remain: bedfile and fastafile.
    # Side effects: writes `<pf>.longest.ids` and asks some() to write
    #       `<pf>.longest.bed`, where pf is bedfile minus its last extension.
    # NOTE(review): an earlier definition of `longest` exists in this file;
    # this later one is what the name is bound to once the module has loaded
    # -- confirm the duplicate is intentional.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--maxsize", default=20000, type="int",
                 help="Limit max size")
    p.add_option("--minsize", default=60, type="int",
                 help="Limit min size")
    p.add_option("--precedence", default="Medtr",
                 help="Accessions with prefix take precedence")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Wrong number of positionals: show usage and exit.
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    maxsize = opts.maxsize
    minsize = opts.minsize
    prec = opts.precedence
    # Presumably collapses overlapping features into piles whose names are
    # joined with ";" (accn is split on ";" below) -- verify against mergeBed.
    mergedbed = mergeBed(bedfile, nms=True)
    sizes = Sizes(fastafile).mapping
    bed = Bed(mergedbed)

    pf = bedfile.rsplit(".", 1)[0]
    ids = set()
    for b in bed:
        accns = b.accn.split(";")
        # If any accession carries the precedence prefix, only those compete.
        prec_accns = [x for x in accns if x.startswith(prec)]
        if prec_accns:
            accns = prec_accns
        # Accessions missing from the size mapping are treated as size 0.
        accn_sizes = [(sizes.get(x, 0), x) for x in accns]
        accn_sizes = [(size, x) for size, x in accn_sizes if size < maxsize]
        if not accn_sizes:
            continue
        # Tuple comparison: largest size wins, ties broken by accession name.
        max_size, max_accn = max(accn_sizes)
        if max_size < minsize:
            continue
        ids.add(max_accn)

    newids = remove_isoforms(ids)
    logging.debug("Remove isoforms: before={0} after={1}".format(
        len(ids), len(newids)))

    longestidsfile = pf + ".longest.ids"
    # A context manager closes the handle even if the write fails, and
    # write() (unlike the `print >> fw` chevron) behaves the same on
    # Python 2 and 3. The trailing newline matches what print emitted.
    with open(longestidsfile, "w") as fw:
        fw.write("\n".join(newids) + "\n")
    logging.debug("A total of {0} records written to `{1}`.".format(
        len(newids), longestidsfile))

    longestbedfile = pf + ".longest.bed"
    # Hand the original BED and the id list to some(), directing its output
    # to longestbedfile.
    some([bedfile, longestidsfile,
          "--outfile={0}".format(longestbedfile), "--no_strip_names"])