def get_stats(coordsfile):
    """
    Summarize the alignments stored in a coords file.

    Each record contributes its query interval (`start2`/`end2`) and its
    reference interval (`start1`/`end1`), both normalized so that the lower
    coordinate comes first.

    Returns a 3-tuple ``(qrycovered, refcovered, id_pct)``: `range_union` of
    the query intervals, `range_union` of the reference intervals, and the
    identity percentage weighted by the reference interval length.

    Raises ZeroDivisionError when the summed interval length is zero.
    """
    from jcvi.utils.range import range_union

    message = "Report stats on `%s`" % coordsfile
    logging.debug(message)

    query_intervals = []
    reference_intervals = []
    weighted_identity = 0
    total_length = 0
    for record in Coords(coordsfile):
        # Order the query coordinates regardless of strand.
        q_lo, q_hi = record.start2, record.end2
        if q_hi < q_lo:
            q_lo, q_hi = q_hi, q_lo
        query_intervals.append((record.query, q_lo, q_hi))

        # Same normalization on the reference side.
        r_lo, r_hi = record.start1, record.end1
        if r_hi < r_lo:
            r_lo, r_hi = r_hi, r_lo
        reference_intervals.append((record.ref, r_lo, r_hi))

        span = r_hi - r_lo
        total_length += span
        weighted_identity += record.identity / 100. * span

    qrycovered = range_union(query_intervals)
    refcovered = range_union(reference_intervals)
    return qrycovered, refcovered, weighted_identity * 100. / total_length
def get_stats(blastfile):
    """
    Summarize the alignments stored in a tabular BLAST file.

    Every line of `blastfile` is parsed with `BlastLine`. Query and subject
    intervals are normalized so that the lower coordinate comes first.

    Returns a 3-tuple ``(qrycovered, refcovered, id_pct)``: `range_union` of
    the query intervals, `range_union` of the subject intervals, and the
    identity percentage weighted by the subject interval length.

    Raises ZeroDivisionError when the summed interval length is zero
    (for example, an empty file).
    """
    from jcvi.utils.range import range_union

    logging.debug("report stats on `%s`", blastfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    alignlen = 0
    # The context manager closes the handle even if a row fails to parse;
    # the original left the file open for the garbage collector.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            alen = sstop - sstart
            alignlen += alen
            identicals += c.pctid / 100. * alen

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    id_pct = identicals * 100. / alignlen
    return qrycovered, refcovered, id_pct
def get_stats(blastfile, strict=False):
    """
    Collect alignment statistics from a tabular BLAST file.

    Args:
        blastfile: path of the BLAST output; each line is parsed with
            `BlastLine`.
        strict: when true, half of the total gap count is subtracted from
            both the query and the subject coverage.

    Returns:
        An `AlignStats` built from the base name of `blastfile`, the query
        and subject coverage (`range_union`), the query and subject span
        (`range_span`), the count of identical positions
        (hitlen - mismatches - gaps, summed over all hits) and the second
        value returned by `calculate_A50` over the hit lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`", blastfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []
    # The context manager closes the handle even if a row fails to parse;
    # the original never closed the file.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            alen = c.hitlen
            ngaps += c.ngaps
            identicals += alen - c.nmismatch - c.ngaps
            alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases. Gaps were not tracked
        # separately for query and subject, so they are assumed to be
        # opened evenly in the two sequences.
        # NOTE(review): under Python 3 true division an odd `ngaps` makes
        # both coverages floats -- confirm that is intended.
        qrycovered -= ngaps / 2
        refcovered -= ngaps / 2
    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(
        filename, qrycovered, refcovered, qryspan, refspan, identicals, AL50
    )

    return alignstats
def get_stats(blastfile, strict=False):
    """
    Gather coverage, span and identity statistics from a BLAST file.

    Args:
        blastfile: path of the tabular BLAST output, parsed line by line
            with `BlastLine`.
        strict: when true, subtract half of the accumulated gap count from
            the query coverage and from the subject coverage.

    Returns:
        An `AlignStats` holding, in order: the base name of `blastfile`,
        query coverage, subject coverage (both from `range_union`), query
        span, subject span (both from `range_span`), the number of
        identical positions, and the second value returned by
        `calculate_A50` for the list of hit lengths.
    """
    from jcvi.utils.range import range_union, range_span
    from .pyblast import BlastLine

    logging.debug("Report stats on `%s`", blastfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    ngaps = 0
    alignlens = []
    # Read inside a `with` block so the handle is released as soon as the
    # loop ends or raises; the original left the file open.
    with open(blastfile) as fp:
        for row in fp:
            c = BlastLine(row)

            # Store intervals with the lower coordinate first.
            qstart, qstop = c.qstart, c.qstop
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.sstart, c.sstop
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.subject, sstart, sstop))

            alen = c.hitlen
            ngaps += c.ngaps
            identicals += alen - c.nmismatch - c.ngaps
            alignlens.append(alen)

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    if strict:
        # We discount gaps in counting covered bases. Since gaps were not
        # tracked individually for query and subject, we assume they are
        # opened evenly in the two sequences.
        # NOTE(review): with Python 3 true division this can turn the
        # coverages into floats -- confirm that is acceptable downstream.
        qrycovered -= ngaps / 2
        refcovered -= ngaps / 2
    qryspan = range_span(qry_ivs)
    refspan = range_span(ref_ivs)
    _, AL50, _ = calculate_A50(alignlens)
    filename = op.basename(blastfile)
    alignstats = AlignStats(filename, qrycovered, refcovered, qryspan, refspan, identicals, AL50)

    return alignstats
def sum(self, seqid=None, unique=True):
    """
    Total the length of the features held by this container.

    Args:
        seqid: when truthy, only features whose `seqid` equals this value
            are counted.
        unique: when true, return `range_union` over the selected
            (seqid, start, end) triples; otherwise return the plain sum of
            the selected features' `span`.

    Returns:
        The requested total for the selected features.
    """
    # Select once and reuse the list: the seqid filter then applies to the
    # raw total as well (the original summed every feature regardless of
    # `seqid`), and a one-shot iterable is not consumed twice.
    if seqid:
        selected = [x for x in self if x.seqid == seqid]
    else:
        selected = list(self)

    if unique:
        return range_union([(x.seqid, x.start, x.end) for x in selected])

    # Explicit accumulation: the bare name `sum` would resolve to this very
    # definition if it were ever bound at module level.
    raw_sum = 0
    for x in selected:
        raw_sum += x.span
    return raw_sum
def bed_sum(beds, seqid=None, unique=True):
    """
    Total the length of the features in `beds`.

    Args:
        beds: iterable of features exposing `seqid`, `start`, `end` and
            `span`.
        seqid: when truthy, only features whose `seqid` equals this value
            are counted.
        unique: when true, return `range_union` over the selected
            (seqid, start, end) triples; otherwise return the plain sum of
            the selected features' `span`.

    Returns:
        The requested total for the selected features.
    """
    # Materialize the selection once. This makes the seqid filter apply to
    # the raw total too (the original summed every feature regardless of
    # `seqid`) and keeps one-shot iterators from being exhausted before
    # the second pass.
    if seqid:
        selected = [x for x in beds if x.seqid == seqid]
    else:
        selected = list(beds)

    # Only compute the total that is actually returned.
    if unique:
        return range_union([(x.seqid, x.start, x.end) for x in selected])
    return sum(x.span for x in selected)
def sum(self, seqid=None, unique=True):
    """
    Add up the length of the features in this container.

    Args:
        seqid: when truthy, restrict the total to features whose `seqid`
            matches this value.
        unique: when true, the result is `range_union` of the selected
            (seqid, start, end) triples; when false, it is the sum of the
            selected features' `span`.

    Returns:
        The requested total for the selected features.
    """
    # Filter a single time and work from that list. The original applied
    # `seqid` only to the range_union input, so the raw total silently
    # covered every feature; it also iterated `self` twice.
    if seqid:
        selected = [x for x in self if x.seqid == seqid]
    else:
        selected = list(self)

    if unique:
        return range_union([(x.seqid, x.start, x.end) for x in selected])

    # Accumulate by hand so this code never depends on which object the
    # name `sum` is bound to where the definition lives.
    raw_sum = 0
    for x in selected:
        raw_sum += x.span
    return raw_sum
def get_stats(coordsfile):
    """
    Build an `AlignStats` record from a coords file.

    Each record contributes its query interval (`start2`/`end2`) and its
    reference interval (`start1`/`end1`), stored with the lower coordinate
    first. The returned `AlignStats` receives the base name of
    `coordsfile`, the `range_union` of the query intervals, the
    `range_union` of the reference intervals, two `None` placeholders and
    the identity-weighted reference length.
    """
    from jcvi.utils.range import range_union

    message = "Report stats on `%s`" % coordsfile
    logging.debug(message)

    query_intervals = []
    reference_intervals = []
    weighted_identity = 0
    total_length = 0  # accumulated, but not part of the returned record
    lengths = []
    for record in Coords(coordsfile):
        q_lo, q_hi = record.start2, record.end2
        if q_hi < q_lo:
            q_lo, q_hi = q_hi, q_lo
        query_intervals.append((record.query, q_lo, q_hi))

        r_lo, r_hi = record.start1, record.end1
        if r_hi < r_lo:
            r_lo, r_hi = r_hi, r_lo
        reference_intervals.append((record.ref, r_lo, r_hi))

        span = r_hi - r_lo
        total_length += span
        weighted_identity += record.identity / 100. * span
        lengths.append(span)

    qrycovered = range_union(query_intervals)
    refcovered = range_union(reference_intervals)
    # The A50 helper is still invoked, although its result is not passed on.
    _, AL50, _ = calculate_A50(lengths)
    return AlignStats(
        op.basename(coordsfile),
        qrycovered,
        refcovered,
        None,
        None,
        weighted_identity,
    )
def get_stats(coordsfile):
    """
    Summarize the alignments stored in a coords file.

    Each line of `coordsfile` is parsed with `CoordsLine`; lines for which
    the parser raises AssertionError are skipped. Query (`start2`/`end2`)
    and reference (`start1`/`end1`) intervals are stored with the lower
    coordinate first.

    Returns a 3-tuple ``(qrycovered, refcovered, id_pct)``: `range_union` of
    the query intervals, `range_union` of the reference intervals, and the
    identity percentage weighted by the reference interval length.

    Raises ZeroDivisionError when the summed interval length is zero
    (for example, when no line could be parsed).
    """
    from jcvi.utils.range import range_union

    logging.debug("report stats on `%s`", coordsfile)

    ref_ivs = []
    qry_ivs = []
    identicals = 0
    alignlen = 0
    # The context manager closes the handle on every exit path; the
    # original never closed the file.
    with open(coordsfile) as fp:
        for row in fp:
            try:
                c = CoordsLine(row)
            except AssertionError:
                # Line rejected by the parser -- skip it and keep reading.
                continue

            qstart, qstop = c.start2, c.end2
            if qstart > qstop:
                qstart, qstop = qstop, qstart
            qry_ivs.append((c.query, qstart, qstop))

            sstart, sstop = c.start1, c.end1
            if sstart > sstop:
                sstart, sstop = sstop, sstart
            ref_ivs.append((c.ref, sstart, sstop))

            alen = sstop - sstart
            alignlen += alen
            identicals += c.identity / 100. * alen

    qrycovered = range_union(qry_ivs)
    refcovered = range_union(ref_ivs)
    id_pct = identicals * 100. / alignlen
    return qrycovered, refcovered, id_pct
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser below, so it is runtime text and is left as-is.
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    # Default cutoffs: 95% identity, 50% coverage.
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
            help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
            help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    p.add_option(
            "--iterby", dest="iterby", default="query", choices=allowed_iterby,
            help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    # Exactly two positional arguments are required; otherwise print the
    # help text and exit.
    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    # Union of intervals is the default; --supermap switches it off.
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    # True when hits are grouped per (query, subject) pair instead of per query.
    qspair = iterby == "query_sbjct"

    if not union:
        # Reuse `<blastfile>.query.supermap` when it already exists,
        # otherwise call supermap() first (presumably that call writes the
        # file -- the assert below checks that it is there). From here on
        # the supermap file is the BLAST input.
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    # Running totals over all groups.
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    # Maps each group key to its (identity, coverage) pair.
    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            # Assigned before the identity filter, so it is set as long as
            # the group has at least one hit, even if all are skipped.
            cov_id = s

            # Hits below the identity cutoff do not count.
            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            # Every interval is filed under the same placeholder id "1".
            ranges.append(("1", start, stop))

        # Guarded by `ranges` so this_alignlen is only divided by when at
        # least one hit was kept.
        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        # In union mode the plain sum computed above is replaced by the
        # range_union of the kept intervals.
        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    # Optional listing of identity and coverage (Python 2 print statements).
    if opts.list:
        if qspair:
            # Index every stored (q, s) pair under both of its members.
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            # Ids without a stored entry are reported as (0, 0).
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Assemble the summary text.
    # NOTE(review): `total` and `alignlen` are used as divisors below; if
    # the sizes mapping is empty or no hit passed the identity filter these
    # divisions raise ZeroDivisionError.
    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    # The summary goes both to stderr and to `<blastfile>.covfilter.log`.
    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    # Optionally write the keys that passed both cutoffs, one per line.
    # NOTE(review): this handle is not closed within this function.
    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Second pass over the BLAST file: keep only lines whose key is valid.
    # NOTE(review): this handle is not closed within this function either.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage
    # text, so it is runtime text and is left as-is.
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    # Default cutoffs: 95% identity, 50% coverage.
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
            help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
            help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query", choices=allowed_iterby,
            help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    # Two positional arguments are mandatory; otherwise show the help text
    # and exit.
    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    # Interval union is used unless --supermap was given.
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    # True when hits are grouped per (query, subject) pair.
    qspair = iterby == "query_sbjct"

    if not union:
        # Use `<blastfile>.query.supermap`, calling supermap() first when
        # that file is missing (presumably supermap() creates it -- the
        # assert below verifies the file exists). The supermap file then
        # replaces the original BLAST input.
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    # Totals accumulated across all groups.
    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    # Group key -> (identity, coverage).
    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            # Set before the identity filter: defined whenever the group
            # has at least one hit, kept or not.
            cov_id = s

            # Skip hits below the identity cutoff.
            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            # All intervals share the placeholder sequence id "1".
            ranges.append(("1", start, stop))

        # Only divide by this_alignlen when at least one hit was kept.
        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        # Union mode overrides the plain sum with range_union of the kept
        # intervals.
        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    # Optional listing of identity and coverage (Python 2 print statements).
    if opts.list:
        if qspair:
            # Make each stored (q, s) pair reachable from either member.
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs), this_identity, this_coverage)
        else:
            # Ids with no stored entry print as (0, 0).
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Build the summary message.
    # NOTE(review): `total` and `alignlen` are divisors below; an empty
    # sizes mapping or zero kept hits leads to ZeroDivisionError.
    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    # Emit the summary to stderr and to `<blastfile>.covfilter.log`.
    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    # Optionally dump the keys that satisfied both cutoffs.
    # NOTE(review): the handle opened here is not closed in this function.
    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Re-read the BLAST file and write out only the lines with a valid key.
    # NOTE(review): the output handle is not closed in this function.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
def test_range_union(ranges, expected):
    """Check that `range_union(ranges)` equals `expected`."""
    from jcvi.utils.range import range_union

    observed = range_union(ranges)
    assert observed == expected