def plot_GC3(GC3, cdsfile, fill="white"): from jcvi.graphics.histogram import histogram numberfile = "{0}.gc3".format(cdsfile) fw = must_open(numberfile, "w") fw.write("\n".join(map(str, GC3.values()))) fw.close() histogram(numberfile, vmin=0, vmax=1, xlabel="GC3", title=cdsfile, bins=50, skip=0, ascii=False, log=0, fill=fill) logging.debug("{0} GC3 values plotted to {1}.pdf".\ format(len(GC3), numberfile))
def report_pairs(data, cutoff=0, mateorientation=None, pairsfile=None, insertsfile=None, rclip=1, ascii=False, bins=20, distmode="ss", mpcutoff=1000): """ This subroutine is used by the pairs function in blast.py and cas.py. Reports number of fragments and pairs as well as linked pairs """ allowed_mateorientations = ("++", "--", "+-", "-+") if mateorientation: assert mateorientation in allowed_mateorientations num_fragments, num_pairs = 0, 0 all_dist = [] linked_dist = [] # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie` orientations = defaultdict(int) # clip how many chars from end of the read name to get pair name key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn) data.sort(key=key) if pairsfile: pairsfw = open(pairsfile, "w") if insertsfile: insertsfw = open(insertsfile, "w") for pe, lines in groupby(data, key=key): lines = list(lines) if len(lines) != 2: num_fragments += len(lines) continue num_pairs += 1 a, b = lines asubject, astart, astop = a.seqid, a.start, a.end bsubject, bstart, bstop = b.seqid, b.start, b.end aquery, bquery = a.accn, b.accn astrand, bstrand = a.strand, b.strand dist, orientation = range_distance(\ (asubject, astart, astop, astrand), (bsubject, bstart, bstop, bstrand), distmode=distmode) if dist >= 0: all_dist.append((dist, orientation, aquery, bquery)) # select only pairs with certain orientations - e.g. innies, outies, etc. if mateorientation: all_dist = [x for x in all_dist if x[1] == mateorientation] # try to infer cutoff as twice the median until convergence if cutoff <= 0: dists = np.array([x[0] for x in all_dist], dtype="int") p0 = analyze_dists(dists, cutoff=mpcutoff) cutoff = int(2 * p0) # initial estimate cutoff = int(math.ceil(cutoff / bins)) * bins logging.debug("Insert size cutoff set to {0}, ".format(cutoff) + "use '--cutoff' to override") for dist, orientation, aquery, bquery in all_dist: if dist > cutoff: continue if cutoff > 2 * mpcutoff and dist < mpcutoff: continue linked_dist.append(dist) if pairsfile: print >> pairsfw, "{0}\t{1}\t{2}".format(aquery, bquery, dist) orientations[orientation] += 1 print >>sys.stderr, "{0} fragments, {1} pairs ({2} total)".\ format(num_fragments, num_pairs, num_fragments + num_pairs * 2) s = SummaryStats(linked_dist, dtype="int") num_links = s.size meandist, stdev = s.mean, s.sd p0, p1, p2 = s.median, s.p1, s.p2 print >>sys.stderr, "%d pairs (%.1f%%) are linked (cutoff=%d)" % \ (num_links, num_links * 100. / num_pairs, cutoff) print >>sys.stderr, "mean distance between mates: {0} +/- {1}".\ format(meandist, stdev) print >> sys.stderr, "median distance between mates: {0}".format(p0) print >> sys.stderr, "95% distance range: {0} - {1}".format(p1, p2) print >> sys.stderr, "\nOrientations:" orientation_summary = [] for orientation, count in sorted(orientations.items()): o = "{0}:{1}".format(orientation, \ percentage(count, num_links, mode=1)) orientation_summary.append(o.split()[0]) print >> sys.stderr, o if insertsfile: from jcvi.graphics.histogram import histogram print >> insertsfw, "\n".join(str(x) for x in linked_dist) insertsfw.close() prefix = insertsfile.rsplit(".", 1)[0] if prefix > 10: prefix = prefix.split("-")[0] osummary = " ".join(orientation_summary) title = "{0} ({1}; median:{2} bp)".format(prefix, osummary, p0) histogram(insertsfile, vmin=0, vmax=cutoff, bins=bins, xlabel="Insertsize", title=title, ascii=ascii) if op.exists(insertsfile): os.remove(insertsfile) return s
def report_pairs(data, cutoff=0, mateorientation=None, pairsfile=None, insertsfile=None, rclip=1, ascii=False, bins=20, distmode="ss"): """ This subroutine is used by the pairs function in blast.py and cas.py. Reports number of fragments and pairs as well as linked pairs """ from jcvi.utils.cbook import percentage allowed_mateorientations = ("++", "--", "+-", "-+") if mateorientation: assert mateorientation in allowed_mateorientations num_fragments, num_pairs = 0, 0 all_dist = [] linked_dist = [] # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie` orientations = defaultdict(int) # clip how many chars from end of the read name to get pair name key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn) data.sort(key=key) if pairsfile: pairsfw = open(pairsfile, "w") if insertsfile: insertsfw = open(insertsfile, "w") for pe, lines in groupby(data, key=key): lines = list(lines) if len(lines) != 2: num_fragments += len(lines) continue num_pairs += 1 a, b = lines asubject, astart, astop = a.seqid, a.start, a.end bsubject, bstart, bstop = b.seqid, b.start, b.end aquery, bquery = a.accn, b.accn astrand, bstrand = a.strand, b.strand dist, orientation = range_distance(\ (asubject, astart, astop, astrand), (bsubject, bstart, bstop, bstrand), distmode=distmode) if dist >= 0: all_dist.append((dist, orientation, aquery, bquery)) # select only pairs with certain orientations - e.g. innies, outies, etc. if mateorientation: all_dist = [x for x in all_dist if x[1] == mateorientation] # try to infer cutoff as twice the median until convergence if cutoff <= 0: dists = np.array([x[0] for x in all_dist], dtype="int") p0 = np.median(dists) cutoff = int(2 * p0) # initial estimate cutoff = int(math.ceil(cutoff / bins)) * bins logging.debug("Insert size cutoff set to {0}, ".format(cutoff) + "use '--cutoff' to override") for dist, orientation, aquery, bquery in all_dist: if dist > cutoff: continue linked_dist.append(dist) if pairsfile: print >> pairsfw, "{0}\t{1}\t{2}".format(aquery, bquery, dist) orientations[orientation] += 1 print >>sys.stderr, "%d fragments, %d pairs" % (num_fragments, num_pairs) num_links = len(linked_dist) linked_dist = np.array(linked_dist, dtype="int") linked_dist = np.sort(linked_dist) meandist = np.mean(linked_dist) stdev = np.std(linked_dist) p0 = np.median(linked_dist) p1 = linked_dist[int(num_links * .025)] p2 = linked_dist[int(num_links * .975)] meandist, stdev = int(meandist), int(stdev) p0 = int(p0) print >>sys.stderr, "%d pairs (%.1f%%) are linked (cutoff=%d)" % \ (num_links, num_links * 100. / num_pairs, cutoff) print >>sys.stderr, "mean distance between mates: {0} +/- {1}".\ format(meandist, stdev) print >>sys.stderr, "median distance between mates: {0}".format(p0) print >>sys.stderr, "95% distance range: {0} - {1}".format(p1, p2) print >>sys.stderr, "\nOrientations:" orientation_summary = [] for orientation, count in sorted(orientations.items()): o = "{0}:{1}".format(orientation, \ percentage(count, num_links, denominator=False)) orientation_summary.append(o.split()[0]) print >>sys.stderr, o if insertsfile: from jcvi.graphics.histogram import histogram print >>insertsfw, "\n".join(str(x) for x in linked_dist) insertsfw.close() prefix = insertsfile.rsplit(".", 1)[0] osummary = " ".join(orientation_summary) title="{0} ({1}; median dist:{2})".format(prefix, osummary, p0) histogram(insertsfile, vmin=0, vmax=cutoff, bins=bins, xlabel="Insertsize", title=title, ascii=ascii) if op.exists(insertsfile): os.remove(insertsfile) return meandist, stdev, p0, p1, p2