def scenario(args):
    """
    %prog scenario

    Illustration of the two-step genome merger process for B. rapa companion paper.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # The docstring doubles as the usage text, so pass this function's own
    # docstring (a bare `__doc__` here would resolve to the module docstring).
    p = OptionParser(scenario.__doc__)
    # Parse the tokens we were handed; parse_args() with no argument would
    # silently fall back to sys.argv and ignore `args`.
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (5, 5))
    root = fig.add_axes([0, 0, 1, 1])

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Layout format: (x, y, label, (chr lengths))
    anc = (0.5, 0.9, "Ancestor", (1,))
    s1 = (0.2, 0.6, "Genome I", (1,))
    s2 = (0.5, 0.6, "Genome II", (1,))
    s3 = (0.8, 0.6, "Genome III", (1,))
    tetra = (0.35, 0.4, "Tetraploid I / II", (0.5, 0.9))
    hexa = (0.5, 0.1, "Hexaploid I / II / III", (0.36, 0.46, 0.9))
    labels = (anc, s1, s2, s3, tetra, hexa)
    connections = (
        (anc, s1),
        (anc, s2),
        (anc, s3),
        (s1, tetra),
        (s2, tetra),
        (tetra, hexa),
        (s3, hexa),
    )

    xinterval = 0.02  # horizontal gap between neighbouring chromosomes
    yratio = 0.05  # half-height drawn per unit of chromosome length
    for xx, yy, label, chrl in labels:
        root.text(xx, yy, label, ha="center", va="center")
        # Shift the chromosomes left by an amount proportional to the label
        # length so that they do not sit on top of the text.
        offset = len(label) * 0.012
        for i, c in enumerate(chrl):
            ya = yy + yratio * c
            yb = yy - yratio * c
            Chromosome(root, xx - offset + i * xinterval, ya, yb, width=0.01)

    # Comments
    comments = ((0.15, 0.33, "II dominant"), (0.25, 0.03, "III dominant"))
    for xx, yy, c in comments:
        root.text(xx, yy, c, size=9, ha="center", va="center")

    # Branches: only the (x, y) anchor of each node is needed.
    tip = 0.04
    for parent, child in connections:
        xa, ya = parent[:2]
        xb, yb = child[:2]
        plt.plot((xa, xb), (ya - tip, yb + 2 * tip), "k-", lw=2, alpha=0.5)

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def plot_data(x, y, tour, M):
    """Draw the points (x, y) and the edges of `tour`, then save `demo.pdf`.

    `x` and `y` are indexed by the entries of `tour`. `M` is only forwarded
    to `evaluate` (defined elsewhere), whose result is shown in the title.
    """
    from jcvi.graphics.base import plt, savefig

    # All points as red dots
    plt.plot(x, y, "ro")

    # One red segment per pair produced by `pairwise` (defined elsewhere;
    # presumably consecutive stops of the tour - confirm).
    for start, end in pairwise(tour):
        xs = (x[start], x[end])
        ys = (y[start], y[end])
        plt.plot(xs, ys, "r-")

    plt.title("Score={0:.2f}".format(evaluate(tour, M)))
    savefig("demo.pdf")
def scenario(args):
    """
    %prog scenario

    Illustration of the two-step genome merger process for B. rapa companion paper.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # Use this function's docstring as usage text (a bare `__doc__` would be
    # the module docstring) and parse the tokens passed in instead of letting
    # parse_args() fall back to sys.argv.
    p = OptionParser(scenario.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (5, 5))
    root = fig.add_axes([0, 0, 1, 1])

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Layout format: (x, y, label, (chr lengths))
    anc = (.5, .9, "Ancestor", (1,))
    s1 = (.2, .6, "Genome I", (1,))
    s2 = (.5, .6, "Genome II", (1,))
    s3 = (.8, .6, "Genome III", (1,))
    tetra = (.35, .4, "Tetraploid I / II", (.5, .9))
    hexa = (.5, .1, "Hexaploid I / II / III", (.36, .46, .9))
    labels = (anc, s1, s2, s3, tetra, hexa)
    connections = (
        (anc, s1), (anc, s2), (anc, s3),
        (s1, tetra), (s2, tetra),
        (tetra, hexa), (s3, hexa),
    )

    xinterval = .02  # horizontal gap between neighbouring chromosomes
    yratio = .05  # half-height drawn per unit of chromosome length
    for xx, yy, label, chrl in labels:
        root.text(xx, yy, label, ha="center", va="center")
        # Shift chromosomes left in proportion to the label length so they
        # do not overlap the text.
        offset = len(label) * .012
        for i, c in enumerate(chrl):
            ya = yy + yratio * c
            yb = yy - yratio * c
            Chromosome(root, xx - offset + i * xinterval, ya, yb, width=.01)

    # Comments
    comments = ((.15, .33, "II dominant"), (.25, .03, "III dominant"))
    for xx, yy, c in comments:
        root.text(xx, yy, c, size=9, ha="center", va="center")

    # Branches: only the (x, y) anchor of each node is needed.
    tip = .04
    for parent, child in connections:
        xa, ya = parent[:2]
        xb, yb = child[:2]
        plt.plot((xa, xb), (ya - tip, yb + 2 * tip), 'k-', lw=2, alpha=.5)

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def excision(args):
    """
    %prog excision

    Illustrate the mechanism of illegitimate recombination.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # Pass this function's docstring as the usage text; a bare `__doc__`
    # inside a function refers to the module docstring.
    p = OptionParser(excision.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (5, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # Shared look of the blue segment / arrow-head marker
    gene = dict(mfc='g', mec='w', ms=12, lw=3)

    # Top row: red line with the blue marked segment on it
    plt.plot((.2, .8), (.6, .6), 'r-', lw=3)
    plt.plot((.4, .6), (.6, .6), 'b>-', **gene)
    # Second row: shorter red line with a single marker
    plt.plot((.3, .7), (.5, .5), 'r-', lw=3)
    plt.plot((.5, ), (.5, ), 'b>-', **gene)

    # Circle excision
    plt.plot((.5, ), (.45, ), 'b>-', **gene)
    circle = CirclePolygon((.5, .4), .05, fill=False, lw=3, ec="b")
    root.add_patch(circle)

    # Arrow between the two rows. `arrowprops` is not defined in this
    # function; it has to be provided by the enclosing module.
    arrow_dist = .07
    ar_xpos, ar_ypos = .5, .52
    root.annotate(" ", (ar_xpos, ar_ypos),
                  (ar_xpos, ar_ypos + arrow_dist),
                  arrowprops=arrowprops)

    RoundLabel(root, .2, .64, "Gene")
    RoundLabel(root, .3, .54, "Excision")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def excision(args):
    """
    %prog excision

    Illustrate the mechanism of illegitimate recombination.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # Pass this function's docstring as the usage text; a bare `__doc__`
    # inside a function refers to the module docstring.
    p = OptionParser(excision.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (5, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # Shared look of the blue segment / arrow-head marker
    gene = dict(mfc="g", mec="w", ms=12, lw=3)

    # Top row: red line with the blue marked segment on it
    plt.plot((0.2, 0.8), (0.6, 0.6), "r-", lw=3)
    plt.plot((0.4, 0.6), (0.6, 0.6), "b>-", **gene)
    # Second row: shorter red line with a single marker
    plt.plot((0.3, 0.7), (0.5, 0.5), "r-", lw=3)
    plt.plot((0.5,), (0.5,), "b>-", **gene)

    # Circle excision
    plt.plot((0.5,), (0.45,), "b>-", **gene)
    circle = CirclePolygon((0.5, 0.4), 0.05, fill=False, lw=3, ec="b")
    root.add_patch(circle)

    # Arrow between the two rows. `arrowprops` is not defined in this
    # function; it has to be provided by the enclosing module.
    arrow_dist = 0.07
    ar_xpos, ar_ypos = 0.5, 0.52
    root.annotate(
        " ",
        (ar_xpos, ar_ypos),
        (ar_xpos, ar_ypos + arrow_dist),
        arrowprops=arrowprops,
    )

    RoundLabel(root, 0.2, 0.64, "Gene")
    RoundLabel(root, 0.3, 0.54, "Excision")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. `species` is only
    used to annotate the graphic; K is the K-mer size handed to the analysis.
    """
    # args: command-line tokens (options plus the three positionals above).
    # Returns the estimated genome size as an int, computed as
    # total K-mers / K-mer coverage.
    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help="'nbinom' - slow but more accurate for het or polyploid genome; "
        "'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option(
        "--coverage", default=0, type="int", help="Kmer coverage [default: auto]"
    )
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii_only = not opts.pdf
    # Peak annotation relies on min1/max1/... which this function only uses
    # for the 'allpaths' method.
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    # Inputs ending in .mcdat/.mcidx are first converted by merylhistogram()
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the fitted value
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(kf_range, stacked, ":", color="#6a3d9a", lw=2)

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        turning_points = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marks = [(cov, cnt) for cov, cnt in ks.counts if cov in turning_points]
        if marks:
            px, py = zip(*marks)
            plt.plot(px, py, "ko", lw=3, mec="k", mfc="w")
            heights = dict(marks)
            # A non-empty `marks` only guarantees that *some* turning point
            # was found, so check each peak before looking up its height.
            for peak, label in ((ks.max1, "SNP peak"), (ks.max2, "Main peak")):
                if peak in heights:
                    ax.text(peak, heights[peak], label)

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # head-room above the tallest bar
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            cn_x = i * ks.lambda_
            plt.plot((cn_x, cn_x), (0, ymax), "-.", color=cn_color)
            plt.text(
                cn_x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile + image format
    imagename = histfile.split(".")[0] + "." + iopts.format
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # args: command-line tokens (options plus the three positionals above).
    # Returns the estimated genome size as an int, computed as
    # total K-mers / K-mer coverage.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii_only = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # print() call instead of the Python 2 `print >>` statement, which only
    # raises TypeError once print is a function.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        turning_points = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marks = [(cov, cnt) for cov, cnt in ks.counts if cov in turning_points]
        # zip(*[]) cannot be unpacked into two names, so skip the markers
        # altogether when no turning point was found in the counts.
        if marks:
            px, py = zip(*marks)
            plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
            heights = dict(marks)
            if ks.max1 in heights:
                ax.text(ks.max1, heights[ks.max1], "SNP peak", va="top")
            if ks.max2 in heights:
                ax.text(ks.max2, heights[ks.max2], "Main peak")

    # Summary lines, right-aligned in axes coordinates from top to bottom
    tc = "gray"
    axt = ax.transAxes
    summary = (
        (.95, Total_Kmers_msg),
        (.9, Kmer_coverage_msg),
        (.85, Genome_size_msg),
        (.8, Repetitive_msg),
        (.75, SNPrate_msg),
    )
    for ypos, msg in summary:
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # head-room above the curve

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile, plus ".pdf"
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # args: command-line tokens (options plus the three positionals above).
    # Returns the estimated genome size as an int, computed as
    # total K-mers / K-mer coverage.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii_only = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # Inputs ending in .mcdat/.mcidx are first converted through meryl()
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # print() call instead of the Python 2 `print >>` statement, which only
    # raises TypeError once print is a function.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        turning_points = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marks = [(cov, cnt) for cov, cnt in ks.counts if cov in turning_points]
        if marks:
            px, py = zip(*marks)
            plt.plot(px, py, "ko", lw=2, mec="k", mfc="w")
            heights = dict(marks)
            # A non-empty `marks` does not guarantee both peaks are present,
            # so check before looking up each height.
            if ks.max1 in heights:
                ax.text(ks.max1, heights[ks.max1], "SNP peak", va="top")
            if ks.max2 in heights:
                ax.text(ks.max2, heights[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # head-room above the curve

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile, plus ".pdf"
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # args: command-line tokens (options plus the three positionals above).
    # Returns the estimated genome size as an int, computed as
    # total K-mers / K-mer coverage.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii_only = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # `with` guarantees the handle is closed even if a row fails to parse
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per row; the multiplicity is the 1-based row number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # print() call instead of the Python 2 `print >>` statement, which only
    # raises TypeError once print is a function.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    # Restrict the plotted range to [vmin, vmax]
    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    turning_points = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    marks = [(cov, cnt) for cov, cnt in counts if cov in turning_points]
    # `counts` is windowed by --vmin/--vmax, so turning points may be missing:
    # only draw / label the ones that are actually there.
    if marks:
        px, py = zip(*marks)
        plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
    heights = dict(marks)

    if peaks:
        if ks.max1 in heights:
            ax.text(ks.max1, heights[ks.max1], "SNP peak", va="top")
        if ks.max2 in heights:
            ax.text(ks.max2, heights[ks.max2], "Main peak")

    # Summary lines, right-aligned in axes coordinates from top to bottom
    tc = "gray"
    axt = ax.transAxes
    summary = (
        (.95, Total_Kmers_msg),
        (.9, Kmer_coverage_msg),
        (.85, Genome_size_msg),
        (.8, Repetitive_msg),
        (.75, SNPrate_msg),
    )
    for ypos, msg in summary:
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # head-room above the curve

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile, plus ".pdf"
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    # Exactly three positionals are required; otherwise show help and exit
    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    text_mode = not opts.pdf
    mark_peaks = not opts.nopeaks
    N = int(N)

    # Inputs ending in .mcdat/.mcidx go through merylhistogram() first
    suffix = histfile.rsplit(".", 1)[-1]
    if suffix in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    total_kmers = int(ks.totalKmers)
    # --coverage, when non-zero, takes precedence over the detected peak
    kmer_coverage = opts.coverage or ks.max2
    genome_size = int(round(total_kmers * 1.0 / kmer_coverage))

    headline = [
        "Total {0}-mers: {1}".format(N, thousands(total_kmers)),
        "{0}-mer coverage: {1}".format(N, kmer_coverage),
        "Estimated genome size: {0:.1f}Mb".format(genome_size / 1e6),
    ]
    repetitive_line = ks.repetitive
    snprate_line = ks.snprate

    for line in headline:
        print(line, file=sys.stderr)

    xs, ys = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    # Text mode stops here; the figure is only produced with --pdf
    if text_mode:
        asciiplot(xs, ys, title=title)
        return genome_size

    plt.figure(1, (6, 6))
    plt.plot(xs, ys, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if mark_peaks:
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        hits = [(cov, cnt) for cov, cnt in ks.counts if cov in landmarks]
        if hits:
            hit_x, hit_y = zip(*hits)
            height_at = dict(hits)
            plt.plot(hit_x, hit_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, height_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, height_at[ks.max2], "Main peak")

    write_messages(ax, headline + [repetitive_line, snprate_line])

    # Stretch the upper y-limit to leave room above the curve
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    # Output name: histfile up to its first ".", plus ".pdf"
    stem = histfile.split(".")[0]
    savefig(stem + ".pdf", dpi=100)

    return genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # args: command-line tokens (options plus the three positionals above).
    # In ASCII mode the result of asciiplot() is returned; in PDF mode the
    # figure is saved and nothing is returned.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii_only = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # `with` guarantees the handle is closed even if a row fails to parse
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per row; the multiplicity is the 1-based row number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    # Expressed in Mb here (already divided by 1e6)
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # print() call instead of the Python 2 `print >>` statement, which only
    # raises TypeError once print is a function.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    # Restrict the plotted range to [vmin, vmax]
    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_only:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    turning_points = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    marks = [(cov, cnt) for cov, cnt in counts if cov in turning_points]
    # `counts` is windowed by --vmin/--vmax, so turning points may be missing:
    # only draw / label the ones that are actually there.
    if marks:
        px, py = zip(*marks)
        plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
    heights = dict(marks)

    if peaks:
        if ks.max1 in heights:
            ax.text(ks.max1, heights[ks.max1], "SNP peak", va="top")
        if ks.max2 in heights:
            ax.text(ks.max2, heights[ks.max2], "Main peak")

    # Summary lines, right-aligned in axes coordinates from top to bottom
    tc = "gray"
    axt = ax.transAxes
    summary = (
        (.95, Total_Kmers_msg),
        (.9, Kmer_coverage_msg),
        (.85, Genome_size_msg),
        (.8, Repetitive_msg),
        (.75, SNPrate_msg),
    )
    for ypos, msg in summary:
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # head-room above the curve

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile, plus ".pdf"
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
def bites(args):
    """
    %prog bites

    Illustrate the pipeline for automated bite discovery.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # Use this function's docstring as usage text (a bare `__doc__` would be
    # the module docstring) and parse the tokens passed in instead of letting
    # parse_args() fall back to sys.argv.
    p = OptionParser(bites.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (6, 6))
    root = fig.add_axes([0, 0, 1, 1])

    # HSP pairs
    hsps = (((50, 150), (60, 180)),
            ((190, 250), (160, 235)),
            ((300, 360), (270, 330)),
            ((430, 470), (450, 490)),
            ((570, 620), (493, 543)),
            ((540, 555), (370, 385)),  # non-collinear hsps
           )
    # Everything except the trailing non-collinear pair
    collinear = hsps[:-1]

    titlepos = (.9, .65, .4)
    titles = ("Compare orthologous region",
              "Find collinear HSPs",
              "Scan paired gaps")
    ytip = .01
    mrange = 650.

    def to_axes(pos):
        # Map a coordinate in [0, mrange] onto the [0.1, 0.8] span of the axes
        return pos / mrange * .7 + .1

    for panel, (ya, title) in enumerate(zip(titlepos, titles)):
        yb = ya - .1
        plt.plot((.1, .8), (ya, ya), "-", color="gray", lw=2, zorder=1)
        plt.plot((.1, .8), (yb, yb), "-", color="gray", lw=2, zorder=1)
        RoundLabel(root, .5, ya + 4 * ytip, title)
        root.text(.9, ya, "A. thaliana", ha="center", va="center")
        root.text(.9, yb, "B. rapa", ha="center", va="center")

        # Only the first panel still shows the non-collinear HSP
        myhsps = hsps if panel == 0 else collinear
        for (a, b), (c, d) in myhsps:
            a, b, c, d = [to_axes(v) for v in (a, b, c, d)]
            r1 = Rectangle((a, ya - ytip), b - a, 2 * ytip, fc='r', lw=0, zorder=2)
            r2 = Rectangle((c, yb - ytip), d - c, 2 * ytip, fc='r', lw=0, zorder=2)
            r3 = Rectangle((a, ya - ytip), b - a, 2 * ytip, fill=False, zorder=3)
            r4 = Rectangle((c, yb - ytip), d - c, 2 * ytip, fill=False, zorder=3)
            r5 = Polygon(((a, ya - ytip), (c, yb + ytip),
                          (d, yb + ytip), (b, ya - ytip)),
                          fc='r', alpha=.2)
            rr = (r1, r2, r3, r4, r5)
            # The last panel omits the shaded connector polygon
            if panel == 2:
                rr = rr[:-1]
            for r in rr:
                root.add_patch(r)

    # Gap pairs: the interval between each two neighbouring collinear HSPs,
    # computed separately on each of the two tracks.
    hspa, hspb = zip(*collinear)
    gapa = [(b + 1, c - 1) for (a, b), (c, d) in pairwise(hspa)]
    gapb = [(b + 1, c - 1) for (a, b), (c, d) in pairwise(hspb)]
    gaps = zip(gapa, gapb)
    tpos = titlepos[-1]

    yy = tpos - .05
    for gapno, ((a, b), (c, d)) in enumerate(gaps, 1):
        a, b, c, d = [to_axes(v) for v in (a, b, c, d)]
        xx = (a + b + c + d) / 4
        TextCircle(root, xx, yy, str(gapno))

    # Bites
    ystart = .24
    ystep = .05
    bites = (("Bite(40=>-15)", True),
             ("Bite(50=>35)", False),
             ("Bite(70=>120)", False),
             ("Bite(100=>3)", True))
    for idx, (bite, selected) in enumerate(bites):
        xx = .15 if (idx % 2 == 0) else .55
        # Two bites per row: floor division keeps the row index an integer
        # (true division would stagger the entries by half a row).
        yy = ystart - (idx // 2) * ystep
        TextCircle(root, xx, yy, str(idx + 1))
        color = "k" if selected else "gray"
        root.text(xx + ystep, yy, bite, size=10, color=color, va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # args: command-line tokens (options plus the three positionals above).
    # In ASCII mode the result of asciiplot() is returned; in PDF mode the
    # figure is saved and nothing is returned.
    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii_only = not opts.pdf
    hist = {}
    totalKmers = 0

    # `with` guarantees the handle is closed even if a row fails to parse
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram: a row with a single field
        # marks the one-count-per-row ("soap") layout.
        soap = False
        for row in fp:
            if len(row.split()) == 1:
                soap = True
                break
        fp.seek(0)

        for rowno, row in enumerate(fp):
            if soap:
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = counts

    # The scan below needs at least one pair of neighbouring entries;
    # without it `Ka` would never be bound.
    if len(hist) < 2:
        sys.exit("Need at least two histogram entries in `{0}`".format(histfile))

    # Walk the sorted histogram and stop at the first drop -> rise -> drop
    # transition; `Ka` at that point is used as the K-mer coverage.
    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        status = "rise" if ca <= cb else "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    # Expressed in Mb (already divided by 1e6)
    Genome_size = Total_Kmers * 1. / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    # print() call instead of the Python 2 `print >>` statement, which only
    # raises TypeError once print is a function.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    # Only multiplicities up to 100 are plotted
    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_only:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    # `_` is a text helper defined elsewhere in the module
    ax = plt.gca()
    ax.text(.5, .9, _(Total_Kmers_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .8, _(Kmer_coverage_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .7, _(Genome_size_msg),
            ha="center", color='b', transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    # Image name: everything before the first "." of histfile, plus ".pdf"
    imagename = histfile.split(".")[0] + ".pdf"
    plt.savefig(imagename, dpi=100)
    print("Image saved to `{0}`.".format(imagename), file=sys.stderr)
def bites(args):
    """
    %prog bites

    Illustrate the pipeline for automated bite discovery.
    """
    # args: command-line tokens for this action. Nothing is returned; the
    # figure is written to `fname() + ".pdf"`.
    #
    # Use this function's docstring as usage text (a bare `__doc__` would be
    # the module docstring) and parse the tokens passed in instead of letting
    # parse_args() fall back to sys.argv.
    p = OptionParser(bites.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (6, 6))
    root = fig.add_axes([0, 0, 1, 1])

    # HSP pairs
    hsps = (
        ((50, 150), (60, 180)),
        ((190, 250), (160, 235)),
        ((300, 360), (270, 330)),
        ((430, 470), (450, 490)),
        ((570, 620), (493, 543)),
        ((540, 555), (370, 385)),  # non-collinear hsps
    )
    # Everything except the trailing non-collinear pair
    collinear = hsps[:-1]

    titlepos = (0.9, 0.65, 0.4)
    titles = ("Compare orthologous region", "Find collinear HSPs", "Scan paired gaps")
    ytip = 0.01
    mrange = 650.0

    def to_axes(pos):
        # Map a coordinate in [0, mrange] onto the [0.1, 0.8] span of the axes
        return pos / mrange * 0.7 + 0.1

    for panel, (ya, title) in enumerate(zip(titlepos, titles)):
        yb = ya - 0.1
        plt.plot((0.1, 0.8), (ya, ya), "-", color="gray", lw=2, zorder=1)
        plt.plot((0.1, 0.8), (yb, yb), "-", color="gray", lw=2, zorder=1)
        RoundLabel(root, 0.5, ya + 4 * ytip, title)
        root.text(0.9, ya, "A. thaliana", ha="center", va="center")
        root.text(0.9, yb, "B. rapa", ha="center", va="center")

        # Only the first panel still shows the non-collinear HSP
        myhsps = hsps if panel == 0 else collinear
        for (a, b), (c, d) in myhsps:
            a, b, c, d = [to_axes(v) for v in (a, b, c, d)]
            r1 = Rectangle((a, ya - ytip), b - a, 2 * ytip, fc="r", lw=0, zorder=2)
            r2 = Rectangle((c, yb - ytip), d - c, 2 * ytip, fc="r", lw=0, zorder=2)
            r3 = Rectangle((a, ya - ytip), b - a, 2 * ytip, fill=False, zorder=3)
            r4 = Rectangle((c, yb - ytip), d - c, 2 * ytip, fill=False, zorder=3)
            r5 = Polygon(
                ((a, ya - ytip), (c, yb + ytip), (d, yb + ytip), (b, ya - ytip)),
                fc="r",
                alpha=0.2,
            )
            rr = (r1, r2, r3, r4, r5)
            # The last panel omits the shaded connector polygon
            if panel == 2:
                rr = rr[:-1]
            for r in rr:
                root.add_patch(r)

    # Gap pairs: the interval between each two neighbouring collinear HSPs,
    # computed separately on each of the two tracks.
    hspa, hspb = zip(*collinear)
    gapa = [(b + 1, c - 1) for (a, b), (c, d) in pairwise(hspa)]
    gapb = [(b + 1, c - 1) for (a, b), (c, d) in pairwise(hspb)]
    gaps = zip(gapa, gapb)
    tpos = titlepos[-1]

    yy = tpos - 0.05
    for gapno, ((a, b), (c, d)) in enumerate(gaps, 1):
        a, b, c, d = [to_axes(v) for v in (a, b, c, d)]
        xx = (a + b + c + d) / 4
        TextCircle(root, xx, yy, str(gapno))

    # Bites
    ystart = 0.24
    ystep = 0.05
    bites = (
        ("Bite(40=>-15)", True),
        ("Bite(50=>35)", False),
        ("Bite(70=>120)", False),
        ("Bite(100=>3)", True),
    )
    for idx, (bite, selected) in enumerate(bites):
        xx = 0.15 if (idx % 2 == 0) else 0.55
        # Two bites per row: floor division keeps the row index an integer
        # (true division would stagger the entries by half a row).
        yy = ystart - (idx // 2) * ystep
        TextCircle(root, xx, yy, str(idx + 1))
        color = "k" if selected else "gray"
        root.text(xx + ystep, yy, bite, size=10, color=color, va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)