def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: command-line tokens -- histogram file, species name, K-mer size.
    # Returns the estimated genome size (total K-mers / K-mer coverage) as an
    # int.  Leaves through sys.exit() after printing the help text when the
    # number of positional arguments is not exactly three.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    use_ascii = not opts.pdf  # renamed: `ascii` would shadow the builtin
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # write() behaves the same under Python 2 and Python 3
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        # Unpacking zip(*[]) raises ValueError, so only annotate when at
        # least one landmark coverage is present in the spectrum.
        if tcounts:
            px, py = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
            # Each peak is labelled only when its coverage has a count
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked from the top-right corner of the axes
    tc = "gray"
    axt = ax.transAxes
    summary = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg,
               Repetitive_msg, SNPrate_msg)
    for ypos, msg in zip((.95, .9, .85, .8, .75), summary):
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    # Leave head room above the curve for the annotations
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                           qbreaks, sbreaks, sep=True, chrlw=.1,
                           sepcolor="g", minfont=5, stdpf=True):
    """
    Draw sequence separators, sequence labels and the outer frame on a
    dot plot.

    `qbreaks` and `sbreaks` yield (seqid, beg, end) triples for the x and the
    y axis; `gx` / `gy` become the axis labels.  Separator lines are drawn
    only when `sep` is true, and a label is written only when the font size
    selected for it is at least `minfont`.

    Returns the (xlim, ylim) pair applied to `ax`; ylim is inverted.
    """
    xlim = (0, xsize)
    ylim = (ysize, 0)  # the y-axis runs top-down
    handler = TextHandler(fig)
    line_kw = dict(lw=chrlw, color=sepcolor)

    def short_name(seqid):
        # Join the first two components returned by seqid_parse()
        return "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

    # Collect (label, midpoint, fontsize) and draw the vertical separators
    x_labels = []
    for seqid, beg, end in qbreaks:
        size = handler.select_fontsize(abs(end - beg) * .8 / xsize)
        x_labels.append((short_name(seqid), (beg + end) / 2, size))
        if sep:
            ax.plot([beg, beg], ylim, "-", **line_kw)

    # Same for the subject axis, with horizontal separators
    y_labels = []
    for seqid, beg, end in sbreaks:
        size = handler.select_fontsize(abs(end - beg) * .8 / ysize)
        y_labels.append((short_name(seqid), (beg + end) / 2, size))
        if sep:
            ax.plot(xlim, [beg, beg], "-", **line_kw)

    # Labels are placed in `root` coordinates; the plot spans .1 to .9
    for text, mid, size in x_labels:
        where = .1 + mid * .8 / xsize
        if size >= minfont:
            root.text(where, .91, latex(text), size=size,
                      ha="center", va="bottom", rotation=45, color="grey")

    # The y positions are flipped because the y-axis is inverted
    for text, mid, size in y_labels:
        where = .9 - mid * .8 / ysize
        if size >= minfont:
            root.text(.91, where, latex(text), size=size,
                      va="center", color="grey")

    # Outer frame: top, bottom, left, right
    frame = (
        (xlim, [0, 0]),
        (xlim, [ysize, ysize]),
        ([0, 0], ylim),
        ([xsize, xsize], ylim),
    )
    for xs, ys in frame:
        ax.plot(xs, ys, "-", **line_kw)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # Hide the tick marks and grey out the numeric tick labels
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)
    set_human_axis(ax)
    ticklabels = ax.get_xticklabels() + ax.get_yticklabels()
    plt.setp(ticklabels, color='gray', size=10)

    return xlim, ylim
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, genomenames=None,
            sample_number=10000, ignore=.005, palette=None, chrlw=.01,
            title=None):
    """
    Draw the anchor pairs listed in `anchorfile` as a dot plot on `ax`.

    Each non-comment row names a query and a subject in its first two
    columns; the last column is read as a value, clamped to [vmin, vmax]
    (`vmax` is used when it is not numeric).  Rows whose names are missing
    from `qbed.order` / `sbed.order` are skipped.  Rows starting with "#"
    open a new block; when `palette` is given, every point of a block is
    drawn in `palette.get(block_id, "k")`.

    Args:
        anchorfile: path of the anchor file.
        qbed, sbed: bed objects providing `order`, `get_breaks()`,
            `filename` and `len()`.
        fig, root, ax: figure, canvas axes (labels, legend) and data axes.
        vmin, vmax: clamp range for the values, also the colour range.
        is_self: mirror every point and draw the diagonal.
        synteny: box the clusters found by batch_scan().
        cmap_text: when set, draw a colour bar with this caption.
        genomenames: "x_y" axis names; defaults to the bed file basenames.
        sample_number: plot at most this many points (random subset).
        ignore: sequences shorter than this fraction of the axis get no
            label.
        palette: optional block-id -> colour mapping exposing `.colors`.
        chrlw: line width of the sequence separators.
        title: optional figure title.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    block_id = 0
    # The colour must survive from a "#" header to the data rows after it;
    # resetting it on every row meant the palette was never applied.
    block_color = None
    with open(anchorfile) as fp:  # closed even when parsing fails
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            try:
                value = float(value)
            except ValueError:
                value = vmax

            value = min(max(value, vmin), vmax)

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    # only show a random subset of at most `sample_number` points
    if len(data) > sample_number:
        logging.debug("Showing a random subset of %s data points (total %s) " \
                      "for clarity." % (sample_number, len(data)))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    if data:  # zip(*[]) cannot be unpacked into three names
        x, y, c = zip(*data)

        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0,
                       cmap=default_cm, vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Sequences shorter than these thresholds get no label
    ignore_size_x = ignore_size_y = 0
    if ignore:
        ignore_size_x = xsize * ignore
        ignore_size_y = ysize * ignore

    def short_label(seqid):
        # Text after the last "_"; an all-numeric name becomes "c<number>"
        seqid = seqid.split("_")[-1]
        try:
            return "c%d" % int(seqid)
        except ValueError:
            return seqid

    # plot the chromosome breaks; each entry is (label, midpoint, skip)
    xchr_labels, ychr_labels = [], []
    for (seqid, beg, end) in qbed.get_breaks():
        too_small = abs(end - beg) < ignore_size_x
        xchr_labels.append((short_label(seqid), (beg + end) / 2, too_small))
        ax.plot([beg, beg], ylim, "g-", lw=chrlw)

    for (seqid, beg, end) in sbed.get_breaks():
        too_small = abs(end - beg) < ignore_size_y
        ychr_labels.append((short_label(seqid), (beg + end) / 2, too_small))
        ax.plot(xlim, [beg, beg], "g-", lw=chrlw)

    # plot the chromosome labels (in `root` coordinates)
    for label, pos, too_small in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if not too_small:
            root.text(pos, .91, label,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, too_small in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if not too_small:
            root.text(.91, pos, label,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title:
        fig.suptitle(title, x=.05, y=.98, color="k")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, genomenames=None,
            sample_number=10000, minfont=5, palette=None, chrlw=.01,
            title=None, sepcolor="gainsboro"):
    """
    Draw the anchor pairs listed in `anchorfile` as a dot plot on `ax`.

    Each non-comment row names a query and a subject in its first two
    columns; the last column is read as a value, clamped to [vmin, vmax]
    (`vmax` is used when it is not numeric).  Rows whose names are missing
    from `qbed.order` / `sbed.order` are skipped.  Rows starting with "#"
    open a new block; when `palette` is given, every point of a block is
    drawn in `palette.get(block_id, "k")`.

    Args:
        anchorfile: path of the anchor file.
        qbed, sbed: bed objects providing `order`, `get_breaks()`,
            `filename` and `len()`.
        fig, root, ax: figure, canvas axes (labels, legend, title) and data
            axes.
        vmin, vmax: clamp range for the values, also the colour range.
        is_self: mirror every point and draw the diagonal.
        synteny: box the clusters found by batch_scan().
        cmap_text: when set, draw a colour bar with this caption.
        genomenames: "x_y" axis names; defaults to the bed file basenames.
        sample_number: plot at most this many points (random subset).
        minfont: sequence labels with a smaller selected font are omitted.
        palette: optional block-id -> colour mapping exposing `.colors`.
        chrlw, sepcolor: width and colour of the sequence separators.
        title: figure title; a default with the pair count is built when
            it is empty.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    block_id = 0
    # The colour must survive from a "#" header to the data rows after it;
    # resetting it on every row meant the palette was never applied.
    block_color = None
    with open(anchorfile) as fp:  # closed even when parsing fails
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            try:
                value = float(value)
            except ValueError:
                value = vmax

            value = min(max(value, vmin), vmax)

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)  # counted before any down-sampling
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    if data:  # zip(*[]) cannot be unpacked into three names
        x, y, c = zip(*data)

        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0,
                       cmap=default_cm, vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Each entry is (label, midpoint, fontsize); small fonts are skipped
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels (in `root` coordinates)
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was stored twice (mirror image).  Floor division
            # keeps the count an int under Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: command-line tokens -- one or more histogram files followed by
    # the species name.  K is taken from each file name: the part of the
    # basename before the first "." and after the last "-".
    # Panel A overlays the spectra, panel B plots the genome size estimate
    # against K with a quadratic fit.  The figure is saved as
    # "<species>.multiK.pdf".
    import sys  # local import: this block is the only user in the function

    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # At least one histogram plus the species name is required; without
    # this check `args[-1]` raised IndexError on an empty command line.
    if len(args) < 2:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        # Decide from the file name first, so files outside the K range are
        # never loaded.
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        line, = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    if not genomesizes:
        # zip(*[]) below cannot be unpacked; fail with a readable message
        sys.exit("No histogram with K in [{0}, {1}]".format(opts.kmin,
                                                            opts.kmax))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    # Quadratic trend line; named `fit` so the option parser `p` is not
    # rebound to an unrelated object.
    fit = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, fit(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring doubles as the usage text passed to OptionParser.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive [default: %default]")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive [default: %default]")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot [default: %default]")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, N = args
    N = int(N)

    # A meryl index is converted to a histogram file first
    suffix = histfile.rsplit(".", 1)[-1]
    if suffix in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    # An explicit --coverage wins over the main peak from analyze()
    Kmer_coverage = opts.coverage if opts.coverage else ks.max2
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    headline = [
        "Total {0}-mers: {1}".format(N, thousands(Total_Kmers)),
        "{0}-mer coverage: {1}".format(N, Kmer_coverage),
        "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6),
    ]
    extras = [ks.repetitive, ks.snprate]

    for line in headline:
        sys.stderr.write(line + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if not opts.pdf:
        # Terminal rendering only; nothing is saved
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if not opts.nopeaks:
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marked = [(cov, cnt) for cov, cnt in ks.counts if cov in landmarks]
        if marked:
            px, py = zip(*marked)
            lookup = dict(marked)
            plt.plot(px, py, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, lookup[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, lookup[ks.max2], "Main peak")

    write_messages(ax, headline + extras)

    # Leave head room above the curve
    ymin, ymax = ax.get_ylim()
    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax * 7 / 6))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: command-line tokens -- histogram file, species name, K-mer size.
    # The histogram format is guessed from the content: a row starting with
    # "# 1:" means AllPaths, a single-column row means Soap, otherwise Meryl.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    use_ascii = not opts.pdf  # renamed: `ascii` would shadow the builtin
    peaks = not opts.nopeaks

    hist = {}
    totalKmers = 0
    data = []

    # The file is read twice (format guess, then parse); `with` makes sure
    # the handle is closed, which the earlier code never did.
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # Soap lists one count per row; the row number is the depth
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # write() behaves the same under Python 2 and Python 3
        sys.stderr.write(msg + "\n")

    # Only the [vmin, vmax] window is plotted
    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(cov, cnt) for cov, cnt in counts if cov in t]
    # Landmarks outside the plotted window are absent from `counts`, so the
    # marker plot and the labels are both guarded against missing entries.
    if tcounts:
        px, py = zip(*tcounts)
        plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked from the top-right corner of the axes
    tc = "gray"
    axt = ax.transAxes
    summary = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg,
               Repetitive_msg, SNPrate_msg)
    for ypos, msg in zip((.95, .9, .85, .8, .75), summary):
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    # Leave head room above the curve for the annotations
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def plot_breaks_and_labels(
    fig,
    root,
    ax,
    gx,
    gy,
    xsize,
    ysize,
    qbreaks,
    sbreaks,
    sep=True,
    chrlw=0.1,
    sepcolor="g",
    minfont=5,
    stdpf=True,
    chpf=True,
):
    """
    Draw sequence separators, sequence labels and the outer frame on a
    dot plot.

    NOTE: this variant is specialised for one figure.  The x-axis limit,
    both label tables and both axis titles are hard-coded below, so the
    labels derived from `qbreaks` / `sbreaks` and the `gx` / `gy` arguments
    do not reach the output.  The breaks are still used for the separator
    lines (when `sep` is true).

    Labels are drawn at 85% of their listed font size and only when that
    listed size is at least `minfont`.

    Returns the (xlim, ylim) pair applied to `ax`; ylim is inverted.
    """
    xlim = (0, 47724.0)  # hard-coding xlim maximum
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbreaks:
        xsize_ratio = abs(end - beg) * 0.8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        if chpf:
            seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        if sep:
            ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbreaks:
        ysize_ratio = abs(end - beg) * 0.8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        if chpf:
            seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize * 0.85))
        if sep:
            ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    # The computed x labels are replaced by this fixed table.  Raw strings
    # keep the backslash-space sequences verbatim; in a normal string they
    # are invalid escape sequences.
    xchr_labels = [
        (r'chr\ 1', 1997.5, 12),
        (r'chr\ 2', 5944.5, 12),
        (r'chr\ 3', 9014.0, 12),
        (r'chr\ 4', 11351.5, 12),
        (r'chr\ 5', 13639.0, 12),
        (r'chr\ 6', 17657.5, 12),
        (r'chr\ 7', 22329.0, 12),
        (r'chr\ 8', 25466.0, 12),
        (r'chr\ 9', 28092.0, 12),
        (r'chr\ 10', 31361.5, 12),
        (r'chr\ 11', 34457.0, 12),
        (r'chr\ 12', 37234.0, 12),
        (r'chr\ 13', 41112.5, 12),
        (r'chr\ 14', 43851.0, 12),
        (r'chr\ 15', 45258.5, 12),
        (r'scf\ 16', 46740.5, 12),
        (r'scf\ 458', 47724.0, 12),
    ]
    for label, pos, fontsize in xchr_labels:
        pos = 0.1 + pos * 0.8 / xsize
        if fontsize >= minfont:
            root.text(
                pos,
                0.91,
                latex(label),
                size=fontsize * 0.85,
                ha="center",
                va="bottom",
                rotation=45,
                color="black",
            )

    # remember y labels are inverted
    # Fixed table again; entries with font size 0 fall below `minfont`
    # (with its default of 5) and are therefore not drawn.
    ychr_labels = [
        (r'chr\ 1', 2672.0, 10.2),
        (r'chr\ 2', 7532.0, 10.2),
        (r'chr\ 3', 12035.0, 10.2),
        (r'chr\ 4', 16228.0, 10.2),
        (r'chr\ 5', 19784.5, 10.2),
        (r'chr\ 6', 23211.0, 10.2),
        (r'chr\ 7', 26612.5, 10.2),
        (r'chr\ 8', 29773.0, 10.2),
        (r'chr\ 9', 32518.0, 10.2),
        (r'chr\ 10', 35004.5, 10.2),
        (r'chr\ 11', 37760.0, 10.2),
        (r'chr\ 12', 40635.5, 10.2),
        ('ChrSy', 42048.0, 0),
        ('ChrUn', 42140.5, 0),
    ]
    for label, pos, fontsize in ychr_labels:
        pos = 0.9 - pos * 0.8 / ysize
        if fontsize >= minfont:
            root.text(0.91, pos, latex(label), size=fontsize * 0.85,
                      va="center", color="black")

    # Plot the frame
    ax.plot(xlim, [0, 0], "-", lw=chrlw, color=sepcolor)
    ax.plot(xlim, [ysize, ysize], "-", lw=chrlw, color=sepcolor)
    ax.plot([0, 0], ylim, "-", lw=chrlw, color=sepcolor)
    ax.plot([xsize, xsize], ylim, "-", lw=chrlw, color=sepcolor)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # The axis labels are hard-coded (instead of using gx / gy) so that the
    # species names are spelled out in italics rather than showing the BED
    # file names.
    ax.set_xlabel(r'$\it{Zizania\ palustris}$', size=16)
    ax.set_ylabel(r'$\it{Oryza\ sativa}$', size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color="black", size=10)

    return xlim, ylim
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: command-line tokens -- histogram file, species name, K-mer size.
    # The histogram format is guessed from the content: a row starting with
    # "# 1:" means AllPaths, a single-column row means Soap, otherwise Meryl.
    # In ASCII mode the result of asciiplot() is returned; in PDF mode the
    # figure is saved and nothing is returned.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    use_ascii = not opts.pdf  # renamed: `ascii` would shadow the builtin
    peaks = not opts.nopeaks

    hist = {}
    totalKmers = 0
    data = []

    # The file is read twice (format guess, then parse); `with` makes sure
    # the handle is closed, which the earlier code never did.
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # Soap lists one count per row; the row number is the depth
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6  # in Mb

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # write() behaves the same under Python 2 and Python 3
        sys.stderr.write(msg + "\n")

    # Only the [vmin, vmax] window is plotted
    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if use_ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(cov, cnt) for cov, cnt in counts if cov in t]
    # Landmarks outside the plotted window are absent from `counts`, so the
    # marker plot and the labels are both guarded against missing entries.
    if tcounts:
        px, py = zip(*tcounts)
        plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked from the top-right corner of the axes
    tc = "gray"
    axt = ax.transAxes
    summary = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg,
               Repetitive_msg, SNPrate_msg)
    for ypos, msg in zip((.95, .9, .85, .8, .75), summary):
        ax.text(.95, ypos, msg, color=tc, transform=axt, ha="right")

    # Leave head room above the curve for the annotations
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # The docstring doubles as the usage text passed to OptionParser.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, N = args
    N = int(N)

    # A meryl index is converted to a histogram file first
    suffix = histfile.rsplit(".", 1)[-1]
    if suffix in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    # An explicit --coverage wins over the main peak from analyze()
    Kmer_coverage = opts.coverage if opts.coverage else ks.max2
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    headline = [
        "Total {0}-mers: {1}".format(N, thousands(Total_Kmers)),
        "{0}-mer coverage: {1}".format(N, Kmer_coverage),
        "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6),
    ]
    extras = [ks.repetitive, ks.snprate]

    for line in headline:
        print(line, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if not opts.pdf:
        # Terminal rendering only; nothing is saved
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if not opts.nopeaks:
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marked = [(cov, cnt) for cov, cnt in ks.counts if cov in landmarks]
        if marked:
            px, py = zip(*marked)
            lookup = dict(marked)
            plt.plot(px, py, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, lookup[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, lookup[ks.max2], "Main peak")

    write_messages(ax, headline + extras)

    # Leave head room above the curve
    ymin, ymax = ax.get_ylim()
    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax * 7 / 6))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: command-line tokens -- histogram file, species name, K-mer size.
    # A file containing a single-column row is read as a Soap histogram
    # (row number = depth), anything else as "depth count" pairs.
    # In ASCII mode the result of asciiplot() is returned; in PDF mode the
    # figure is saved and nothing is returned.
    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    use_ascii = not opts.pdf  # renamed: `ascii` would shadow the builtin

    hist = {}
    totalKmers = 0

    # The file is read twice (format guess, then parse); `with` makes sure
    # the handle is closed, which the earlier code never did.
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        soap = False
        for row in fp:
            if len(row.split()) == 1:
                soap = True
                break
        fp.seek(0)

        for rowno, row in enumerate(fp):
            if soap:
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = counts

    # The scan below needs at least one pair of neighbouring bins,
    # otherwise `Ka` would never be assigned.
    if len(hist) < 2:
        sys.exit("Need at least two histogram bins in `{0}`".format(histfile))

    # Walk the bins in depth order and stop at the first drop that follows
    # a rise; `Ka` is then the depth just before that drop.
    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        if ca <= cb:
            status = "rise"
        else:
            status = "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    Genome_size = Total_Kmers * 1. / Ka / 1e6  # in Mb

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # write() behaves the same under Python 2 and Python 3
        sys.stderr.write(msg + "\n")

    # Only depths up to 100 are plotted
    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if use_ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    ax = plt.gca()
    # Summary lines, centred and stacked from the top of the axes
    summary = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg)
    for ypos, msg in zip((.9, .8, .7), summary):
        ax.text(.5, ypos, _(msg),
                ha="center", color='b', transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    plt.savefig(imagename, dpi=100)
    sys.stderr.write("Image saved to `{0}`.".format(imagename) + "\n")
def draw_depth(
    root,
    ax,
    bed,
    chrinfo=None,
    defaultcolor="k",
    sepcolor="w",
    ylim=100,
    title=None,
    subtitle=None,
):
    """Draw depth plot on the given axes, using data from bed

    Chromosomes are laid end to end along the x-axis, each bed record is
    plotted at the midpoint of its interval, and a per-chromosome median is
    overlaid as a thick line. Chromosome names and the integer median are
    written on `root`.

    Args:
        root (matplotlib.Axes): Canvas axes
        ax (matplotlib.Axes): Axes to plot data on
        bed (Bed): Bed data from mosdepth
        chrinfo (ChrInfoFile): seqid => color, new name; when empty or None
            all seqids found in `bed.max_bp_in_chr` are drawn
        defaultcolor (str): matplotlib-compatible color for data points
        sepcolor (str): matplotlib-compatible color for chromosome breaks
        ylim (int): Upper limit of the y-axis (depth)
        title (str): Title of the figure, to the right of the axis
        subtitle (str): Subtitle of the figure, just below title
    """
    if chrinfo is None:
        chrinfo = {}
    sizes = bed.max_bp_in_chr
    seqids = chrinfo.keys() if chrinfo else sizes.keys()

    def color_of(seqid):
        # One colour rule for points, median lines and labels
        return chrinfo[seqid].color if seqid in chrinfo else defaultcolor

    starts = {}
    ends = {}
    label_positions = []
    start = 0
    for seqid in seqids:
        if seqid not in sizes:
            # Listed in chrinfo but no size known from the bed: cannot place
            logging.debug("Skip {}: not found in bed".format(seqid))
            continue
        starts[seqid] = start
        end = start + sizes[seqid]
        ends[seqid] = end
        label_positions.append((seqid, (start + end) / 2))
        start = end
    # Total x extent; stays 0 when no seqid could be placed
    xsize = start

    # Extract plotting data
    data = []
    data_by_seqid = defaultdict(list)
    for b in bed:
        seqid = b.seqid
        if seqid not in starts:
            continue
        # chr01A  2000000 3000000 113.00
        x = starts[seqid] + (b.start + b.end) / 2
        y = float(b.accn)
        data.append((x, y, color_of(seqid)))
        data_by_seqid[seqid].append(y)

    if data:  # zip(*[]) cannot be unpacked into three names
        x, y, c = zip(*data)
        ax.scatter(
            x,
            y,
            c=c,
            edgecolors="none",
            s=8,
            lw=0,
        )
    logging.debug("Obtained {} data points with depth data".format(len(data)))

    # Per seqid median
    medians = {}
    for seqid, values in data_by_seqid.items():
        seqid_median = np.median(values)
        medians[seqid] = seqid_median
        ax.plot(
            (starts[seqid], ends[seqid]),
            (seqid_median, seqid_median),
            "-",
            lw=4,
            color=color_of(seqid),
            alpha=0.5,
        )

    # vertical lines for all the breaks
    for pos in starts.values():
        ax.plot((pos, pos), (0, ylim), "-", lw=1, color=sepcolor)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    median_depth_y = 0.88
    chr_label_y = 0.08
    for seqid, position in label_positions:
        # Map data x to canvas coordinates; 0.1 / 0.8 are presumably the left
        # edge and width of `ax` on the canvas -- confirm against the caller
        xpos = 0.1 + position * 0.8 / xsize
        c = color_of(seqid)
        newseqid = chrinfo[seqid].new_name if seqid in chrinfo else seqid
        root.text(
            xpos, chr_label_y, newseqid,
            color=c, ha="center", va="center", rotation=20,
        )
        if seqid not in medians:
            # No data point for this seqid, so there is no median to print
            continue
        root.text(
            xpos,
            median_depth_y,
            str(int(medians[seqid])),
            color=c,
            ha="center",
            va="center",
        )

    if title:
        root.text(
            0.95,
            0.5,
            markup(title),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )
    if subtitle:
        root.text(
            0.95,
            0.375,
            markup(subtitle),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )

    ax.set_xticks([])
    ax.set_xlim(0, xsize)
    ax.set_ylim(0, ylim)
    ax.set_ylabel("Depth")

    set_human_axis(ax)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color="gray", size=10)
    normalize_axes(root)
def dotplot(anchorfile, qbed, sbed, image_name, vmin, vmax, iopts,
            is_self=False, synteny=False, cmap_text=None):
    """
    Draw a dot plot of the anchor pairs in `anchorfile` and save the image.

    Args:
        anchorfile: path to a whitespace-delimited file; the first two
            columns are query and subject names, the last column is an
            optional numeric value.
        qbed, sbed: bed objects providing `.order` (name => (index, record)),
            `.get_breaks()`, `.filename` and `len()`.
        image_name: output image path handed to plt.savefig().
        vmin, vmax: values are clamped to [vmin, vmax]; a non-numeric value
            counts as vmax.
        iopts: image options providing `.w`, `.h` and `.dpi`.
        is_self: mirror every point and draw a diagonal (self comparison).
        synteny: if True, draw boxes around clusters from batch_scan().
        cmap_text: label of the color bar; no color bar when empty.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    # Context manager so the anchor file is closed once parsed
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue
            query, subject = atoms[:2]
            value = atoms[-1]

            try:
                value = float(value)
            except ValueError:
                value = vmax

            if value < vmin:
                value = vmin
            if value > vmax:
                value = vmax

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            # Invert so that small values map to the high end of the cmap
            nv = vmax - value
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    sample_number = 5000  # only show random subset
    if len(data) > sample_number:
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    if data:  # zip(*[]) cannot be unpacked when no anchor was retained
        x, y, c = zip(*data)
        ax.scatter(x, y, c=c, s=2, lw=0, cmap=default_cm,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    def chr_label(seqid):
        # Keep the part after the last underscore; purely numeric names
        # become `c<number>`, anything else is used as is.
        seqid = seqid.split("_")[-1]
        try:
            return "c%d" % int(seqid)
        except ValueError:
            return seqid

    # Chromosomes narrower than 0.5% of the axis get no label
    ignore_size_x = xsize * .005
    ignore_size_y = ysize * .005

    # plot the chromosome breaks
    xchr_labels, ychr_labels = [], []
    for (seqid, beg, end) in qbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_x
        xchr_labels.append((chr_label(seqid), (beg + end) / 2, ignore))
        ax.plot([beg, beg], ylim, "g-", lw=1)

    for (seqid, beg, end) in sbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_y
        ychr_labels.append((chr_label(seqid), (beg + end) / 2, ignore))
        ax.plot(xlim, [beg, beg], "g-", lw=1)

    # plot the chromosome labels
    for label, pos, ignore in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, ignore in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if not ignore:
            root.text(.91, pos, label,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    to_ax_label = lambda fname: _(op.basename(fname).split(".")[0])
    gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
    logging.debug("Print image to `{0}` {1}".format(image_name, iopts))
    plt.savefig(image_name, dpi=iopts.dpi)
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer histograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).

    The K-mer size of each histogram is read from its file name: the number
    in the last dash-separated field of the base name before the first dot.
    Panel A overlays the spectra, panel B plots the estimated genome size
    against K with a quadratic fit. The figure is saved as
    `<species>.multiK.pdf`.
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # At least one histogram plus the species name are required
    if len(args) < 2:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        # Decide from the file name first so that out-of-range files are
        # never parsed
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        (line, ) = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K, method="allpaths")
        genomesizes.append((K, ks.genomesize / 1e6))

    if not genomesizes:
        # Nothing survived the --kmin/--kmax filter; zip() below would fail
        sys.exit("No histogram with K-mer size within [{0}, {1}]"
                 .format(opts.kmin, opts.kmax))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    # Quadratic trend of genome size over K, drawn across the full K range
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    fit = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, fit(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=0, vmax=1, is_self=False, synteny=False,
            cmap_text=None, cmap="copper", genomenames=None,
            sample_number=10000, minfont=5, palette=None,
            chrlw=.01, title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs in `anchorfile` onto existing axes.

    Args:
        anchorfile: path to the anchors; rows starting with `#` open a new
            block, other rows hold query, subject and an optional last
            numeric column.
        qbed, sbed: bed objects providing `.order` (name => (index, record)),
            `.get_breaks()`, `.filename` and `len()`.
        fig: figure, handed to TextHandler to pick label font sizes.
        root: whole-canvas axes for labels, legend and title.
        ax: axes receiving the dots.
        vmin, vmax: with `cmap_text`, rows whose value falls outside
            [vmin, vmax] are dropped; a non-numeric value counts as vmax.
        is_self: mirror every point and draw a diagonal (self comparison).
        synteny: if True, draw boxes around clusters from batch_scan().
        cmap_text: label of the color bar; values are ignored when empty.
        cmap: colormap for the values (unused when `palette` is given).
        genomenames: "<x>_<y>" axis labels; default is the bed file names.
        sample_number: plot at most this many randomly chosen points.
        minfont: chromosome labels with a smaller font size are omitted.
        palette: optional object with `.get(block_id, default)` returning a
            color and `.colors` (category => color) for the legend.
        chrlw, sepcolor: line width and color of chromosome separators.
        title: figure title; an automatic one with the pair count is used
            when empty.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The colour must persist from a `#` header row to the data rows that
    # follow it, so it is initialised once, outside the loop.
    block_color = None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin or value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    if data:  # zip(*[]) cannot be unpacked when no anchor was retained
        x, y, c = zip(*data)

        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                       vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Font size is chosen from the share of the axis each chromosome spans;
    # labels that end up below `minfont` are skipped further down
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was stored twice (mirror image); floor division
            # keeps the count an integer on Python 3 as well
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. `species` is only
    used to annotate the graphic; `K` is the K-mer size, used both for the
    annotation and for the spectrum analysis.

    By default an ASCII plot is printed; with --pdf an image named after the
    histogram file is saved. Returns the estimated genome size (total K-mers
    divided by K-mer coverage, rounded to an integer).
    """
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help="'nbinom' - slow but more accurate for het or polyploid genome; "
        "'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option(
        "--coverage", default=0, type="int", help="Kmer coverage [default: auto]"
    )
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii_mode = not opts.pdf
    # Peak annotation relies on min/max attributes set by 'allpaths' only
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the estimate from the analysis
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_mode:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(kx, ky) for kx, ky in ks.counts if kx in t]
        if tcounts:
            px, py = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(px, py, "ko", lw=3, mec="k", mfc="w")
            # Label a peak only when its count is available
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    ymin, ymax = ax.get_ylim()
    # Extra headroom above the tallest bar for the text annotations
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            cn_x = i * ks.lambda_
            plt.plot((cn_x, cn_x), (0, ymax), "-.", color=cn_color)
            plt.text(
                cn_x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only, so a dot in a directory
    # component (e.g. `./run.1/x.histogram`) does not truncate the path.
    stem = os.path.basename(histfile).split(".")[0]
    imagename = os.path.join(os.path.dirname(histfile), stem) + "." + iopts.format
    savefig(imagename, dpi=100)

    return Genome_size