def draw(self, roundrect=False, plot_label=True, plot_circles=True, pad=.03, vpad=.09):
    """Draw each sequence of this track as a horizontal bar, plus labels.

    Returns immediately when ``self.empty`` is set.

    roundrect    -- forwarded to HorizontalChromosome
    plot_label   -- draw the track label (``self.label``) when true
    plot_circles -- draw a numbered TextCircle per sequence when true
    pad          -- offset of the circles from the track; negated when
                    ``self.va`` is "bottom"
    vpad         -- offset of the track label from the track
    """
    if self.empty:
        return

    axes, transform, fill = self.ax, self.tr, self.color
    track_y = self.y
    spacing = self.gap
    total = len(self.seqids)
    # Neither value depends on the sequence being drawn.
    stride = 2 if total <= 40 else 10
    circle_dy = -pad if self.va == "bottom" else pad

    def circle_text(seqid):
        # Digits of the last "_"-separated field, normalised through int();
        # names listed in self.rev get a trailing "-".
        tail = seqid.rsplit("_", 1)[-1]
        number = str(int("".join(ch for ch in tail if ch in string.digits)))
        if tail in self.rev:
            number += '-'
        return number

    left = self.xstart
    for index, seqid in enumerate(self.seqids):
        right = left + self.ratio * self.sizes[seqid]
        bar = HorizontalChromosome(axes, left, right, track_y,
                                   height=self.height, lw=self.lw,
                                   fc=fill, roundrect=roundrect)
        bar.set_transform(transform)
        text = circle_text(seqid)
        middle = (left + right) / 2
        left = right + spacing

        # Crowded tracks only label every `stride`-th sequence; tracks with
        # fewer than 5 sequences get no circles at all.
        crowded = total >= 2 * MaxSeqids and (index + 1) % stride != 0
        if crowded or total < 5:
            continue

        if plot_circles:
            TextCircle(axes, middle, track_y + circle_dy, text, radius=.01,
                       fc="w", color=fill, size=10, transform=transform)

    caption = markup(self.label)
    ink = fill if fill != "gainsboro" else "k"
    if not plot_label:
        return

    placement = self.label_va
    if placement == "top":
        lx, ly = self.x, self.y + vpad
    elif placement == "bottom":
        lx, ly = self.x, self.y - vpad
    else:  # "center"
        lx, ly = self.xstart - vpad, self.y
    axes.text(lx, ly, caption, ha="center", va="center", color=ink,
              transform=transform)
def draw(self, title="*Ks* distribution", filename="Ks_plot.pdf"):
    """Finish the Ks figure (legend, limits, titles, tick fonts) and save it.

    title    -- plot title, passed through markup()
    filename -- output path handed to savefig() (saved at dpi=300)
    """
    axes = self.ax
    upper = self.ks_max
    handles, names = self.lines, self.labels
    where = self.legendp

    # A legend is only drawn when more than one line has been registered.
    if len(handles) > 1:
        legend = axes.legend(handles, names, loc=where, shadow=True,
                             fancybox=True, prop={"size": 10})
        legend.get_frame().set_alpha(.5)

    axes.set_xlim((0, upper - self.interval))
    axes.set_title(markup(title), fontweight="bold")
    axes.set_xlabel(markup('Synonymous substitutions per site (*Ks*)'))
    axes.set_ylabel('Percentage of gene pairs')

    # Re-apply the current tick values as labels in Helvetica.
    axes.set_xticklabels(axes.get_xticks(), family='Helvetica')
    axes.set_yticklabels(axes.get_yticks(), family='Helvetica')

    savefig(filename, dpi=300)
def add_data(self, data, components=1, label="Ks", color='r', marker='.', fill=False, fitted=True):
    """Plot one Ks data set and register its line(s) for the legend.

    The drawing itself is delegated to plot_ks_dist(), which returns the
    data line and the mixture line. The data line is always appended to
    ``self.lines``; the mixture line only when ``fitted`` is true, with
    " (fitted)" added to its label.

    Each label is passed through markup() exactly once, when it is
    appended. Previously every call re-ran markup() over all labels already
    stored, so labels from earlier calls were marked up again on each
    further call.
    """
    line, line_mixture = plot_ks_dist(self.ax, data, self.interval,
                                      components, self.ks_max,
                                      color=color, marker=marker,
                                      fill=fill, fitted=fitted)
    self.lines.append(line)
    self.labels.append(markup(label))

    if fitted:
        self.lines.append(line_mixture)
        self.labels.append(markup(label + " (fitted)"))
def draw(self, roundrect=False, plot_label=True):
    """Draw each sequence of this track as a horizontal bar, plus labels.

    Returns immediately when ``self.empty`` is set.

    roundrect  -- forwarded to HorizontalChromosome
    plot_label -- draw the track label (``self.label``) when true
    """
    if self.empty:
        return

    y = self.y
    color = self.color
    ax = self.ax
    xstart = self.xstart
    gap = self.gap
    nseqids = len(self.seqids)
    tr = self.tr

    # Circle offset is the same for every sequence: below the track when the
    # track is bottom-aligned, above it otherwise.
    pad = -.02 if self.va == "bottom" else .02

    for i, sid in enumerate(self.seqids):
        size = self.sizes[sid]
        rsize = self.ratio * size
        xend = xstart + rsize
        hc = HorizontalChromosome(ax, xstart, xend, y, height=self.height,
                                  lw=self.lw, fc=color, roundrect=roundrect)
        hc.set_transform(tr)

        # Keep only the digits of the last "_"-separated field.
        # `string.letters` was removed in Python 3, and filtering on
        # "not a letter" let punctuation through to int(); `string.digits`
        # works on both Python 2 and 3.
        sid = sid.rsplit("_", 1)[-1]
        si = "".join(x for x in sid if x in string.digits)
        si = str(int(si))

        xx = (xstart + xend) / 2
        xstart = xend + gap

        # Crowded tracks only label every 10th sequence; tracks with fewer
        # than 5 sequences get no circles at all.
        if nseqids > 2 * MaxSeqids and (i + 1) % 10 != 0:
            continue
        if nseqids < 5:
            continue

        TextCircle(ax, xx, y + pad, si, radius=.01, fc="w", color=color,
                   size=10, transform=tr)

    # Put the label left of a track centred in the left half of the canvas,
    # right of it otherwise.
    if (self.xstart + self.xend) / 2 <= .5:
        xp = min(self.xstart / 2, .1)
    else:
        xp = max(1 - self.xend / 2, .92)

    label = markup(self.label)
    c = color if color != "gainsboro" else "k"
    if plot_label:
        ax.text(xp, y + self.height * .6, label, ha="center", color=c,
                transform=tr)
def cartoon(args):
    """
    %prog synteny.py

    Generate cartoon illustration of SynFind.
    """
    # The docstring above is handed to OptionParser as usage text.
    parser = OptionParser(cartoon.__doc__)
    opts, args, iopts = parser.set_image_options(args, figsize="10x7")

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Panel A: the full region with two arrows spreading from the query gene
    A = CartoonRegion(41)
    A.draw(root, .35, .85, strip=False, color=False)
    x1, x2 = A.x1, A.x2
    lsg = "lightslategray"
    pad = .01
    xc, yc = .35, .88
    arrowlen = x2 - xc - pad
    arrowprops = {
        "length_includes_head": True,
        "width": .01,
        "fc": lsg,
        "lw": 0,
        "head_length": arrowlen * .15,
        "head_width": .03,
    }
    root.add_patch(FancyArrow(xc - pad, yc, -arrowlen, 0, shape="left",
                              **arrowprops))
    root.add_patch(FancyArrow(xc + pad, yc, arrowlen, 0, shape="right",
                              **arrowprops))

    yt = yc + 4 * pad
    root.text((x1 + xc) / 2, yt, "20 genes upstream", ha="center")
    root.text((x2 + xc) / 2, yt, "20 genes downstream", ha="center")
    root.plot((xc,), (yc,), "o", mfc='w', mec=lsg, mew=2, lw=2, color=lsg)
    root.text(xc, yt, "Query gene", ha="center")

    # Panel B: three evolved copies of the region, each with its score
    A.draw(root, .35, .7, strip=False)
    RoundRect(root, (.07, .49), .56, .14, fc='y', alpha=.2)
    evolved = []
    for mode, target, ypos in (('S', 10, .6), ('F', 8, .56), ('G', 6, .52)):
        region = deepcopy(A)
        region.evolve(mode=mode, target=target)
        region.draw(root, .35, ypos)
        evolved.append(region)
    a, b, c = evolved
    for region in evolved:
        root.text(.64, region.y, "Score={0}".format(region.nonwhites),
                  va="center")

    # Panel C: one diagram per evolved copy, after truncation
    A.truncate_between_flankers()
    a.truncate_between_flankers()
    b.truncate_between_flankers()
    c.truncate_between_flankers(target=6)

    plot_diagram(root, .14, .2, A, a, "S", "syntenic")
    plot_diagram(root, .37, .2, A, b, "F", "missing, with both flankers")
    plot_diagram(root, .6, .2, A, c, "G", "missing, with one flanker")

    panel_labels(root, ((.04, .95, 'A'), (.04, .75, 'B'), (.04, .4, 'C')))

    # Descriptions down the right-hand side, paired with their y positions
    xt = .85
    captions = (
        (.88, "Extract neighborhood"),
        (.84, "of *window* size"),
        (.64, "Count gene pairs within *window*"),
        (.6, "Find regions above *score* cutoff"),
        (.3, "Identify flankers"),
        (.26, "Annotate syntelog class"),
    )
    for yt, text in captions:
        root.text(xt, yt, markup(text), ha="center", va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = "cartoon" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer histograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # The docstring above is handed to OptionParser as usage text.
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # Every positional argument but the last is a histogram file.
    histfiles = args[:-1]
    species = args[-1]

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is read from the file name: the part before the first "." and
        # after the last "-".
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        line, = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    # With no histogram inside [kmin, kmax] there is nothing to unpack or
    # plot; unguarded, zip(*[]) fails with an opaque unpacking error.
    if genomesizes:
        x, y = zip(*genomesizes)
        B.plot(x, y, "ko", mfc="w")
        # A quadratic needs at least three points to be determined.
        if len(genomesizes) > 2:
            t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
            fit = np.poly1d(np.polyfit(x, y, 2))
            B.plot(t, fit(t), "r:")
    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above is handed to OptionParser as usage text.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true",
        help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii_mode = not opts.pdf  # renamed so the builtin ascii() is not shadowed
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted into a histogram file first.
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage 0 (the default) means: use the coverage found by analyze().
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # `print >> sys.stderr, msg` is the Python 2 chevron form and fails on
    # Python 3; a plain write behaves the same on both.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_mode:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        # Mark the landmark coverages reported by the spectrum analysis.
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)

            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg,
                Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    # Leave head-room above the curve.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, cmap="copper",
            genomenames=None, sample_number=10000, minfont=5, palette=None,
            chrlw=.1, title=None, sep=True, sepcolor="g", stdpf=True):
    """Draw a dot plot of the anchor pairs listed in ``anchorfile``.

    Each anchor row gives a query and a subject name in its first two
    columns; its last column is used as the value. Lines starting with "#"
    separate blocks. Pairs whose names are missing from ``qbed.order`` or
    ``sbed.order`` are skipped.

    vmin, vmax  -- with ``cmap_text`` set, values outside this range are
                   dropped and unparsable values count as ``vmax``
    is_self     -- also add the mirrored pair and draw a diagonal
    synteny     -- box the clusters found by batch_scan()
    cmap_text   -- when set, colour by value and draw a colour map
    genomenames -- "X_Y" axis names; defaults to the bed file base names
    palette     -- when set, colour the points of each block via
                   ``palette.get(block_id, "k")``
    title       -- plot title; a default with the pair count is built
                   when None

    Fixes relative to the previous revision:
    * the block colour was reset on every row, so the colour chosen on a
      "#" header was lost before any data row of that block was read;
    * the anchor file was never closed;
    * the pair count of a self comparison is halved with floor division,
      so it stays an integer on Python 3 as well.
    """
    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    gx, gy = markup(gx), markup(gy)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"
                      .format(vmin, vmax))

    block_id = 0
    block_color = None  # colour of the current block; persists across rows
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = downsample(data, sample_number=sample_number)
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    qbreaks = qbed.get_breaks()
    sbreaks = sbed.get_breaks()
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks, sep=sep,
                                        chrlw=chrlw, sepcolor=sepcolor,
                                        minfont=minfont, stdpf=stdpf)

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title is None:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # every pair was added twice (original + mirror image)
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    if title:
        logging.debug("Dot plot title: {}".format(title))
    normalize_axes(root)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, pad=.04, vpad=.012):
    """Draw one region (a chromosome line with its genes) and record it.

    ax        -- axes to draw on
    ext       -- 7-tuple: first feature, last feature, start index, end
                 index, chromosome name, orientation, span
    layout    -- supplies x, y, ratio, rotation, hidden, ha, va and color
    bed       -- sliced with the two indices to obtain the genes
    scale     -- divided by ``layout.ratio`` before use
    switch    -- optional mapping used to rename the chromosome
    chr_label -- draw the chromosome name and coordinate range when true
    pad, vpad -- label offsets

    Sets self.y, self.xstart, self.xend, self.genes and self.gg (gene
    accession -> pair of transformed end points). Nothing is drawn when
    ``layout.hidden`` is set, but the attributes are still filled in.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y
    rotation = layout.rotation
    tr = Affine2D().rotate_deg_around(x, y, rotation) + ax.transAxes
    inv = ax.transAxes.inverted()

    first, last, si, ei, chr, orientation, span = ext
    half = span / scale / 2
    xstart, xend = x - half, x + half
    self.xstart, self.xend = xstart, xend

    def to_canvas(bp):
        # Base-pair position -> x coordinate; `startbp` is bound below,
        # before the first call.
        return xstart + abs(bp - startbp) / scale

    hidden = layout.hidden

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=tr,
                lw=2, zorder=1)

    self.genes = genes = bed[si: ei + 1]
    startbp, endbp = first.start, last.end
    if orientation == '-':
        startbp, endbp = endbp, startbp

    if switch:
        chr = switch.get(chr, chr)
    range_from = human_size(startbp, target="Mb")[:-2]
    range_to = human_size(endbp, target="Mb")
    label = "-".join((range_from, range_to))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        strand = gene.strand
        if strand == '-':
            gstart, gend = gene.end, gene.start
        else:
            gstart, gend = gene.start, gene.end
        if orientation == '-':
            strand = "+" if strand == "-" else "-"

        x1, x2 = to_canvas(gstart), to_canvas(gend)
        a, b = tr.transform((x1, y)), tr.transform((x2, y))
        a, b = inv.transform(a), inv.transform(b)
        self.gg[gene.accn] = (a, b)

        color = "b" if strand == "+" else "g"
        if not hidden:
            glyph = Glyph(ax, x1, x2, y, height, gradient=False, fc=color,
                          zorder=3)
            glyph.set_transform(tr)

    # Labels sit outside the region: a "left" layout puts right-aligned
    # text left of the start, and vice versa.
    hpad = .02
    sides = {
        "left": (xstart - hpad, "right"),
        "right": (xend + hpad, "left"),
    }
    xx, ha = sides.get(layout.ha, (x, "center"))
    va = layout.va

    # Tentative solution to labels stick into glyph
    magic = 40.
    steep = abs(rotation) > magic
    cc = abs(rotation) / magic if steep else 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor
    if not hidden and chr_label:
        ax.text(lx, ly + vpad, markup(chr), color=layout.color,
                ha=ha, va="center", rotation=trans_angle)
        ax.text(lx, ly - vpad, label, color="k",
                ha=ha, va="center", rotation=trans_angle)
def draw(self, roundrect=False, plot_label=True, pad=.03, vpad=.09):
    """Draw each sequence of this track as a horizontal bar, plus labels.

    Returns immediately when ``self.empty`` is set.

    roundrect  -- forwarded to HorizontalChromosome
    plot_label -- draw the track label (``self.label``) when true
    pad        -- offset of the numbered circles from the track; negated
                  when ``self.va`` is "bottom"
    vpad       -- offset of the track label from the track

    The track label is always vertically centred on its anchor point. The
    previous revision also assigned a per-placement ``va`` that was never
    read; those dead stores are removed.
    """
    if self.empty:
        return

    y = self.y
    color = self.color
    ax = self.ax
    xstart = self.xstart
    gap = self.gap
    nseqids = len(self.seqids)
    tr = self.tr

    # Neither value depends on the sequence being drawn.
    step = 2 if nseqids <= 40 else 10
    hpad = -pad if self.va == "bottom" else pad

    for i, sid in enumerate(self.seqids):
        size = self.sizes[sid]
        rsize = self.ratio * size
        xend = xstart + rsize
        hc = HorizontalChromosome(ax, xstart, xend, y, height=self.height,
                                  lw=self.lw, fc=color, roundrect=roundrect)
        hc.set_transform(tr)

        # Circle text: digits of the last "_"-separated field, normalised
        # through int().
        sid = sid.rsplit("_", 1)[-1]
        si = "".join(x for x in sid if x in string.digits)
        si = str(int(si))

        xx = (xstart + xend) / 2
        xstart = xend + gap

        # Crowded tracks only label every `step`-th sequence; tracks with
        # fewer than 5 sequences get no circles at all.
        if nseqids >= 2 * MaxSeqids and (i + 1) % step != 0:
            continue
        if nseqids < 5:
            continue

        TextCircle(ax, xx, y + hpad, si, radius=.01, fc="w", color=color,
                   size=10, transform=tr)

    label = markup(self.label)
    c = color if color != "gainsboro" else "k"
    if plot_label:
        if self.label_va == "top":
            x, y = self.x, self.y + vpad
        elif self.label_va == "bottom":
            x, y = self.x, self.y - vpad
        else:  # "center"
            x, y = self.xstart - vpad, self.y
        ax.text(x, y, label, ha="center", va="center", color=c, transform=tr)
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer histograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # The docstring above is handed to OptionParser as usage text.
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # Every positional argument but the last is a histogram file.
    histfiles = args[:-1]
    species = args[-1]

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([.08, .12, .38, .76])
    B = fig.add_axes([.58, .12, .38, .76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        # K is read from the file name: the part before the first "." and
        # after the last "-".
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        line, = A.plot(x, y, '-', lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    # Nothing to unpack or plot when every file was filtered out; the
    # unguarded zip(*[]) used to fail with an opaque unpacking error.
    if genomesizes:
        x, y = zip(*genomesizes)
        B.plot(x, y, "ko", mfc='w')
        # Only fit the quadratic when it is determined (3+ points).
        if len(genomesizes) >= 3:
            t = np.linspace(opts.kmin - .5, opts.kmax + .5, 100)
            fit = np.poly1d(np.polyfit(x, y, 2))
            B.plot(t, fit(t), "r:")
    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((.04, .96, 'A'), (.54, .96, 'B'))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # The docstring above is handed to OptionParser as usage text.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, N = args
    text_only = not opts.pdf
    mark_peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted into a histogram file first.
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    # --coverage 0 (the default) falls back to the value found by analyze().
    Kmer_coverage = opts.coverage or ks.max2
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    summary = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg)
    for msg in summary:
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if text_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if mark_peaks:
        # Mark the landmark coverages reported by the spectrum analysis.
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        hits = [(cov, cnt) for cov, cnt in ks.counts if cov in landmarks]
        if hits:
            x, y = zip(*hits)
            height_at = dict(hits)

            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, height_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, height_at[ks.max2], "Main peak")

    write_messages(ax, list(summary) + [Repetitive_msg, SNPrate_msg])

    # Leave head-room above the curve.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above is handed to OptionParser as usage text.
    # Returns the estimated genome size (total K-mers / K-mer coverage).
    #
    # Fixes relative to the previous revision:
    # * the histogram file is closed once it has been read;
    # * messages use sys.stderr.write() instead of the Python 2 print
    #   chevron, which fails on Python 3;
    # * peak markers and annotations are skipped when the corresponding
    #   coverage lies outside [vmin, vmax] instead of raising.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii_mode = not opts.pdf  # renamed so the builtin ascii() is not shadowed
    peaks = not opts.nopeaks

    hist = {}
    totalKmers = 0
    data = []
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # one count per line; the multiplicity is the line number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # --coverage 0 (the default) means: use the coverage found by analyze().
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_mode:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    # `counts` only covers [vmin, vmax], so any of the landmark coverages
    # may be missing from it.
    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(cov, cnt) for cov, cnt in counts if cov in t]
    if tcounts:
        x, y = zip(*tcounts)
        plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary messages, stacked in the top-right corner.
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Leave head-room above the curve.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """Paint the features of ``bedfile`` onto a row of vertical chromosomes.

    root        -- axes to draw on
    bedfile     -- bed file of features; the accession selects the class
    sizes       -- optional sizes file; without it each chromosome's length
                   is the largest feature end seen on it
    iopts       -- image options (w, h, dpi are used for the image map)
    mergedist   -- a feature starting within this distance of the previous
                   feature of the same class is extended back to it
    winsize     -- window size of the image-map regions
    imagemap    -- when true, write an HTML image map to "<prefix>.map"
    mappingfile -- optional tab-delimited accession -> class mapping; a
                   third column, when present, gives preset colours
    gauge       -- draw a Gauge next to the chromosomes
    legend      -- draw the class legend
    empty       -- label for an extra unfilled legend box (falsy: none)
    title       -- optional figure title

    Fixes relative to the previous revision:
    * chromosome lengths no longer rely on the bed being grouped by seqid
      (itertools.groupby only merges adjacent equal keys);
    * the legend spacing no longer divides by zero when no class is left;
    * unused locals removed.
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    if imagemap:
        imgmapfile = prefix + ".map"
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
            if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        # Largest end per seqid, independent of the order of the features.
        for b in bed:
            chr_lens[b.seqid] = max(chr_lens.get(b.seqid, b.end), b.end)

    # Record centromere positions, then replace each accession by its class
    # ("-" when the accession has no mapping).
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (chr, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(chr)), ha="center")
        if centromeres:
            yy = ystart - centromeres[chr] * ratio
            ChromosomeWithCentromere(root, xx, ystart, yy,
                                     ystart - clen * ratio, width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = 1
    # color the regions
    for chr in sorted(chr_lens.keys()):
        excess = 0
        bac_list = []
        prev_end, prev_klass = 0, None
        for b in bed.sub_bed(chr):
            idx = chr_idxs[chr]
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            # Close the gap to the previous feature of the same class.
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # `segment` : size of current BAC being investigated +
                #             `excess`
                # `excess`  : left-over bases from the previous BAC, as a
                #             result of iterating over `winsize` regions of
                #             `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    tlx, tly, brx, bry = (
                        xx,
                        (1 - ystart) + segment_start * ratio,
                        xx + xwidth,
                        (1 - ystart) + segment_end * ratio,
                    )
                    print(
                        "\t" + write_ImageMapLine(
                            tlx,
                            tly,
                            brx,
                            bry,
                            iopts.w,
                            iopts.h,
                            iopts.dpi,
                            chr + ":" + ",".join(bac_list),
                            segment_start,
                            segment_end,
                        ),
                        file=mapfh,
                    )

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the bases left over after the last feature of this
        # chromosome.
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = (
                xx,
                (1 - ystart) + segment_start * ratio,
                xx + xwidth,
                (1 - ystart) + segment_end * ratio,
            )
            print(
                "\t" + write_ImageMapLine(
                    tlx,
                    tly,
                    brx,
                    bry,
                    iopts.w,
                    iopts.h,
                    iopts.dpi,
                    chr + ":" + ",".join(bac_list),
                    segment_start,
                    segment_end,
                ),
                file=mapfh,
            )

    if imagemap:
        print("</map>", file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    if "centromere" in class_colors:
        del class_colors["centromere"]

    # class legends, four in a row
    if legend:
        xstart = 0.1
        # max(..., 1): class_colors may be empty at this point
        xinterval = 0.8 / max(len(class_colors), 1)
        xwidth = 0.04
        yy = 0.08
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0,
                          alpha=alpha))
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

        if empty:
            root.add_patch(
                Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
            root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")
def draw_tree(
    ax,
    t,
    hpd=None,
    margin=0.1,
    rmargin=0.2,
    tip=0.01,
    treecolor="k",
    supportcolor="k",
    internal=True,
    outgroup=None,
    dashedoutgroup=False,
    reroot=True,
    gffdir=None,
    sizes=None,
    trunc_name=None,
    SH=None,
    scutoff=0,
    leafcolor="k",
    leaffont=12,
    leafinfo=None,
    wgdinfo=None,
    geoscale=False,
):
    """
    main function for drawing phylogenetic tree

    ax             -- axes to draw on
    t              -- tree object; re-rooted in place when ``reroot`` is set
    hpd            -- optional mapping node name -> (a, b), drawn as a bar
                      on the node
    margin, rmargin -- left/bottom and right margins of the drawing
    tip            -- gap between a leaf and its label; also the half
                      height of the scale-bar ticks
    supportcolor   -- colour of support values; falsy disables them
    internal       -- draw the names of named internal nodes
    outgroup       -- leaf names whose common ancestor becomes the outgroup;
                      midpoint rooting is used when empty
    dashedoutgroup -- give 90% of the root's branch length to its first
                      child and draw the root's lines dashed
    gffdir         -- directory of gff files used to draw exon structures
    sizes          -- sizes file; sizes are shown next to the leaves as
                      amino-acid counts
    trunc_name     -- rule passed to truncate_name() for leaf names
    SH             -- value shown as "SH test against ref tree"
    scutoff        -- only supports above this percentage are shown
    leafinfo       -- optional mapping leaf name -> object providing
                      ``color`` and ``new_name``
    wgdinfo        -- forwarded to draw_wgd()
    geoscale       -- draw a geological time scale instead of a scale bar

    Fixes relative to the previous revision:
    * the amino-acid size uses floor division, so the label reads "99aa"
      rather than "99.0aa" on Python 3;
    * the unused common-ancestor lookup in the ``dashedoutgroup`` branch is
      gone; it raised TypeError when no outgroup was given;
    * the SH label position is defined even when ``geoscale`` is set.
    """
    if reroot:
        if outgroup:
            R = t.get_common_ancestor(*outgroup)
        else:
            # Calculate the midpoint node
            R = t.get_midpoint_outgroup()

        if R is not t:
            t.set_outgroup(R)

        # By default, the distance to outgroup and non-outgroup is the same
        # we re-adjust the distances so that the outgroups will appear
        # farthest from everything else
        if dashedoutgroup:
            a, b = t.children
            # Avoid even split
            total = a.dist + b.dist
            a.dist = 0.9 * total
            b.dist = total - a.dist

    farthest, max_dist = t.get_farthest_leaf()
    print("max_dist = {}".format(max_dist), file=sys.stderr)

    xstart = margin
    ystart = 2 * margin
    # scale the tree
    scale = (1 - margin - rmargin) / max_dist

    def rescale(dist):
        return xstart + scale * dist

    def rescale_divergence(divergence):
        return rescale(max_dist - divergence)

    num_leaves = len(t.get_leaf_names())
    yinterval = (1 - ystart) / num_leaves

    # get exons structures, if any
    structures = {}
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    # Post-order: children are placed before their parent, so a parent can
    # be centred on the coordinates stored for its children.
    coords = {}
    i = 0
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = rescale(dist)

        if n.is_leaf():
            yy = ystart + i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
            else:
                name = n.name

            if leafinfo and n.name in leafinfo:
                line = leafinfo[n.name]
                lc = line.color
                sname = line.new_name
            else:
                lc = leafcolor
                sname = None
            lc = lc or "k"
            sname = sname or name.replace("_", "-")
            # if color is given as "R,G,B"
            if "," in lc:
                lc = [float(x) for x in lc.split(",")]

            ax.text(
                xx + tip,
                yy,
                markup(sname),
                va="center",
                fontstyle="italic",
                size=leaffont,
                color=lc,
            )

            gname = n.name.split("_")[0]
            if gname in structures:
                mrnabed, cdsbeds = structures[gname]
                ExonGlyph(
                    ax,
                    1 - rmargin / 2,
                    yy,
                    mrnabed,
                    cdsbeds,
                    align="right",
                    ratio=ratio,
                )
            if sizes and gname in sizes:
                size = sizes[gname]
                size = size // 3 - 1  # base pair converted to amino acid
                size = "{0}aa".format(size)
                ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont)

        else:
            linestyle = "--" if (dashedoutgroup and n is t) else "-"
            children = [coords[x] for x in n.get_children()]
            children_y = [cy for _, cy in children]
            min_y, max_y = min(children_y), max(children_y)
            # plot the vertical bar
            ax.plot((xx, xx), (min_y, max_y), linestyle, color=treecolor)
            # plot the horizontal bar
            for cx, cy in children:
                ax.plot((xx, cx), (cy, cy), linestyle, color=treecolor)
            yy = sum(children_y) * 1.0 / len(children_y)
            # plot HPD if exists
            if hpd and n.name in hpd:
                a, b = hpd[n.name]
                ax.plot(
                    (rescale_divergence(a), rescale_divergence(b)),
                    (yy, yy),
                    "-",
                    color="darkslategray",
                    alpha=0.4,
                    lw=2,
                )
            # Supports above 1 are taken to be percentages.
            support = n.support
            if support > 1:
                support = support / 100.0
            if not n.is_root() and supportcolor:
                if support > scutoff / 100.0:
                    ax.text(
                        xx,
                        yy + 0.005,
                        "{0:d}".format(int(abs(support * 100))),
                        ha="right",
                        size=leaffont,
                        color=supportcolor,
                    )
            if internal and n.name:
                TextCircle(ax, xx, yy, n.name, size=9)

        coords[n] = (xx, yy)
        # WGD info
        draw_wgd(ax, yy, rescale_divergence, n.name, wgdinfo)

    # scale bar
    # Anchor shared by the scale bar and the SH label; set before the branch
    # so the SH label also has a position when the geoscale is drawn.
    x1 = xstart + 0.1
    yy = margin
    if geoscale:
        draw_geoscale(ax, margin=margin, rmargin=rmargin, yy=margin,
                      max_dist=max_dist)
    else:
        br = 0.1
        x2 = x1 + br * scale
        ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x1, x2], [yy, yy], "-", color=treecolor)
        ax.text(
            (x1 + x2) / 2,
            yy - tip,
            "{0:g}".format(br),
            va="top",
            ha="center",
            size=leaffont,
            color=treecolor,
        )

    if SH is not None:
        xs = x1
        ys = (margin + yy) / 2.0
        ax.text(
            xs,
            ys,
            "SH test against ref tree: {0}".format(SH),
            ha="left",
            size=leaffont,
            color="g",
        )

    normalize_axes(ax)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, pad=.04, vpad=.012, extra_features=None):
    """
    Draw one region: chromosome line, gene glyphs, optional extra features
    and the chromosome / location labels.

    ax: axes to draw on; positions are given in axes coordinates.
    ext: 7-tuple (start, end, si, ei, chr, orientation, span).
    layout: supplies x, y, ratio, rotation, hidden, ha, va and color.
    bed: gene records, sliced as bed[si:ei + 1].
    scale: divisor turning base-pair distances into x offsets; it is
        divided by layout.ratio before use.
    switch: optional mapping used to rename the chromosome label.
    chr_label: when false, no labels are drawn at all.
    pad, vpad: vertical offsets for the label block and between its lines.
    extra_features: optional records (with .start/.end) drawn as thinner
        orange glyphs.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y
    rotation = layout.rotation
    # Rotate around the region centre, then map axes -> display coordinates
    tr = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) \
        + ax.transAxes
    inv = ax.transAxes.inverted()

    start, end, si, ei, chrom, orientation, span = ext
    half_width = span / scale / 2
    xstart = x - half_width
    xend = x + half_width
    self.xstart = xstart
    self.xend = xend
    hidden = layout.hidden
    reverse = orientation == '-'

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=tr,
                lw=2, zorder=1)

    genes = bed[si: ei + 1]
    self.genes = genes
    startbp = start.start
    endbp = end.end
    if reverse:
        startbp, endbp = endbp, startbp

    def cv(t):
        # Base-pair position -> x position along the chromosome line
        return xstart + abs(t - startbp) / scale

    if switch:
        chrom = switch.get(chrom, chrom)
    range_from = human_size(startbp, target="Mb")[:-2]
    range_to = human_size(endbp, target="Mb")
    label = "-".join((range_from, range_to))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend = gene.start, gene.end
        strand = gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if reverse:
            # Region is flipped, so the displayed strand flips too
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
        self.gg[gene.accn] = (a, b)

        color = forward if strand == "+" else backward
        if hidden:
            continue
        glyph = Glyph(ax, x1, x2, y, height, gradient=False, fc=color,
                      zorder=3)
        glyph.set_transform(tr)

    # Extra features (like repeats)
    for feature in (extra_features or ()):
        x1, x2, a, b = self.get_coordinates(feature.start, feature.end,
                                            y, cv, tr, inv)
        glyph = Glyph(ax, x1, x2, y, height * 3 / 4, gradient=False,
                      fc='#ff7f00', zorder=2)
        glyph.set_transform(tr)

    # The label sits on the side named by layout.ha, so the text alignment
    # is the opposite side.
    ha, va = layout.ha, layout.va
    hpad = .02
    if ha == "left":
        xx, ha = xstart - hpad, "right"
    elif ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = x, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    cc = abs(rotation) / magic if abs(rotation) > magic else 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad - .01
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if hidden or not chr_label:
        return

    bbox = dict(boxstyle="round", fc='w', ec='w', alpha=.5)
    ax.text(lx, ly + vpad, markup(chrom), color=layout.color,
            ha=ha, va="center", rotation=trans_angle, bbox=bbox, zorder=10)
    ax.text(lx, ly - vpad, label, color="lightslategrey", size=10,
            ha=ha, va="center", rotation=trans_angle, bbox=bbox, zorder=10)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # args: list of command-line tokens (options + histfile, species, K).
    # Returns the result of asciiplot() in ASCII mode, otherwise None after
    # saving "<histfile stem>.pdf".
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
            help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
            help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
            help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
            help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # The context manager closes the file; the original leaked the handle.
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)
        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per line; multiplicity is the 1-based row number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # `print >> sys.stderr, msg` is a Python 2 statement; evaluated as a
        # Python 3 expression it raises TypeError.
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(kx, ky) for kx, ky in counts if kx in t]
    if tcounts:
        # Landmarks may all lie outside [vmin, vmax]; zip(*[]) cannot be
        # unpacked into two names.
        px, py = zip(*tcounts)
        plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        # Only annotate peaks that survived the [vmin, vmax] filter
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Leave headroom above the curve for the annotation block
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.01, title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the gene pairs listed in `anchorfile`.

    anchorfile: text file; lines starting with "#" open a new block, other
        lines carry query, subject and (as last column) a value.
    qbed, sbed: query / subject beds; they provide `.order`, `len()`,
        `.get_breaks()` and `.filename`.
    fig, root, ax: figure, canvas axes (labels, title) and data axes.
    vmin, vmax: with `cmap_text`, pairs whose value lies outside this range
        are dropped; unparsable values count as `vmax`.
    is_self: mirror each pair and draw a diagonal (self comparison).
    synteny: also draw boxes around clusters found by batch_scan().
    cmap_text: colour-bar caption; also switches value parsing on.
    genomenames: "x_y" axis names; defaults to the bed file name stems.
    sample_number: maximum number of points drawn (random subset beyond).
    minfont: chromosome labels with a smaller font size are skipped.
    palette: optional block-id -> colour lookup (`.get`, `.colors`).
    chrlw, sepcolor: line width / colour of chromosome separators.
    title: figure title; auto-generated (with pair count) when empty.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The block colour must survive across rows.  The original reset it to
    # None on every row, so palette colours never reached the points.  It is
    # seeded here so rows before the first header get a colour too and `c`
    # stays homogeneous.
    block_color = palette.get(block_id, "k") if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # zip(*[]) cannot be unpacked, so skip the scatter when nothing matched
    if data:
        x, y, c = zip(*data)

        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                    vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was mirrored; floor division keeps the count an
            # int under Python 3 ("/=" would produce a float).
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def __init__(
    self,
    ax,
    ext,
    layout,
    bed,
    scale,
    switch=None,
    chr_label=True,
    loc_label=True,
    genelabelsize=0,
    pad=0.05,
    vpad=0.015,
    extra_features=None,
    glyphstyle="box",
    glyphcolor: BasePalette = OrientationPalette(),
):
    """
    Draw one region: chromosome line, gene glyphs (optionally labelled),
    extra features and the chromosome / location labels.

    ax: axes to draw on; positions are given in axes coordinates.
    ext: 7-tuple (start, end, si, ei, chr, orientation, span).
    layout: supplies x, y, ratio, rotation, hidden, label, ha, va, color.
    bed: gene records, sliced as bed[si:ei + 1].
    scale: divisor turning base-pair distances into x offsets; it is
        divided by layout.ratio before use.
    switch: optional mapping used to rename the chromosome label.
    chr_label, loc_label: toggle the chromosome name and location range;
        the range is only drawn together with the name.
    genelabelsize: font size of per-gene labels; 0 disables them.
    pad, vpad: vertical offsets for the label block and between its lines.
    extra_features: optional records (with .start/.end) drawn as thinner
        orange glyphs.
    glyphstyle: style forwarded to Glyph.
    glyphcolor: palette queried by strand (OrientationPalette) or by gene
        name (any other palette) for colour and zorder.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y
    rotation = layout.rotation
    # Rotate around the region centre, then map axes -> display coordinates
    tr = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) + ax.transAxes
    inv = ax.transAxes.inverted()

    start, end, si, ei, chrom, orientation, span = ext
    half_width = span / scale / 2
    xstart = x - half_width
    xend = x + half_width
    self.xstart = xstart
    self.xend = xend
    hidden = layout.hidden
    reverse = orientation == "-"

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=tr, lw=2, zorder=1)

    genes = bed[si : ei + 1]
    self.genes = genes
    startbp = start.start
    endbp = end.end
    if reverse:
        startbp, endbp = endbp, startbp

    def cv(t):
        # Base-pair position -> x position along the chromosome line
        return xstart + abs(t - startbp) / scale

    if switch:
        chrom = switch.get(chrom, chrom)
    if layout.label:
        chrom = layout.label

    range_from = human_size(startbp, target="Mb", precision=2)[:-2]
    range_to = human_size(endbp, target="Mb", precision=2)
    label = "-".join((range_from, range_to))

    height = 0.012
    self.gg = {}
    by_strand = isinstance(glyphcolor, OrientationPalette)
    # Genes
    for gene in genes:
        gstart, gend = gene.start, gene.end
        strand = gene.strand
        if strand == "-":
            gstart, gend = gend, gstart
        if reverse:
            # Region is flipped, so the displayed strand flips too
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
        gene_name = gene.accn
        self.gg[gene_name] = (a, b)

        palette_key = strand if by_strand else gene_name
        color, zorder = glyphcolor.get_color_and_zorder(palette_key)

        if hidden:
            continue
        glyph = Glyph(
            ax,
            x1,
            x2,
            y,
            height,
            gradient=False,
            fc=color,
            style=glyphstyle,
            zorder=zorder,
        )
        glyph.set_transform(tr)
        if genelabelsize:
            ax.text(
                (x1 + x2) / 2,
                y + height / 2 + genelabelsize * vpad / 3,
                markup(gene_name),
                size=genelabelsize,
                rotation=25,
                ha="left",
                va="center",
                color="lightslategray",
            )

    # Extra features (like repeats)
    for feature in extra_features or ():
        x1, x2, a, b = self.get_coordinates(
            feature.start, feature.end, y, cv, tr, inv
        )
        glyph = Glyph(
            ax,
            x1,
            x2,
            y,
            height * 3 / 4,
            gradient=False,
            fc="#ff7f00",
            style=glyphstyle,
            zorder=2,
        )
        glyph.set_transform(tr)

    # The label sits on the side named by layout.ha, so the text alignment
    # is the opposite side.
    ha, va = layout.ha, layout.va
    hpad = 0.02
    if ha == "left":
        xx, ha = xstart - hpad, "right"
    elif ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = x, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.0
    cc = abs(rotation) / magic if abs(rotation) > magic else 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation,)), anchor.reshape((1, 2))
    )[0]
    lx, ly = anchor

    if hidden:
        return

    bbox = dict(boxstyle="round", fc="w", ec="w", alpha=0.5)
    kwargs = dict(ha=ha, va="center", rotation=trans_angle, bbox=bbox, zorder=10)

    # TODO: multi-line labels with a different style per line are hard to do
    # in matplotlib.  The only known recipe needs an extra dot (.):
    #   chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
    #   loc_label = r"\noindent . \\ " + label if loc_label else None
    chr_text = markup(chrom) if chr_label else None
    loc_text = label if loc_label else None
    if not chr_text:
        return

    if loc_text:
        # Two stacked lines: name above, location range below
        ax.text(lx, ly + vpad, chr_text, color=layout.color, **kwargs)
        ax.text(lx, ly - vpad, loc_text, color="lightslategrey", size=10, **kwargs)
    else:
        ax.text(lx, ly, chr_text, color=layout.color, **kwargs)
def draw(self, roundrect=False, plot_label=True):
    """
    Draw the track: one HorizontalChromosome per seqid, a numbered circle
    per chromosome, and the track label.

    roundrect: forwarded to HorizontalChromosome.
    plot_label: when true, write the track label near the left edge.

    Does nothing when the track is empty.  Raises ValueError if a seqid has
    nothing left to parse as an integer once letters are removed.
    """
    if self.empty:
        return

    y = self.y
    color = self.color
    ax = self.ax
    xstart = self.xstart
    gap = self.gap
    va = self.va
    nseqids = len(self.seqids)
    tr = self.tr
    for i, sid in enumerate(self.seqids):
        size = self.sizes[sid]
        rsize = self.ratio * size
        xend = xstart + rsize
        hc = HorizontalChromosome(ax, xstart, xend, y,
                                  height=self.height, lw=self.lw, fc=color,
                                  roundrect=roundrect)
        hc.set_transform(tr)

        # Circle text is the numeric part after the last underscore.
        # `string.letters` is Python 2 only (AttributeError on Python 3);
        # `string.ascii_letters` exists on both.
        sid = sid.rsplit("_", 1)[-1]
        si = "".join(x for x in sid if x not in string.ascii_letters)
        si = str(int(si))  # normalises e.g. "01" -> "1"

        xx = (xstart + xend) / 2
        xstart = xend + gap

        # With very many chromosomes only every 10th gets a circle
        if nseqids > 2 * MaxSeqids and (i + 1) % 10 != 0:
            continue
        # With very few chromosomes no circles are drawn
        if nseqids < 5:
            continue

        pad = .02
        if va == "bottom":
            pad = -pad
        TextCircle(ax, xx, y + pad, si, radius=.01,
                   fc="w", color=color, size=10, transform=tr)

    xp = min(self.xstart / 2, .1)
    label = markup(self.label)
    # The "gainsboro" track colour is swapped for black text
    c = color if color != "gainsboro" else "k"
    if plot_label:
        ax.text(xp, y + self.height * .6, label, ha="center", color=c,
                transform=tr)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, loc_label=True,
             pad=.05, vpad=.015, extra_features=None):
    """
    Draw one region: chromosome line, gene glyphs, optional extra features
    and the chromosome / location labels.

    ax: axes to draw on; positions are given in axes coordinates.
    ext: 7-tuple (start, end, si, ei, chr, orientation, span).
    layout: supplies x, y, ratio, rotation, hidden, label, ha, va, color.
    bed: gene records, sliced as bed[si:ei + 1].
    scale: divisor turning base-pair distances into x offsets; it is
        divided by layout.ratio before use.
    switch: optional mapping used to rename the chromosome label.
    chr_label, loc_label: toggle the chromosome name and location range;
        the range is only drawn together with the name.
    pad, vpad: vertical offsets for the label block and between its lines.
    extra_features: optional records (with .start/.end) drawn as thinner
        orange glyphs.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y
    rotation = layout.rotation
    # Rotate around the region centre, then map axes -> display coordinates
    tr = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) \
        + ax.transAxes
    inv = ax.transAxes.inverted()

    start, end, si, ei, chrom, orientation, span = ext
    half_width = span / scale / 2
    xstart = x - half_width
    xend = x + half_width
    self.xstart = xstart
    self.xend = xend
    hidden = layout.hidden
    reverse = orientation == '-'

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=tr,
                lw=2, zorder=1)

    genes = bed[si: ei + 1]
    self.genes = genes
    startbp = start.start
    endbp = end.end
    if reverse:
        startbp, endbp = endbp, startbp

    def cv(t):
        # Base-pair position -> x position along the chromosome line
        return xstart + abs(t - startbp) / scale

    if switch:
        chrom = switch.get(chrom, chrom)
    if layout.label:
        chrom = layout.label

    range_from = human_size(startbp, target="Mb", precision=2)[:-2]
    range_to = human_size(endbp, target="Mb", precision=2)
    label = "-".join((range_from, range_to))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend = gene.start, gene.end
        strand = gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if reverse:
            # Region is flipped, so the displayed strand flips too
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
        self.gg[gene.accn] = (a, b)

        color = forward if strand == "+" else backward
        if hidden:
            continue
        glyph = Glyph(ax, x1, x2, y, height, gradient=False, fc=color,
                      zorder=3)
        glyph.set_transform(tr)

    # Extra features (like repeats)
    for feature in (extra_features or ()):
        x1, x2, a, b = self.get_coordinates(feature.start, feature.end,
                                            y, cv, tr, inv)
        glyph = Glyph(ax, x1, x2, y, height * 3 / 4, gradient=False,
                      fc='#ff7f00', zorder=2)
        glyph.set_transform(tr)

    # The label sits on the side named by layout.ha, so the text alignment
    # is the opposite side.
    ha, va = layout.ha, layout.va
    hpad = .02
    if ha == "left":
        xx, ha = xstart - hpad, "right"
    elif ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = x, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    cc = abs(rotation) / magic if abs(rotation) > magic else 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if hidden:
        return

    bbox = dict(boxstyle="round", fc='w', ec='w', alpha=.5)
    kwargs = dict(ha=ha, va="center",
                  rotation=trans_angle, bbox=bbox, zorder=10)

    # TODO: multi-line labels with a different style per line are hard to do
    # in matplotlib.  The only known recipe needs an extra dot (.):
    #   chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
    #   loc_label = r"\noindent . \\ " + label if loc_label else None
    chr_text = markup(chrom) if chr_label else None
    loc_text = label if loc_label else None
    if not chr_text:
        return

    if loc_text:
        # Two stacked lines: name above, location range below
        ax.text(lx, ly + vpad, chr_text, color=layout.color, **kwargs)
        ax.text(lx, ly - vpad, loc_text, color="lightslategrey", size=10,
                **kwargs)
    else:
        ax.text(lx, ly, chr_text, color=layout.color, **kwargs)
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # args: list of command-line tokens (options + one reads file).
    # Side effects: prints the summary lines to stderr and saves
    # "<sizes.filename>.pdf".
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import plt, markup, human_formatter, \
                human_base_formatter, savefig, set2, set_ticklabels_helvetica

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Bars take half of each bin's width
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # Last two characters of the human-readable size are its unit suffix;
    # the same unit scales the right-hand axis.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    # For each distinct length: total bases in reads at least that long
    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # `print >> sys.stderr, t` is a Python 2 statement; evaluated as a
        # Python 3 expression it raises TypeError.
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # args: list of command-line tokens (options + histfile, species, K).
    # Returns the estimated genome size (int).  In ASCII mode the plot is
    # printed via asciiplot(); otherwise an image is saved.
    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        # Fixed typo in the user-facing help: "enomes" -> "genomes"
        help="'nbinom' - slow but more accurate for het or polyploid genome; "
        "'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii = not opts.pdf
    # Peak landmarks are only used with the 'allpaths' method
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the fitted value
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(kx, ky) for kx, ky in ks.counts if kx in t]
        if tcounts:
            px, py = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(px, py, "ko", lw=3, mec="k", mfc="w")
            # A non-empty tcounts does not guarantee both peaks are in it,
            # so look each one up defensively instead of risking KeyError.
            for peak, text in ((ks.max1, "SNP peak"), (ks.max2, "Main peak")):
                if peak in tcounts:
                    ax.text(peak, tcounts[peak], text)

    # Leave headroom above the bars for annotations
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            cn_x = i * ks.lambda_
            plt.plot((cn_x, cn_x), (0, ymax), "-.", color=cn_color)
            plt.text(
                cn_x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + "." + iopts.format
    savefig(imagename, dpi=100)

    return Genome_size
def draw(self, roundrect=False, plot_label=True, plot_circles=True, pad=0.03, vpad=0.09):
    """
    Draw the track: one HorizontalChromosome per seqid, optional numbered
    circles, and the track label.

    roundrect: forwarded to HorizontalChromosome.
    plot_label: when true, write the track label.
    plot_circles: when true, draw the per-chromosome name circles.
    pad: vertical distance between the track and its circles (applied
        downwards when the track's va is "bottom").
    vpad: offset of the track label from the track.

    Does nothing when the track is empty.
    """
    if self.empty:
        return

    ax = self.ax
    tr = self.tr
    color = self.color
    track_y = self.y
    gap = self.gap
    nseqids = len(self.seqids)

    # None of these change while walking along the track
    step = 2 if nseqids <= 40 else 10
    crowded = nseqids >= 2 * MaxSeqids
    hpad = -pad if self.va == "bottom" else pad

    cursor = self.xstart
    for i, sid in enumerate(self.seqids):
        left = cursor
        right = left + self.ratio * self.sizes[sid]
        chromosome = HorizontalChromosome(
            ax,
            left,
            right,
            track_y,
            height=self.height,
            lw=self.lw,
            fc=color,
            roundrect=roundrect,
        )
        chromosome.set_transform(tr)

        circle_name = make_circle_name(sid, self.rev)
        middle = (left + right) / 2
        cursor = right + gap

        # Crowded tracks label only every `step`-th chromosome; tracks
        # with fewer than 5 chromosomes get no circles.
        skipped = crowded and (i + 1) % step != 0
        if skipped or nseqids < 5 or not plot_circles:
            continue

        TextCircle(
            ax,
            middle,
            track_y + hpad,
            circle_name,
            fc="w",
            color=color,
            size=10,
            transform=tr,
        )

    label = markup(self.label)
    # The "gainsboro" track colour is swapped for black text
    text_color = "k" if color == "gainsboro" else color
    if not plot_label:
        return

    label_va = self.label_va
    if label_va == "top":
        label_x, label_y = self.x, self.y + vpad
    elif label_va == "bottom":
        label_x, label_y = self.x, self.y - vpad
    else:  # "center"
        label_x, label_y = self.xstart - vpad / 2, self.y
    ax.text(label_x, label_y, label, ha="center", va="center", color=text_color, transform=tr)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    #
    # args: list of command-line tokens (options + histfile, species, K).
    # Returns the estimated genome size (int).  In ASCII mode the plot is
    # printed via asciiplot(); otherwise "<histfile stem>.pdf" is saved.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
            help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
            help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
            help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
            help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # `print >> sys.stderr, msg` is a Python 2 statement; evaluated as a
        # Python 3 expression it raises TypeError.
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(kx, ky) for kx, ky in ks.counts if kx in t]
        # zip(*[]) cannot be unpacked, and a landmark missing from
        # ks.counts would raise KeyError; guard both.
        if tcounts:
            px, py = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Leave headroom above the curve for the annotation block
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(
        vmax=50000, bins=100, xlabel="Read length", title="Read length distribution"
    )
    p.add_option("--ylabel1", default="Counts", help="Label of y-axis on the left")
    p.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile = args[0]
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    # Left axis: read-length histogram, bars take half of each bin's width
    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()
    width = (xmax - xmin) * 0.5 / bins
    bar_color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=bar_color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x
    ax2 = ax1.twinx()
    total_size, l50, n50 = sizes.summary
    hsize = human_size(total_size)
    # Last two characters of the human-readable size are its unit suffix
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    cdf = {}
    consumed = 0
    for length in all_sizes:
        # Only the first occurrence of each length records a point
        cdf.setdefault(length, (total_size - consumed) * 1.0 / unit)
        consumed += length
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, "-", color="darkslategray")
    ax2.set_ylabel("{0} above read length".format(tag))

    xlim = (xmin - width / 2, xmax + width / 2)
    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim(xlim)

    messages = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(all_sizes))),
        "Median read length: {0}bp".format(thousands(np.median(all_sizes))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    text_color = "gray"
    axt = ax1.transAxes
    xx, yy = 0.95, 0.95
    for message in messages:
        print(message, file=sys.stderr)
        ax1.text(xx, yy, message, color=text_color, transform=axt, ha="right")
        yy -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    savefig(sizes.filename + ".pdf")
def draw_depth(
    root,
    ax,
    bed,
    chrinfo=None,
    defaultcolor="k",
    sepcolor="w",
    ylim=100,
    title=None,
    subtitle=None,
):
    """
    Draw depth plot on the given axes, using data from bed

    Sequences are laid end to end along the x-axis. Each bed record becomes
    one scatter point at the midpoint of its interval, a thick translucent
    line marks the median depth per sequence, and thin vertical lines mark
    where each sequence starts. Sequence names and integer medians are
    written on `root`.

    Args:
        root (matplotlib.Axes): Canvas axes
        ax (matplotlib.Axes): Axes to plot data on
        bed (Bed): Bed data from mosdepth
        chrinfo (ChrInfoFile): seqid => color, new name. When given and
            non-empty, its keys select the sequences to draw; otherwise
            every sequence in `bed.max_bp_in_chr` is used.
        defaultcolor (str): matplotlib-compatible color for data points,
            median lines and labels of sequences not listed in `chrinfo`
        sepcolor (str): matplotlib-compatible color for chromosome breaks
        ylim (int): Upper limit of the y-axis (depth)
        title (str): Title of the figure, to the right of the axis
        subtitle (str): Subtitle of the figure, just below title
    """
    if chrinfo is None:
        chrinfo = {}

    def seqid_color(seqid):
        # Single source of truth so points, medians and labels always agree.
        return chrinfo[seqid].color if seqid in chrinfo else defaultcolor

    sizes = bed.max_bp_in_chr
    seqids = chrinfo.keys() if chrinfo else sizes.keys()
    starts = {}
    ends = {}
    label_positions = []
    # `end` is pre-initialized so an empty set of seqids yields xsize == 0
    # rather than an unbound local.
    start = end = 0
    for seqid in seqids:
        starts[seqid] = start
        end = start + sizes[seqid]
        ends[seqid] = end
        label_positions.append((seqid, (start + end) / 2))
        start = end
    xsize = end

    # Extract plotting data
    data = []
    data_by_seqid = defaultdict(list)
    for b in bed:
        seqid = b.seqid
        if seqid not in starts:
            continue
        # chr01A    2000000 3000000 113.00
        x = starts[seqid] + (b.start + b.end) / 2
        y = float(b.accn)
        data.append((x, y, seqid_color(seqid)))
        data_by_seqid[seqid].append(y)

    # Unpacking zip(*[]) into three names would raise, so skip the scatter
    # entirely when no record matched the selected sequences.
    if data:
        x, y, c = zip(*data)
        ax.scatter(
            x,
            y,
            c=c,
            edgecolors="none",
            s=8,
            lw=0,
        )
    logging.debug("Obtained {} data points with depth data".format(len(data)))

    # Per seqid median
    medians = {}
    for seqid, values in data_by_seqid.items():
        seqid_median = np.median(values)
        medians[seqid] = seqid_median
        ax.plot(
            (starts[seqid], ends[seqid]),
            (seqid_median, seqid_median),
            "-",
            lw=4,
            color=seqid_color(seqid),
            alpha=0.5,
        )

    # vertical lines for all the breaks
    for pos in starts.values():
        ax.plot((pos, pos), (0, ylim), "-", lw=1, color=sepcolor)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    median_depth_y = 0.88
    chr_label_y = 0.08
    for seqid, position in label_positions:
        # Map the data-space midpoint onto the canvas, which places the data
        # axes between x=0.1 and x=0.9.
        xpos = 0.1 + position * 0.8 / xsize
        c = seqid_color(seqid)
        newseqid = chrinfo[seqid].new_name if seqid in chrinfo else seqid
        root.text(
            xpos, chr_label_y, newseqid, color=c, ha="center", va="center", rotation=20
        )

        seqid_median = medians[seqid]
        root.text(
            xpos,
            median_depth_y,
            str(int(seqid_median)),
            color=c,
            ha="center",
            va="center",
        )

    if title:
        root.text(
            0.95,
            0.5,
            markup(title),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )
    if subtitle:
        root.text(
            0.95,
            0.375,
            markup(subtitle),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )

    ax.set_xticks([])
    ax.set_xlim(0, xsize)
    ax.set_ylim(0, ylim)
    ax.set_ylabel("Depth")

    set_human_axis(ax)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color="gray", size=10)
    normalize_axes(root)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, pad=.04, vpad=.012):
    """
    Draw one region: a chromosome line, gene glyphs and two text labels.

    `ext` unpacks to (start, end, si, ei, chr, orientation, span). The
    region is centered on (layout.x, layout.y), rotated by layout.rotation,
    and spans `span / (scale / layout.ratio)` axes units. Genes come from
    bed[si:ei + 1].

    Sets self.y, self.xstart, self.xend, self.genes and self.gg, where
    self.gg maps each gene's accn to its two endpoints after the rotation
    has been applied and the result mapped back through the inverse of
    ax.transAxes. When layout.hidden is true, nothing is drawn but the
    attributes are still populated. `switch`, if given, is a mapping used
    to rename the chromosome label.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y

    rotation = layout.rotation
    rotated = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) \
        + ax.transAxes
    to_axes = ax.transAxes.inverted()

    start, end, si, ei, chrom, orientation, span = ext
    flank = span / scale / 2
    xstart = x - flank
    xend = x + flank
    self.xstart, self.xend = xstart, xend
    hidden = layout.hidden

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=rotated,
                lw=2, zorder=1)

    self.genes = genes = bed[si:ei + 1]
    startbp, endbp = start.start, end.end
    if orientation == '-':
        startbp, endbp = endbp, startbp

    def to_canvas(bp):
        # Distance from the (possibly swapped) region start, in axes units
        return xstart + abs(bp - startbp) / scale

    if switch:
        chrom = switch.get(chrom, chrom)
    label = "-".join((
        human_size(startbp, target="Mb")[:-2],
        human_size(endbp, target="Mb"),
    ))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend = gene.start, gene.end
        strand = gene.strand
        # Endpoint swap follows the gene's own strand, before the strand is
        # flipped for a reversed region.
        if strand == '-':
            gstart, gend = gend, gstart
        if orientation == '-':
            strand = "+" if strand == "-" else "-"

        x1 = to_canvas(gstart)
        x2 = to_canvas(gend)
        pa = to_axes.transform(rotated.transform((x1, y)))
        pb = to_axes.transform(rotated.transform((x2, y)))
        self.gg[gene.accn] = (pa, pb)

        if hidden:
            continue
        glyph_color = "b" if strand == "+" else "g"
        glyph = Glyph(ax, x1, x2, y, height, gradient=False,
                      fc=glyph_color, zorder=3)
        glyph.set_transform(rotated)

    # The label goes outside the requested side, so its own horizontal
    # alignment is the opposite of layout.ha.
    ha, va = layout.ha, layout.va
    hpad = .02
    if ha == "left":
        xx, ha = xstart - hpad, "right"
    elif ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = x, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    steepness = abs(rotation)
    cc = steepness / magic if steepness > magic else 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if not hidden and chr_label:
        ax.text(lx, ly + vpad, markup(chrom), color=layout.color,
                ha=ha, va="center", rotation=trans_angle)
        ax.text(lx, ly - vpad, label, color="k",
                ha=ha, va="center", rotation=trans_angle)