def draw_gauge(ax, margin, maxl, rightmargin=None):
    """Draw a horizontal gauge with labelled ticks along the top of the canvas.

    The gauge line runs from ``margin`` to ``1 - rightmargin`` at height
    ``1 - margin`` (the code assumes a unit-sized canvas). One tick is drawn
    for every stride chosen by ``autoscale(maxl)``, from 0 up to ``maxl``.

    Args:
        ax: axes-like object providing ``plot`` and ``text``.
        margin: left margin, also the distance of the gauge from the top.
        maxl: largest value covered by the gauge.
        rightmargin: right margin; falls back to ``margin`` when falsy.

    Returns:
        ``best_stride / xinterval``: gauge units per unit of canvas width.
    """
    rightmargin = rightmargin or margin
    ax.plot([margin, 1 - rightmargin], [1 - margin, 1 - margin], "k-", lw=2)

    best_stride = autoscale(maxl)
    nintervals = maxl * 1.0 / best_stride

    xx, yy = margin, 1 - margin
    tip = 0.005
    xinterval = (1 - margin - rightmargin) / nintervals

    # Defaults guard against the stride label carrying no "..b" unit suffix;
    # previously `target` and `suffix` were left unbound in that case.
    suffix, target = "", None
    stride_label = human_size(best_stride)
    if stride_label.endswith("b"):
        suffix = target = stride_label[-2:]

    for i in range(0, maxl + 1, best_stride):
        if target is None:
            # No unit detected: label the tick with the plain value.
            label = str(i)
        else:
            label = human_size(i, precision=0, target=target)
            if label.endswith("b"):
                label, suffix = label[:-2], label[-2:]
        ax.plot([xx, xx], [yy, yy + tip], "k-", lw=2)
        ax.text(xx, yy + 2 * tip, label, ha="center", size=13)
        xx += xinterval

    # Step back to the last tick and place the unit slightly to its right.
    xx += 4 * tip - xinterval
    ax.text(xx + tip, yy + 2 * tip, suffix)

    return best_stride / xinterval
def draw_gauge(ax, margin, maxl, rightmargin=None):
    """Draw a horizontal gauge with labelled ticks along the top of the canvas.

    The gauge line runs from ``margin`` to ``1 - rightmargin`` at height
    ``1 - margin`` (the code assumes a unit-sized canvas). One tick is drawn
    for every stride chosen by ``autoscale(maxl)``, from 0 up to ``maxl``.

    Args:
        ax: axes-like object providing ``plot`` and ``text``.
        margin: left margin, also the distance of the gauge from the top.
        maxl: largest value covered by the gauge.
        rightmargin: right margin; falls back to ``margin`` when falsy.

    Returns:
        ``best_stride / xinterval``: gauge units per unit of canvas width.
    """
    rightmargin = rightmargin or margin
    ax.plot([margin, 1 - rightmargin], [1 - margin, 1 - margin], "k-", lw=2)

    best_stride = autoscale(maxl)
    nintervals = maxl * 1.0 / best_stride

    xx, yy = margin, 1 - margin
    tip = 0.005
    xinterval = (1 - margin - rightmargin) / nintervals

    # Defaults guard against the stride label carrying no "..b" unit suffix;
    # previously `target` and `suffix` were left unbound in that case.
    suffix, target = "", None
    stride_label = human_size(best_stride)
    if stride_label.endswith("b"):
        suffix = target = stride_label[-2:]

    for i in range(0, maxl + 1, best_stride):
        if target is None:
            # No unit detected: label the tick with the plain value.
            label = str(i)
        else:
            label = human_size(i, precision=0, target=target)
            if label.endswith("b"):
                label, suffix = label[:-2], label[-2:]
        ax.plot([xx, xx], [yy, yy + tip], "k-", lw=2)
        ax.text(xx, yy + 2 * tip, label, ha="center", size=13)
        xx += xinterval

    # Step back to the last tick and place the unit slightly to its right.
    xx += 4 * tip - xinterval
    ax.text(xx + tip, yy + 2 * tip, suffix)

    return best_stride / xinterval
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq

    # print() with file= replaces the Python 2 `print >> sys.stderr` chevron,
    # which raises TypeError under Python 3.
    print(
        "File `{0}` contains {1} chains.".format(chainfile, len(chain)),
        file=sys.stderr,
    )
    print(
        "ungapped={0} dt={1} dq={2}".format(
            human_size(ungapped), human_size(dt), human_size(dq)
        ),
        file=sys.stderr,
    )

    # Only the first returned value of each FASTA summary is used below.
    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print(
        "Old fasta (`{0}`) mapped: {1}".format(
            oldfasta, percentage(ungapped, oldreal)
        ),
        file=sys.stderr,
    )

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print(
        "New fasta (`{0}`) mapped: {1}".format(
            newfasta, percentage(ungapped, newreal)
        ),
        file=sys.stderr,
    )
def rstats(self, object, bacs, components, scaffold_sizes, length):
    """Build one row of summary statistics for `object`.

    Returns a tuple of: the object itself, number of BACs, `components`
    (passed through), number of scaffolds, the N50 value reported by
    `calculate_A50`, and the L50 and total length formatted by `human_size`.
    """
    from jcvi.utils.cbook import human_size

    bac_count = len(bacs)
    scaffold_count = len(scaffold_sizes)
    # The first element returned by calculate_A50 is not needed here.
    _, l50, n50 = calculate_A50(scaffold_sizes)
    l50_label = human_size(l50)
    length_label = human_size(length)

    return (
        object,
        bac_count,
        components,
        scaffold_count,
        n50,
        l50_label,
        length_label,
    )
def lineplot(ax, binfiles, nbins, chr, window, shift, color="br"):
    """Plot one or two per-window line tracks for chromosome `chr` on `ax`.

    The first bin file is drawn on `ax`; an optional second one is drawn on a
    twin y-axis. `color` supplies one colour code per track. X tick labels
    are rendered through `human_readable_base` after scaling by `shift`.
    """
    assert len(binfiles) <= 2, "A max of two line plots are supported"

    positions = np.arange(nbins)
    primary = binfiles[0]
    primary_color = color[0]
    primary_values = linearray(primary, chr, window, shift)
    ax.plot(positions, primary_values, "{0}-".format(primary_color), lw=2)

    def _format_tick(x, pos):
        # Tick positions are bin indices; convert them back to base positions.
        return human_readable_base(int(x) * shift, pos)

    ax.xaxis.set_major_formatter(ticker.FuncFormatter(_format_tick))
    for ticklabel in ax.get_xticklabels():
        ticklabel.set_color("darkslategray")

    primary_name = primary.filename.split(".")[0]
    perw = "per {0}".format(human_size(window, precision=0))
    ax.set_ylabel(primary_name + " " + perw, color=primary_color)

    if len(binfiles) == 2:
        twin = ax.twinx()
        secondary = binfiles[1]
        secondary_color = color[1]
        secondary_values = linearray(secondary, chr, window, shift)
        twin.plot(positions, secondary_values,
                  "{0}-".format(secondary_color), lw=2)

        # Differentiate tick labels through colors
        for ticklabel in ax.get_yticklabels():
            ticklabel.set_color(primary_color)
        for ticklabel in twin.get_yticklabels():
            ticklabel.set_color(secondary_color)

        secondary_name = secondary.filename.split(".")[0]
        twin.set_ylabel(secondary_name + " " + perw, color=secondary_color)

    ax.set_xlim(0, nbins)
def lineplot(ax, binfiles, nbins, chr, window, shift, color="br"):
    """Draw up to two line tracks computed from `binfiles` for `chr`.

    Track one goes on `ax`, track two (if present) on a twin y-axis; each uses
    the matching character of `color`. The y-axis label is the file name stem
    followed by "per <window size>".
    """
    assert len(binfiles) <= 2, "A max of two line plots are supported"

    xs = np.arange(nbins)

    first = binfiles[0]
    first_line = linearray(first, chr, window, shift)
    ax.plot(xs, first_line, "{0}-".format(color[0]), lw=2)

    def _base_label(x, pos):
        # Scale the bin index by `shift` before formatting it.
        return human_readable_base(int(x) * shift, pos)

    formatter = ticker.FuncFormatter(_base_label)
    ax.xaxis.set_major_formatter(formatter)
    for xtick in ax.get_xticklabels():
        xtick.set_color('darkslategray')

    stem = first.filename.split(".")[0]
    perw = "per {0}".format(human_size(window, precision=0))
    ax.set_ylabel(stem + " " + perw, color=color[0])

    has_second_track = len(binfiles) == 2
    if has_second_track:
        ax2 = ax.twinx()
        second = binfiles[1]
        second_line = linearray(second, chr, window, shift)
        ax2.plot(xs, second_line, "{0}-".format(color[1]), lw=2)

        # Colour each y-axis' tick labels like its own track.
        for ytick in ax.get_yticklabels():
            ytick.set_color(color[0])
        for ytick in ax2.get_yticklabels():
            ytick.set_color(color[1])

        stem = second.filename.split(".")[0]
        ax2.set_ylabel(stem + " " + perw, color=color[1])

    ax.set_xlim(0, nbins)
def velvet(args):
    """
    %prog velvet readsize genomesize numreads K

    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(velvet.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    readsize, genomesize, numreads, K = [int(x) for x in args]
    ram = (
        -109635
        + 18977 * readsize
        + 86326 * genomesize
        + 233353 * numreads
        - 51092 * K
    )

    # print() with file= replaces the Python 2 `print >> sys.stderr` chevron,
    # which raises TypeError under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the * 1000 presumably converts a KB estimate to bytes;
    # confirm against the formula's source.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import human_size

    p = OptionParser(size.__doc__)

    entries = []
    for line in sys.stdin:
        link_path = line.strip()
        if op.islink(link_path):
            # Measure the link target, but report the link's own base name.
            target = get_abs_path(link_path)
            entries.append((op.getsize(target), op.basename(link_path)))

    # Largest files first.
    entries.sort(reverse=True)
    for nbytes, name in entries:
        readable = human_size(nbytes, a_kilobyte_is_1024_bytes=True)
        print("%10s\t%s" % (readable, name), file=sys.stderr)
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import human_size

    p = OptionParser(size.__doc__)
    fp = sys.stdin

    results = []
    for link_name in fp:
        link_name = link_name.strip()
        if not op.islink(link_name):
            continue

        # Measure the link target, but report the link's own base name.
        source = get_abs_path(link_name)
        link_name = op.basename(link_name)
        filesize = op.getsize(source)
        results.append((filesize, link_name))

    # sort by descending file size
    for filesize, link_name in sorted(results, reverse=True):
        filesize = human_size(filesize, a_kilobyte_is_1024_bytes=True)
        # print() with file= replaces the Python 2 `print >>sys.stderr`
        # chevron, which raises TypeError under Python 3.
        print("%10s\t%s" % (filesize, link_name), file=sys.stderr)
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first input file only.
    gzip = fq.endswith(".gz")

    # Integer division keeps the `-s` hash size a whole number; plain `/`
    # would yield a float on Python 3 and end up in the shell command.
    hashsize = int(totalfilesize // coverage)
    logging.debug(
        "Total file size: {0}, hashsize (-s): {1}".format(
            human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize
        )
    )

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and feed jellyfish through stdin.
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)
    if need_update(jfdb, jfhisto):
        sh(cmd)
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # A set removes exons with identical coordinates under this gene.
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are derived from the exon coordinates by range_interleave.
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0, target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # print() with file= replaces the Python 2 `print >> sys.stderr` chevron,
    # which raises TypeError under Python 3.
    print(tabulate(r), file=sys.stderr)
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def _fetch(chrom, start, stop):
        # Pull one interval's sequence out of the reference FASTA.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per feature type for the statistics below.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        gene_id = gene.id
        geneseqs.append(_fetch(gene.chrom, gene.start, gene.stop))

        # The set removes exons with identical coordinates under this gene.
        exons = list(set(
            (child.chrom, child.start, child.stop)
            for child in g.children(gene_id, 2)
            if child.featuretype == "exon"
        ))
        exonseqs.extend(
            [_fetch(chrom, start, stop) for chrom, start, stop in exons]
        )

        introns = range_interleave(exons)
        intronseqs.extend(
            [_fetch(chrom, start, stop) for chrom, start, stop in introns]
        )

    categories = (
        ("Gene", geneseqs),
        ("Exon", exonseqs),
        ("Intron", intronseqs),
    )
    r = {}  # Report, keyed by (feature type, statistic name)
    for t, tseqs in categories:
        lengths = [len(seq) for seq in tseqs]
        stats = SummaryStats(lengths, dtype="int")
        r[t, "Number"] = stats.size
        r[t, "Average size (bp)"] = stats.mean
        r[t, "Median size (bp)"] = stats.median
        r[t, "Total length (Mb)"] = human_size(
            stats.sum, precision=0, target="Mb"
        )
        r[t, "% of genome"] = percentage(
            stats.sum, s.totalsize, precision=0, mode=-1
        )
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
def plot_heatmap(ax, M, breaks, iopts, binsize=BINSIZE):
    """Render matrix `M` as a heatmap on `ax` with white lines at `breaks`.

    All but the last break are drawn as a vertical and a horizontal line.
    The y-axis is flipped so the origin sits at the top, tick labels are
    integer bin indices, and the x-label reports `binsize` via `human_size`.
    """
    ax.imshow(M, cmap=iopts.cmap, interpolation='none')

    extent = ax.get_xlim()
    for position in breaks[:-1]:
        ax.plot([position, position], extent, 'w-')
        ax.plot(extent, [position, position], 'w-')

    ax.set_xlim(extent)
    # Flip the y-axis so the origin is at the top
    lower, upper = extent[0], extent[1]
    ax.set_ylim((upper, lower))

    xtick_text = [int(tick) for tick in ax.get_xticks()]
    ax.set_xticklabels(xtick_text, family='Helvetica', color="gray")
    ytick_text = [int(tick) for tick in ax.get_yticks()]
    ax.set_yticklabels(ytick_text, family='Helvetica', color="gray")

    size_text = human_size(binsize, precision=0)
    ax.set_xlabel("Bins ({} per bin)".format(size_text))
def plot_heatmap(ax, M, breaks, iopts, binsize=BINSIZE):
    """Render matrix `M` as a heatmap on `ax` with white lines at `breaks`.

    The image is drawn with its origin in the lower corner. All but the last
    break are drawn as a vertical and a horizontal line. Tick labels are
    integer bin indices and the x-label reports `binsize` via `human_size`.
    """
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')

    extent = ax.get_xlim()
    for position in breaks[:-1]:
        ax.plot([position, position], extent, 'w-')
        ax.plot(extent, [position, position], 'w-')

    # Plotting the lines can change the limits; restore them on both axes.
    ax.set_xlim(extent)
    ax.set_ylim(extent)

    xtick_text = [int(tick) for tick in ax.get_xticks()]
    ax.set_xticklabels(xtick_text, family='Helvetica', color="gray")
    ytick_text = [int(tick) for tick in ax.get_yticks()]
    ax.set_yticklabels(ytick_text, family='Helvetica', color="gray")

    size_text = human_size(binsize, precision=0)
    ax.set_xlabel("Bins ({} per bin)".format(size_text))
def __init__(self, filename, human=False):
    """Load key/value statistics from a FastQC data file into this mapping.

    Every tab-separated two-column row becomes an entry. "Total Sequences"
    is then converted to an integer and "Total Bases" is derived as
    sequences * sequence length. If the file is missing, placeholder "na"
    values are stored and "Filename" is derived from the parent folder name.

    Args:
        filename: path to the FastQC data file.
        human: when true, store the totals formatted by `human_size` (with
            trailing "b" characters stripped) instead of integers.
    """
    super(FastQCdata, self).__init__(filename)
    if not op.exists(filename):
        logging.debug("File `{0}` not found.".format(filename))
        # Sample_RF37-1/RF37-1_GATCAG_L008_R2_fastqc =>
        # RF37-1_GATCAG_L008_R2
        self["Filename"] = op.basename(
            op.split(filename)[0]).rsplit("_", 1)[0]
        self["Total Sequences"] = self["Sequence length"] = \
            self["Total Bases"] = "na"
        return

    # Context manager guarantees the handle is closed (it used to leak).
    with open(filename) as fp:
        for row in fp:
            atoms = row.rstrip().split("\t")
            if atoms[0] in ("#", ">"):
                continue
            if len(atoms) != 2:
                continue

            key, value = atoms
            self[key] = value

    ts = self["Total Sequences"]
    sl = self["Sequence length"]
    if "-" in sl:
        # A length range "a-b": use the midpoint, except that a range
        # starting at "30" uses the upper bound instead.
        a, b = sl.split("-")
        sl = (int(a) + int(b)) / 2
        if a == "30":
            sl = int(b)

    ts, sl = int(ts), int(sl)
    tb = ts * sl

    self["Total Sequences"] = human_size(ts).rstrip("b") if human else ts
    self["Total Bases"] = human_size(tb).rstrip("b") if human else tb
def plot_heatmap(ax, M, breaks, iopts):
    """Render matrix `M` as a heatmap on `ax` with white lines at `breaks`.

    The image is drawn with its origin in the lower corner. All but the last
    break are drawn as a vertical and a horizontal line. Tick labels are
    integer bin indices and both axis labels report the module-level
    `BINSIZE` via `human_size`.
    """
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')

    extent = ax.get_xlim()
    for position in breaks[:-1]:
        ax.plot([position, position], extent, 'w-')
        ax.plot(extent, [position, position], 'w-')

    # Plotting the lines can change the limits; restore them on both axes.
    ax.set_xlim(extent)
    ax.set_ylim(extent)

    xtick_text = [int(tick) for tick in ax.get_xticks()]
    ax.set_xticklabels(xtick_text, family='Helvetica', color="gray")
    ytick_text = [int(tick) for tick in ax.get_yticks()]
    ax.set_yticklabels(ytick_text, family='Helvetica', color="gray")

    axis_title = "Bins ({} per bin)".format(human_size(BINSIZE, precision=0))
    ax.set_xlabel(axis_title)
    ax.set_ylabel(axis_title)
def velvet(readsize, genomesize, numreads, K):
    """
    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    # The inputs and the estimate are reported on stderr; nothing is returned.
    ram = (
        -109635
        + 18977 * readsize
        + 86326 * genomesize
        + 233353 * numreads
        - 51092 * K
    )

    # print() with file= replaces the Python 2 `print >>sys.stderr` chevron,
    # which raises TypeError under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the * 1000 presumably converts a KB estimate to bytes;
    # confirm against the formula's source.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
def velvet(readsize, genomesize, numreads, K):
    """
    Calculate velvet memory requirement.
    <http://seqanswers.com/forums/showthread.php?t=2101>

    Ram required for velvetg = -109635 + 18977*ReadSize + 86326*GenomeSize +
    233353*NumReads - 51092*K

    Read size is in bases.
    Genome size is in millions of bases (Mb)
    Number of reads is in millions
    K is the kmer hash value used in velveth
    """
    # The inputs and the estimate are reported on stderr; nothing is returned.
    ram = (
        -109635
        + 18977 * readsize
        + 86326 * genomesize
        + 233353 * numreads
        - 51092 * K
    )

    # print() with file= replaces the Python 2 `print >> sys.stderr` chevron,
    # which raises TypeError under Python 3.
    print("ReadSize: {0}".format(readsize), file=sys.stderr)
    print("GenomeSize: {0}Mb".format(genomesize), file=sys.stderr)
    print("NumReads: {0}M".format(numreads), file=sys.stderr)
    print("K: {0}".format(K), file=sys.stderr)

    # NOTE(review): the * 1000 presumably converts a KB estimate to bytes;
    # confirm against the formula's source.
    ram = human_size(ram * 1000, a_kilobyte_is_1024_bytes=True)
    print("RAM usage: {0} (MAXKMERLENGTH=31)".format(ram), file=sys.stderr)
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.graphics.base import (
        plt,
        savefig,
        normalize_axes,
        set2,
        panel_labels,
    )
    from jcvi.graphics.chromosome import (
        Chromosome,
        GeneticMap,
        HorizontalChromosome,
    )

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)

    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    s = Scaffold(seqid, cc)
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
    # Relax the --links threshold until some linkage group qualifies. The
    # `links > 0` bound and the integer halving make the loop terminate even
    # when there are no linkage groups at all (it used to spin forever).
    while not mlgs and links > 0:
        links //= 2
        logging.error("No markers to plot, --links reset to {0}".format(links))
        mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
    if not mlgs:
        logging.error("No linkage groups with markers on {0}".format(seqid))
        return

    # Size of each linkage group = largest map distance among its markers.
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax1 = fig.add_axes([0, 0, .5, 1])
    ax2 = fig.add_axes([.5, 0, .5, 1])

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette: one colour per map, shared by all its linkage groups
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        mx, my = zip(*xy)
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        # A negative correlation means the map runs opposite to the sequence.
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2, label, color=colors[mlg],
                 ha=ha, va="center", rotation=90)
        marker_pos.update(g.marker_pos)

    agp = AGP(agpfile)
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # NOTE: this lambda reads `ystart` and `ratio` at call time; it is only
    # used before either of them is reassigned further down.
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        if marker_name not in marker_pos:
            continue

        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2, xstart, xstop, ystop,
                         height=2 * tip, patch=patchstart, lw=2)

    gap = .03
    # From here on `ratio` is canvas height per unit of map distance.
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        ystart -= height
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        xx, yy = zip(*sd)
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5, 1 - .4 * gap / height, r"$\rho$={0:.3f}".format(rho),
                ha="center", va="top", transform=ax.transAxes, color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        while height / len(ax.get_yticks()) < .03 and \
                len(ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        if rho < 0:
            ax.invert_yaxis()

    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        # With many panels, alternate the label side to avoid overlaps.
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90,
                  ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)
def __init__(self, fig, root, datafile, bedfile, layoutfile,
             switch=None, tree=None, extra_features=None,
             chr_label=True, loc_label=True, pad=.04, scalebar=False):
    """Draw the regions, the shades linking them, and optional decorations.

    Args:
        fig: figure object; its width/height set the vertical padding and
            it hosts the extra axes used for trees.
        root: axes everything else is drawn on.
        datafile: path handed to `BlockFile`.
        bedfile: path handed to `Bed`.
        layoutfile: path handed to `Layout`.
        switch: optional tab-delimited file loaded through `DictFile`.
        tree: optional tree file; when given, trees are drawn below.
        extra_features: optional BED file of extra features per region.
        chr_label, loc_label: forwarded to each `Region`.
        pad: accepted but not used in this method.
        scalebar: when true, draw a scale bar near the top of the canvas.
    """
    w, h = fig.get_figwidth(), fig.get_figheight()
    bed = Bed(bedfile)
    order = bed.order
    bf = BlockFile(datafile)
    self.layout = lo = Layout(layoutfile)
    switch = DictFile(switch, delimiter="\t") if switch else None
    if extra_features:
        extra_features = Bed(extra_features)

    exts = []
    extras = []
    for i in range(bf.ncols):
        ext = bf.get_extent(i, order)
        exts.append(ext)
        if extra_features:
            start, end, si, ei, chr, orientation, span = ext
            start, end = start.start, end.end  # start, end coordinates
            ef = list(extra_features.extract(chr, start, end))

            # Pruning removes minor features with < 0.1% of the region
            ef_pruned = [x for x in ef if x.span >= span / 1000]
            # print() with file= replaces the Python 2 chevron form, which
            # raises TypeError under Python 3.
            print(
                "Extracted {0} features "
                "({1} after pruning)".format(len(ef), len(ef_pruned)),
                file=sys.stderr,
            )
            extras.append(ef_pruned)

    # The widest region takes 65% of the canvas; the rest share its scale.
    maxspan = max(exts, key=lambda x: x[-1])[-1]
    scale = maxspan / .65

    self.gg = gg = {}
    self.rr = []
    ymids = []
    vpad = .012 * w / h
    for i in range(bf.ncols):
        ext = exts[i]
        ef = extras[i] if extras else None
        r = Region(root, ext, lo[i], bed, scale, switch,
                   chr_label=chr_label, loc_label=loc_label,
                   vpad=vpad, extra_features=ef)
        self.rr.append(r)
        # Use tid and accn to store gene positions
        gg.update(dict(((i, k), v) for k, v in r.gg.items()))
        ymids.append(r.y)

    for i, j in lo.edges:
        # Plain shades first, highlighted ones on top (higher zorder).
        for ga, gb, highlight in bf.iter_pairs(i, j):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = (ymids[i] + ymids[j]) / 2
            Shade(root, a, b, ymid, fc="gainsboro", lw=0, alpha=1)

        for ga, gb, highlight in bf.iter_pairs(i, j, highlight=True):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = (ymids[i] + ymids[j]) / 2
            Shade(root, a, b, ymid, alpha=1, highlight=highlight, zorder=2)

    if scalebar:
        print("Build scalebar (scale={})".format(scale), file=sys.stderr)
        # Find the best length of the scalebar
        ar = [1, 2, 5]
        candidates = [1000 * x for x in ar] + [10000 * x for x in ar] + \
                     [100000 * x for x in ar]
        # Find the one that's close to an optimal canvas size
        dists = [(abs(x / scale - .12), x) for x in candidates]
        dist, candidate = min(dists)
        dist = candidate / scale
        x, y, yp = .2, .96, .005
        a, b = x - dist / 2, x + dist / 2
        lsg = "lightslategrey"
        root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([a, b], [y, y], "-", lw=2, color=lsg)
        root.text(x, y + .02, human_size(candidate, precision=0),
                  ha="center", va="center")

    if tree:
        from jcvi.graphics.tree import draw_tree, read_trees

        trees = read_trees(tree)
        ntrees = len(trees)
        logging.debug("A total of {0} trees imported.".format(ntrees))
        # Trees sit side by side in equal-width panels below the regions.
        xiv = 1. / ntrees
        yiv = .3
        xstart = 0
        ystart = min(ymids) - .4
        for i in range(ntrees):
            ax = fig.add_axes([xstart, ystart, xiv, yiv])
            label, outgroup, tx = trees[i]
            draw_tree(ax, tx, outgroup=outgroup, rmargin=.4, leaffont=11)
            xstart += xiv
            RoundLabel(ax, .5, .3, label, fill=True, fc="lavender", color="r")
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively. Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(
        vmax=50000,
        bins=100,
        xlabel="Read length",
        title="Read length distribution",
    )
    p.add_option(
        "--ylabel1", default="Counts", help="Label of y-axis on the left"
    )
    color_choices = [str(x) for x in range(8)]
    p.add_option(
        "--color",
        default="0",
        choices=color_choices,
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    fastafile, _qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    read_lengths = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(read_lengths, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of read counts per length bin.
    bar_width = (xmax - xmin) * 0.5 / bins
    bar_color = set2[int(opts.color)]
    ax1.bar(left, height, width=bar_width, linewidth=0, fc=bar_color,
            align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x.
    ax2 = ax1.twinx()
    total_size, l50, n50 = sizes.summary
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    cdf = {}
    consumed = 0
    for length in read_lengths:
        # Lengths are sorted, so the first occurrence sees the full remainder.
        if length not in cdf:
            cdf[length] = (total_size - consumed) * 1.0 / unit
        consumed += length
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, "-", color="darkslategray")
    ax2.set_ylabel("{0} above read length".format(tag))

    for axis in (ax1, ax2):
        set_ticklabels_helvetica(axis)
        axis.set_xlim((xmin - bar_width / 2, xmax + bar_width / 2))

    # Summary lines, echoed to stderr and stacked in the top-right corner.
    summary_lines = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(read_lengths))),
        "Median read length: {0}bp".format(thousands(np.median(read_lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    text_x, text_y = 0.95, 0.95
    for line in summary_lines:
        print(line, file=sys.stderr)
        ax1.text(text_x, text_y, line, color="gray",
                 transform=ax1.transAxes, ha="right")
        text_y -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)

    savefig(sizes.filename + ".pdf")
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively. Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import plt, markup, human_formatter, \
        human_base_formatter, savefig, set2, set_ticklabels_helvetica

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of read counts per length bin.
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x.
    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        # Lengths are sorted, so the first occurrence sees the full remainder.
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # print() with file= replaces the Python 2 `print >> sys.stderr`
        # chevron, which raises TypeError under Python 3.
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)

    figname = sizes.filename + ".pdf"
    savefig(figname)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, pad=.04, vpad=.012, extra_features=None):
    """Draw one region: backbone line, gene glyphs, extra features, labels.

    `ext` unpacks into (start, end, si, ei, chr, orientation, span);
    `layout` supplies position, ratio, rotation, colour, alignment and the
    `hidden` flag. Gene endpoints are stored in `self.gg`, keyed by `accn`.
    When the region is hidden, gene positions are still recorded but the
    backbone, gene glyphs and labels are not drawn.
    """
    x, y = layout.x, layout.y
    scale /= layout.ratio
    self.y = y
    rotation = layout.rotation
    tr = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) \
        + ax.transAxes
    inv = ax.transAxes.inverted()

    start, end, si, ei, chr, orientation, span = ext
    half_width = span / scale / 2
    xstart, xend = x - half_width, x + half_width
    self.xstart, self.xend = xstart, xend

    def cv(t):
        # Map a base position to a canvas x; `startbp` is bound further down.
        return xstart + abs(t - startbp) / scale

    hidden = layout.hidden

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (y, y), color="gray", transform=tr,
                lw=2, zorder=1)

    self.genes = genes = bed[si: ei + 1]
    startbp, endbp = start.start, end.end
    reversed_region = orientation == '-'
    if reversed_region:
        startbp, endbp = endbp, startbp

    if switch:
        chr = switch.get(chr, chr)
    # The unit suffix is dropped from the start and kept on the end.
    start_text = human_size(startbp, target="Mb")[:-2]
    end_text = human_size(endbp, target="Mb")
    label = "-".join((start_text, end_text))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend = gene.start, gene.end
        strand = gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if reversed_region:
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
        self.gg[gene.accn] = (a, b)

        fill = forward if strand == "+" else backward
        if not hidden:
            glyph = Glyph(ax, x1, x2, y, height, gradient=False, fc=fill,
                          zorder=3)
            glyph.set_transform(tr)

    # Extra features (like repeats)
    if extra_features:
        for feature in extra_features:
            x1, x2, a, b = self.get_coordinates(feature.start, feature.end,
                                                y, cv, tr, inv)
            glyph = Glyph(ax, x1, x2, y, height * 3 / 4, gradient=False,
                          fc='#ff7f00', zorder=2)
            glyph.set_transform(tr)

    ha, va = layout.ha, layout.va

    # The label is placed outside the region, so its alignment is mirrored.
    hpad = .02
    if ha == "left":
        xx, ha = xstart - hpad, "right"
    elif ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = x, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    if abs(rotation) > magic:
        cc = abs(rotation) / magic
    else:
        cc = 1
    if va == "top":
        yy = y + cc * pad
    elif va == "bottom":
        yy = y - cc * pad - .01
    else:
        yy = y

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((rotation, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor
    if chr_label and not hidden:
        bbox = dict(boxstyle="round", fc='w', ec='w', alpha=.5)
        ax.text(lx, ly + vpad, markup(chr), color=layout.color,
                ha=ha, va="center", rotation=trans_angle,
                bbox=bbox, zorder=10)
        ax.text(lx, ly - vpad, label, color="lightslategrey", size=10,
                ha=ha, va="center", rotation=trans_angle,
                bbox=bbox, zorder=10)
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, pad=.04, vpad=.012):
    """
    Draw a single region on `ax`: chromosome line, gene glyphs and labels.

    `ext` unpacks to (start, end, si, ei, chr, orientation, span); genes are
    bed[si:ei + 1]. `layout` provides x, y, ratio, rotation, hidden, ha, va
    and color. `scale` is divided by layout.ratio and then used to turn bp
    distances into x offsets.

    Sets self.y, self.xstart, self.xend, self.genes and self.gg, which maps
    each gene accn to its two end points after applying the rotated
    transform followed by the inverse of ax.transAxes.
    """
    first, last, si, ei, seqid, orientation, span = ext
    cx, cy = layout.x, layout.y
    angle = layout.rotation
    hidden = layout.hidden
    scale /= layout.ratio
    self.y = cy

    rotated = Affine2D().rotate_deg_around(cx, cy, angle) + ax.transAxes
    to_axes = ax.transAxes.inverted()

    half = span / scale / 2
    xstart, xend = cx - half, cx + half
    self.xstart, self.xend = xstart, xend

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (cy, cy), color="gray", transform=rotated,
                lw=2, zorder=1)

    self.genes = genes = bed[si: ei + 1]
    flipped = orientation == '-'
    startbp, endbp = first.start, last.end
    if flipped:
        startbp, endbp = endbp, startbp

    def cv(t):
        # bp position -> x coordinate, measured from the region's first base
        return xstart + abs(t - startbp) / scale

    if switch:
        seqid = switch.get(seqid, seqid)
    label = "-".join((human_size(startbp, target="Mb")[:-2],
                      human_size(endbp, target="Mb")))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend, strand = gene.start, gene.end, gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if flipped:
            strand = "+" if strand == "-" else "-"

        x1, x2 = cv(gstart), cv(gend)
        a = to_axes.transform(rotated.transform((x1, cy)))
        b = to_axes.transform(rotated.transform((x2, cy)))
        self.gg[gene.accn] = (a, b)

        color = "b" if strand == "+" else "g"
        if hidden:
            continue
        glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color,
                      zorder=3)
        glyph.set_transform(rotated)

    # A label requested on the left sits outside the left end and is
    # therefore right-aligned, and vice versa
    hpad = .02
    va = layout.va
    if layout.ha == "left":
        xx, ha = xstart - hpad, "right"
    elif layout.ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = cx, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    cc = 1
    if abs(angle) > magic:
        cc = abs(angle) / magic

    if va == "top":
        yy = cy + cc * pad
    elif va == "bottom":
        yy = cy - cc * pad
    else:
        yy = cy

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((angle, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if hidden or not chr_label:
        return

    text_kw = dict(ha=ha, va="center", rotation=trans_angle)
    ax.text(lx, ly + vpad, markup(seqid), color=layout.color, **text_kw)
    ax.text(lx, ly - vpad, label, color="k", **text_kw)
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.add_option(
        "--coverage",
        default=40,
        type="int",
        help="Expected sequence coverage",
    )
    p.add_option("--prefix", default="jf", help="Database prefix")
    p.add_option(
        "--nohist",
        default=False,
        action="store_true",
        help="Do not print histogram",
    )
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is inferred from the first input file only
    gzip = fq.endswith(".gz")

    # Floor division keeps the value passed to `-s` an integer; true
    # division would put a fractional number on the command line.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and let jellyfish read from stdin
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    # NOTE(review): `fastqfiles` is a space-joined string at this point;
    # confirm need_update() copes with that when several inputs are given.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # NOTE(review): the thread count is hard-coded here while `count` above
    # honours opts.cpus -- confirm whether that is intentional.
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
def plot(args):
    """
    %prog plot input.bed seqid

    Plot the matchings between the reconstructed pseudomolecules and the maps.
    Two types of visualizations are available in one canvas:

    1. Parallel axes, and matching markers are shown in connecting lines;
    2. Scatter plot.
    """
    from jcvi.graphics.base import plt, savefig, normalize_axes, \
                set2, panel_labels
    from jcvi.graphics.chromosome import Chromosome, GeneticMap, \
                HorizontalChromosome

    p = OptionParser(plot.__doc__)
    add_allmaps_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x6")

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Input file names are derived from the prefix of the bed file name
    inputbed, seqid = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".lifted.bed"
    agpfile = pf + ".agp"
    weightsfile = opts.weightsfile
    links = opts.links

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    allseqids = cc.seqids
    mapnames = cc.mapnames
    weights = Weights(weightsfile, mapnames)
    assert seqid in allseqids, "{0} not in {1}".format(seqid, allseqids)

    # Keep only the linkage groups with at least `links` counts on this seqid
    s = Scaffold(seqid, cc)
    mlgs = [k for k, v in s.mlg_counts.items() if v >= links]

    # Size of each linkage group = largest `function` value among its markers
    mlgsizes = {}
    for mlg in mlgs:
        mm = cc.extract_mlg(mlg)
        mlgsize = max(function(x) for x in mm)
        mlgsizes[mlg] = mlgsize

    # root spans the whole canvas; ax1 is the left half, ax2 the right half
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax1 = fig.add_axes([0, 0, .5, 1])
    ax2 = fig.add_axes([.5, 0, .5, 1])

    # Find the layout first
    ystart, ystop = .9, .1
    L = Layout(mlgsizes)
    coords = L.coords

    tip = .02
    marker_pos = {}
    # Palette
    # One colour per map name, then looked up per linkage group through the
    # part of its name before the first "-"
    colors = dict((mapname, set2[i]) for i, mapname in enumerate(mapnames))
    colors = dict((mlg, colors[mlg.split("-")[0]]) for mlg in mlgs)

    rhos = {}
    # Parallel coordinates
    for mlg, (x, y1, y2) in coords.items():
        mm = cc.extract_mlg(mlg)
        markers = [(m.accn, function(m)) for m in mm]  # exhaustive marker list
        xy = [(m.pos, function(m)) for m in mm if m.seqid == seqid]
        mx, my = zip(*xy)
        rho = spearmanr(mx, my)
        rhos[mlg] = rho
        # A negative correlation means the map runs opposite to the sequence
        flip = rho < 0

        g = GeneticMap(ax1, x, y1, y2, markers, tip=tip, flip=flip)
        # Label goes on the outer side of the map: left of maps in the left
        # half, right of maps in the right half
        extra = -3 * tip if x < .5 else 3 * tip
        ha = "right" if x < .5 else "left"
        mapname = mlg.split("-")[0]
        tlg = mlg.replace("_", ".")  # Latex does not like underscore char
        label = "{0} (w={1})".format(tlg, weights[mapname])
        ax1.text(x + extra, (y1 + y2) / 2, label, color=colors[mlg],
                 ha=ha, va="center", rotation=90)
        marker_pos.update(g.marker_pos)

    agp = AGP(agpfile)
    agp = [x for x in agp if x.object == seqid]
    chrsize = max(x.object_end for x in agp)

    # Pseudomolecules in the center
    r = ystart - ystop
    ratio = r / chrsize
    # NOTE: this lambda reads `ystart` and `ratio` at call time; both are
    # rebound further down, so it must only be used before that point
    f = lambda x: (ystart - ratio * x)
    patchstart = [f(x.object_beg) for x in agp if not x.is_gap]
    Chromosome(ax1, .5, ystart, ystop, width=2 * tip, patch=patchstart, lw=2)

    label = "{0} ({1})".format(seqid, human_size(chrsize, precision=0))
    ax1.text(.5, ystart + tip, label, ha="center")

    scatter_data = defaultdict(list)
    # Connecting lines
    for b in s.markers:
        marker_name = b.accn
        if marker_name not in marker_pos:
            continue

        cx = .5
        cy = f(b.pos)
        mx = coords[b.mlg][0]
        my = marker_pos[marker_name]

        extra = -tip if mx < cx else tip
        extra *= 1.25  # leave boundaries for aesthetic reasons
        cx += extra
        mx -= extra
        ax1.plot((cx, mx), (cy, my), "-", color=colors[b.mlg])
        # The same points feed the scatter plots drawn below
        scatter_data[b.mlg].append((b.pos, function(b)))

    # Scatter plot, same data as parallel coordinates
    xstart, xstop = sorted((ystart, ystop))
    # `f` is redefined for the horizontal axis and evaluated right away,
    # while `ratio` still holds r / chrsize
    f = lambda x: (xstart + ratio * x)
    pp = [x.object_beg for x in agp if not x.is_gap]
    patchstart = [f(x) for x in pp]
    HorizontalChromosome(ax2, xstart, xstop, ystop,
                         height=2 * tip, patch=patchstart, lw=2)

    gap = .03
    # From here on `ratio` converts linkage group size into panel height,
    # after reserving one `gap` per group plus `tip`
    ratio = (r - gap * len(mlgs) - tip) / sum(mlgsizes.values())

    tlgs = []
    # One scatter panel per linkage group, stacked downwards from ystart
    for mlg, mlgsize in sorted(mlgsizes.items()):
        height = ratio * mlgsize
        ystart -= height
        xx = .5 + xstart / 2
        width = r / 2
        color = colors[mlg]
        ax = fig.add_axes([xx, ystart, width, height])
        ypos = ystart + height / 2
        ystart -= gap
        sd = scatter_data[mlg]
        # `xx` is reused here for the scatter x values
        xx, yy = zip(*sd)
        ax.vlines(pp, 0, mlgsize, colors="beige")
        ax.plot(xx, yy, ".", color=color)
        rho = rhos[mlg]
        ax.text(.5, 1 - .4 * gap / height, r"$\rho$={0:.3f}".format(rho),
                ha="center", va="top", transform=ax.transAxes, color="gray")
        tlg = mlg.replace("_", ".")
        tlgs.append((tlg, ypos, color))
        ax.set_xlim(0, chrsize)
        ax.set_ylim(0, mlgsize)
        ax.set_xticks([])
        while height / len(ax.get_yticks()) < .03 and len(
                ax.get_yticks()) >= 2:
            ax.set_yticks(ax.get_yticks()[::2])  # Sparsify the ticks
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_yticklabels(yticklabels, family='Helvetica')
        if rho < 0:
            ax.invert_yaxis()

    # Panel titles along the vertical midline; with more than four panels
    # the alignment alternates between "left" and "right"
    for i, (tlg, ypos, color) in enumerate(tlgs):
        ha = "center"
        if len(tlgs) > 4:
            ha = "right" if i % 2 else "left"
        root.text(.5, ypos, tlg, color=color, rotation=90,
                  ha=ha, va="center")

    if opts.panels:
        labels = ((.04, .96, 'A'), (.48, .96, 'B'))
        panel_labels(root, labels)

    normalize_axes((ax1, ax2, root))
    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    plt.close(fig)
def __init__(
    self,
    ax,
    ext,
    layout,
    bed,
    scale,
    switch=None,
    chr_label=True,
    loc_label=True,
    genelabelsize=0,
    pad=0.05,
    vpad=0.015,
    extra_features=None,
    glyphstyle="box",
    glyphcolor: BasePalette = OrientationPalette(),
):
    """
    Draw one region on `ax`: chromosome line, gene glyphs (optionally with
    gene names), extra features and the chromosome / location labels.

    `ext` unpacks to (start, end, si, ei, chr, orientation, span); genes are
    bed[si:ei + 1]. `layout` supplies x, y, ratio, rotation, hidden, label,
    ha, va and color. `scale` is divided by layout.ratio and then converts
    bp distances into x offsets. `glyphcolor` is asked for a (color, zorder)
    pair per gene, keyed by strand when it is an OrientationPalette and by
    gene name otherwise.

    Sets self.y, self.xstart, self.xend, self.genes and self.gg, which maps
    each gene accn to the (a, b) pair returned by self.get_coordinates.
    """
    first, last, si, ei, seqid, orientation, span = ext
    cx, cy = layout.x, layout.y
    angle = layout.rotation
    hidden = layout.hidden
    scale /= layout.ratio
    self.y = cy

    rotated = (
        mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) + ax.transAxes
    )
    to_axes = ax.transAxes.inverted()

    half = span / scale / 2
    xstart, xend = cx - half, cx + half
    self.xstart, self.xend = xstart, xend

    # Chromosome
    if not hidden:
        ax.plot(
            (xstart, xend), (cy, cy), color="gray", transform=rotated, lw=2, zorder=1
        )

    self.genes = genes = bed[si : ei + 1]
    flipped = orientation == "-"
    startbp, endbp = first.start, last.end
    if flipped:
        startbp, endbp = endbp, startbp

    def cv(t):
        # bp position -> x coordinate, measured from the region's first base
        return xstart + abs(t - startbp) / scale

    # An explicit layout label wins over the switch table
    if switch:
        seqid = switch.get(seqid, seqid)
    if layout.label:
        seqid = layout.label

    label = "-".join(
        (
            human_size(startbp, target="Mb", precision=2)[:-2],
            human_size(endbp, target="Mb", precision=2),
        )
    )

    height = 0.012
    by_strand = isinstance(glyphcolor, OrientationPalette)
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend, strand = gene.start, gene.end, gene.strand
        if strand == "-":
            gstart, gend = gend, gstart
        if flipped:
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, cy, cv, rotated, to_axes)
        gene_name = gene.accn
        self.gg[gene_name] = (a, b)

        palette_key = strand if by_strand else gene_name
        color, zorder = glyphcolor.get_color_and_zorder(palette_key)

        if hidden:
            continue
        glyph = Glyph(
            ax,
            x1,
            x2,
            cy,
            height,
            gradient=False,
            fc=color,
            style=glyphstyle,
            zorder=zorder,
        )
        glyph.set_transform(rotated)
        if genelabelsize:
            ax.text(
                (x1 + x2) / 2,
                cy + height / 2 + genelabelsize * vpad / 3,
                markup(gene_name),
                size=genelabelsize,
                rotation=25,
                ha="left",
                va="center",
                color="lightslategray",
            )

    # Extra features (like repeats), drawn thinner than the genes
    for feat in extra_features or ():
        x1, x2, a, b = self.get_coordinates(
            feat.start, feat.end, cy, cv, rotated, to_axes
        )
        glyph = Glyph(
            ax,
            x1,
            x2,
            cy,
            height * 3 / 4,
            gradient=False,
            fc="#ff7f00",
            style=glyphstyle,
            zorder=2,
        )
        glyph.set_transform(rotated)

    # A label requested on the left sits outside the left end and is
    # therefore right-aligned, and vice versa
    hpad = 0.02
    va = layout.va
    if layout.ha == "left":
        xx, ha = xstart - hpad, "right"
    elif layout.ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = cx, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.0
    cc = 1
    if abs(angle) > magic:
        cc = abs(angle) / magic

    if va == "top":
        yy = cy + cc * pad
    elif va == "bottom":
        yy = cy - cc * pad
    else:
        yy = cy

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((angle,)), anchor.reshape((1, 2))
    )[0]
    lx, ly = anchor

    if hidden:
        return

    text_kw = dict(
        ha=ha,
        va="center",
        rotation=trans_angle,
        bbox=dict(boxstyle="round", fc="w", ec="w", alpha=0.5),
        zorder=10,
    )

    # TODO: I spent several hours on trying to make this work - with no
    # good solutions. To generate labels on multiple lines, each line
    # with a different style is difficult in matplotlib. The only way,
    # if you can tolerate an extra dot (.), is to use the recipe below.
    # chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
    # loc_label = r"\noindent . \\ " + label if loc_label else None

    chr_text = markup(seqid) if chr_label else None
    loc_text = label if loc_label else None
    # Nothing is drawn unless there is a chromosome label to show
    if not chr_text:
        return

    if loc_text:
        ax.text(lx, ly + vpad, chr_text, color=layout.color, **text_kw)
        ax.text(lx, ly - vpad, loc_text, color="lightslategrey", size=10, **text_kw)
    else:
        ax.text(lx, ly, chr_text, color=layout.color, **text_kw)
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 score +/-

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true",
                 help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED output is built from base coordinates, so --bed implies --coords
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # Column order follows the rows written below: score, then orientation
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Score|Orientation"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                 "SizeA|SizeB|Size|Block"

    nblocks = 0  # counted explicitly so an empty anchor file is reported as 0
    atotalbase = btotalbase = 0
    # The context manager closes the file even if a block fails to process
    with open(simplefile, "w") as fws:
        if header:
            print >> fws, "\t".join(h.split("|"))

        for i, block in enumerate(blocks):
            nblocks += 1
            a, b, _ = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, _ = zip(*a)
            ib, _ = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Orientation is the sign of the slope of a linear fit of
            # subject order against query order
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Score is the geometric mean of the two gene spans
            score = int((aspan * bspan) ** .5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in \
                               (bseqid, bstartbase - 1, bendbase,
                               "{}:{}-{}".format(aseqid, astartbase, aendbase),
                               size, orientation))))

                for row in (aargs, bargs):
                    print >> fws, "\t".join(str(x) for x in row)
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [astarti, aendi, bstarti, bendi,
                        sizeA, sizeB, size, block_id]
            print >> fws, "\t".join(str(x) for x in row)

    logging.debug("A total of {0} blocks written to `{1}`.".format(nblocks,
                  simplefile))

    if coords:
        print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2))
        print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2))
        # The ratio is undefined when either side has no span (e.g. no blocks)
        smaller = min(atotalbase, btotalbase)
        if smaller > 0:
            print >> sys.stderr, "Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / smaller)

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def __init__(
    self,
    fig,
    root,
    datafile,
    bedfile,
    layoutfile,
    switch=None,
    tree=None,
    extra_features=None,
    chr_label=True,
    loc_label=True,
    genelabelsize=0,
    pad=0.05,
    vpad=0.015,
    scalebar=False,
    shadestyle="curve",
    glyphstyle="arrow",
    glyphcolor: BasePalette = OrientationPalette(),
):
    """
    Build the whole synteny figure: one Region per column of the block
    file, shades connecting matching genes along the layout edges, and
    optionally a scale bar and a row of trees.

    fig, root: figure and the axes everything except the trees is drawn on
    datafile, bedfile, layoutfile: inputs for BlockFile, Bed and Layout
    switch: optional tab-delimited file, loaded with DictFile and passed to
        each Region
    tree: optional file read with read_trees; each tree gets its own axes
    extra_features: optional BED file; features overlapping each region are
        extracted and those shorter than span / 1000 are dropped
    chr_label, loc_label, genelabelsize, vpad, glyphstyle: passed through
        to Region
    pad: unit for the vertical offset of shades whose edge has a `samearc`
    scalebar: draw a scale bar near the top left when true
    shadestyle: passed to Shade as `style`
    glyphcolor: the string "orientation" or an OrientationPalette instance
        selects strand-based colouring; anything else selects an
        OrthoGroupPalette built from bf.grouper()

    Sets self.layout, self.gg (keyed by (column index, gene accn)) and
    self.rr (the Region objects).
    """
    bed = Bed(bedfile)
    order = bed.order
    bf = BlockFile(datafile)
    self.layout = lo = Layout(layoutfile)
    switch = DictFile(switch, delimiter="\t") if switch else None
    if extra_features:
        extra_features = Bed(extra_features)

    exts = []
    extras = []
    for i in range(bf.ncols):
        ext = bf.get_extent(i, order)
        exts.append(ext)
        if extra_features:
            start, end, si, ei, chr, orientation, span = ext
            start, end = start.start, end.end  # start, end coordinates
            ef = list(extra_features.extract(chr, start, end))

            # Pruning removes minor features with < 0.1% of the region
            ef_pruned = [x for x in ef if x.span >= span / 1000]
            print(
                "Extracted {0} features "
                "({1} after pruning)".format(len(ef), len(ef_pruned)),
                file=sys.stderr,
            )
            extras.append(ef_pruned)

    # The widest region occupies 65% of the canvas width
    maxspan = max(exts, key=lambda x: x[-1])[-1]
    scale = maxspan / 0.65

    self.gg = gg = {}
    self.rr = []
    ymids = []
    # The declared default is a palette instance, while the comparison
    # below expects the string "orientation"; accept both so the default
    # really means strand-based colouring.
    if isinstance(glyphcolor, OrientationPalette):
        pass
    elif glyphcolor == "orientation":
        glyphcolor = OrientationPalette()
    else:
        glyphcolor = OrthoGroupPalette(bf.grouper())

    for i in range(bf.ncols):
        ext = exts[i]
        ef = extras[i] if extras else None
        r = Region(
            root,
            ext,
            lo[i],
            bed,
            scale,
            switch,
            genelabelsize=genelabelsize,
            chr_label=chr_label,
            loc_label=loc_label,
            vpad=vpad,
            extra_features=ef,
            glyphstyle=glyphstyle,
            glyphcolor=glyphcolor,
        )
        self.rr.append(r)
        # Use tid and accn to store gene positions
        gg.update(dict(((i, k), v) for k, v in r.gg.items()))
        ymids.append(r.y)

    def offset(samearc):
        # Vertical shift for shades drawn on one side of a track; returns
        # None for any value other than the four handled here
        if samearc == "above":
            return 2 * pad
        if samearc == "above2":
            return 4 * pad
        if samearc == "below":
            return -2 * pad
        if samearc == "below2":
            return -4 * pad

    def edge_ymid(i, j, samearc):
        # With `samearc` the shade is anchored to track i alone; otherwise
        # it sits midway between the two tracks
        if samearc is not None:
            return ymids[i] + offset(samearc)
        return (ymids[i] + ymids[j]) / 2

    for i, j, blockcolor, samearc in lo.edges:
        # Plain shades first, highlighted pairs on top (zorder=2)
        for ga, gb, h in bf.iter_pairs(i, j):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = edge_ymid(i, j, samearc)
            Shade(root, a, b, ymid, fc=blockcolor, lw=0, alpha=1, style=shadestyle)

        for ga, gb, h in bf.iter_pairs(i, j, highlight=True):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = edge_ymid(i, j, samearc)
            Shade(
                root, a, b, ymid, alpha=1, highlight=h, zorder=2, style=shadestyle
            )

    if scalebar:
        print("Build scalebar (scale={})".format(scale), file=sys.stderr)
        # Find the best length of the scalebar
        ar = [1, 2, 5]
        candidates = (
            [1000 * x for x in ar]
            + [10000 * x for x in ar]
            + [100000 * x for x in ar]
        )
        # Find the one that's close to an optimal canvas size
        dists = [(abs(x / scale - 0.12), x) for x in candidates]
        dist, candidate = min(dists)
        dist = candidate / scale
        x, y, yp = 0.22, 0.92, 0.005
        a, b = x - dist / 2, x + dist / 2
        lsg = "lightslategrey"
        # Two end ticks and the bar itself
        root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([a, b], [y, y], "-", lw=2, color=lsg)
        root.text(
            x,
            y + 0.02,
            human_size(candidate, precision=0),
            ha="center",
            va="center",
        )

    if tree:
        from jcvi.graphics.tree import draw_tree, read_trees

        trees = read_trees(tree)
        ntrees = len(trees)
        logging.debug("A total of {0} trees imported.".format(ntrees))
        # Trees share the canvas width equally and sit below the lowest track
        xiv = 1.0 / ntrees
        yiv = 0.3
        xstart = 0
        ystart = min(ymids) - 0.4
        for i in range(ntrees):
            ax = fig.add_axes([xstart, ystart, xiv, yiv])
            label, outgroup, color, tx = trees[i]
            draw_tree(
                ax,
                tx,
                outgroup=outgroup,
                rmargin=0.4,
                leaffont=11,
                treecolor=color,
                supportcolor=color,
                leafcolor=color,
            )
            xstart += xiv
            RoundLabel(ax, 0.5, 0.3, label, fill=True, fc="lavender", color=color)
def __init__(self, fig, root, datafile, bedfile, layoutfile,
             switch=None, tree=None, extra_features=None,
             chr_label=True, loc_label=True, pad=.05, vpad=.015,
             scalebar=False):
    """
    Assemble the synteny figure: one Region per block-file column, shades
    between matching genes along each layout edge, plus an optional scale
    bar and an optional row of trees.

    datafile, bedfile and layoutfile are loaded with BlockFile, Bed and
    Layout. `switch`, when given, is loaded with DictFile and handed to
    each Region; `extra_features`, when given, is loaded with Bed and the
    features overlapping each region are extracted and pruned.

    Sets self.layout, self.gg (keyed by (column index, gene accn)) and
    self.rr (the Region objects).
    """
    w, h = fig.get_figwidth(), fig.get_figheight()
    bed = Bed(bedfile)
    order = bed.order
    bf = BlockFile(datafile)
    self.layout = lo = Layout(layoutfile)
    if switch:
        switch = DictFile(switch, delimiter="\t")
    else:
        switch = None
    if extra_features:
        extra_features = Bed(extra_features)

    exts = []
    extras = []
    for col in xrange(bf.ncols):
        ext = bf.get_extent(col, order)
        exts.append(ext)
        if not extra_features:
            continue
        first, last, si, ei, seqid, orientation, span = ext
        found = list(extra_features.extract(seqid, first.start, last.end))

        # Pruning removes minor features with < 0.1% of the region
        kept = [x for x in found if x.span >= span / 1000]
        print >> sys.stderr, "Extracted {0} features "\
                "({1} after pruning)".format(len(found), len(kept))
        extras.append(kept)

    # The widest region occupies 65% of the canvas width
    maxspan = max(ext[-1] for ext in exts)
    scale = maxspan / .65

    self.gg = gg = {}
    self.rr = []
    ymids = []
    #vpad = .012 * w / h
    for col, ext in enumerate(exts):
        features = extras[col] if extras else None
        region = Region(root, ext, lo[col], bed, scale, switch,
                        chr_label=chr_label, loc_label=loc_label,
                        vpad=vpad, extra_features=features)
        self.rr.append(region)
        # Use tid and accn to store gene positions
        for accn, ends in region.gg.items():
            gg[(col, accn)] = ends
        ymids.append(region.y)

    for i, j in lo.edges:
        # Plain shades first, highlighted pairs on top (zorder=2)
        for ga, gb, h in bf.iter_pairs(i, j):
            ymid = (ymids[i] + ymids[j]) / 2
            Shade(root, gg[(i, ga)], gg[(j, gb)], ymid,
                  fc="gainsboro", lw=0, alpha=1)

        for ga, gb, h in bf.iter_pairs(i, j, highlight=True):
            ymid = (ymids[i] + ymids[j]) / 2
            Shade(root, gg[(i, ga)], gg[(j, gb)], ymid,
                  alpha=1, highlight=h, zorder=2)

    if scalebar:
        print >> sys.stderr, "Build scalebar (scale={})".format(scale)
        # Find the best length of the scalebar
        candidates = [unit * x for unit in (1000, 10000, 100000)
                      for x in (1, 2, 5)]
        # Find the one that's close to an optimal canvas size; ties go to
        # the shorter candidate
        candidate = min(candidates,
                        key=lambda c: (abs(c / scale - .12), c))
        barlen = candidate / scale
        x, y, yp = .2, .96, .005
        left, right = x - barlen / 2, x + barlen / 2
        lsg = "lightslategrey"
        # Two end ticks and the bar itself
        root.plot([left, left], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([right, right], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([left, right], [y, y], "-", lw=2, color=lsg)
        root.text(x, y + .02, human_size(candidate, precision=0),
                  ha="center", va="center")

    if tree:
        from jcvi.graphics.tree import draw_tree, read_trees

        trees = read_trees(tree)
        ntrees = len(trees)
        logging.debug("A total of {0} trees imported.".format(ntrees))
        # Trees share the canvas width equally and sit below the lowest track
        xiv = 1. / ntrees
        yiv = .3
        xstart = 0
        ystart = min(ymids) - .4
        for label, outgroup, tx in trees:
            ax = fig.add_axes([xstart, ystart, xiv, yiv])
            draw_tree(ax, tx, outgroup=outgroup, rmargin=.4, leaffont=11)
            xstart += xiv
            RoundLabel(ax, .5, .3, label, fill=True, fc="lavender", color="r")
def __init__(self, ax, ext, layout, bed, scale, switch=None,
             chr_label=True, loc_label=True,
             pad=.05, vpad=.015, extra_features=None):
    """
    Draw one region on `ax`: chromosome line, gene glyphs, extra features
    and the chromosome / location labels.

    `ext` unpacks to (start, end, si, ei, chr, orientation, span); genes are
    bed[si:ei + 1]. `layout` supplies x, y, ratio, rotation, hidden, label,
    ha, va and color. `scale` is divided by layout.ratio and then converts
    bp distances into x offsets.

    Sets self.y, self.xstart, self.xend, self.genes and self.gg, which maps
    each gene accn to the (a, b) pair returned by self.get_coordinates.
    """
    first, last, si, ei, seqid, orientation, span = ext
    cx, cy = layout.x, layout.y
    angle = layout.rotation
    hidden = layout.hidden
    scale /= layout.ratio
    self.y = cy

    rotated = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) \
        + ax.transAxes
    to_axes = ax.transAxes.inverted()

    half = span / scale / 2
    xstart, xend = cx - half, cx + half
    self.xstart, self.xend = xstart, xend

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (cy, cy), color="gray", transform=rotated,
                lw=2, zorder=1)

    self.genes = genes = bed[si: ei + 1]
    flipped = orientation == '-'
    startbp, endbp = first.start, last.end
    if flipped:
        startbp, endbp = endbp, startbp

    def cv(t):
        # bp position -> x coordinate, measured from the region's first base
        return xstart + abs(t - startbp) / scale

    # An explicit layout label wins over the switch table
    if switch:
        seqid = switch.get(seqid, seqid)
    if layout.label:
        seqid = layout.label

    label = "-".join((human_size(startbp, target="Mb", precision=2)[:-2],
                      human_size(endbp, target="Mb", precision=2)))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend, strand = gene.start, gene.end, gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if flipped:
            strand = "+" if strand == "-" else "-"

        x1, x2, a, b = self.get_coordinates(gstart, gend, cy, cv,
                                            rotated, to_axes)
        self.gg[gene.accn] = (a, b)

        color = forward if strand == "+" else backward
        if hidden:
            continue
        glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color,
                      zorder=3)
        glyph.set_transform(rotated)

    # Extra features (like repeats), drawn thinner and below the genes
    for feat in (extra_features or ()):
        x1, x2, a, b = self.get_coordinates(feat.start, feat.end, cy, cv,
                                            rotated, to_axes)
        glyph = Glyph(ax, x1, x2, cy, height * 3 / 4, gradient=False,
                      fc='#ff7f00', zorder=2)
        glyph.set_transform(rotated)

    # A label requested on the left sits outside the left end and is
    # therefore right-aligned, and vice versa
    hpad = .02
    va = layout.va
    if layout.ha == "left":
        xx, ha = xstart - hpad, "right"
    elif layout.ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = cx, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    cc = 1
    if abs(angle) > magic:
        cc = abs(angle) / magic

    if va == "top":
        yy = cy + cc * pad
    elif va == "bottom":
        yy = cy - cc * pad
    else:
        yy = cy

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((angle, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if hidden:
        return

    text_kw = dict(ha=ha, va="center", rotation=trans_angle,
                   bbox=dict(boxstyle="round", fc='w', ec='w', alpha=.5),
                   zorder=10)

    # TODO: I spent several hours on trying to make this work - with no
    # good solutions. To generate labels on multiple lines, each line
    # with a different style is difficult in matplotlib. The only way,
    # if you can tolerate an extra dot (.), is to use the recipe below.
    #chr_label = r"\noindent " + markup(chr) + r" \\ ." if chr_label else None
    #loc_label = r"\noindent . \\ " + label if loc_label else None

    chr_text = markup(seqid) if chr_label else None
    loc_text = label if loc_label else None
    # Nothing is drawn unless there is a chromosome label to show
    if not chr_text:
        return

    if loc_text:
        ax.text(lx, ly + vpad, chr_text, color=layout.color, **text_kw)
        ax.text(lx, ly - vpad, loc_text, color="lightslategrey", size=10,
                **text_kw)
    else:
        ax.text(lx, ly, chr_text, color=layout.color, **text_kw)
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 score +/-

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true", \
                help="Output additional columns [default: %default]")
    p.add_option(
        "--coords",
        default=False,
        action="store_true",
        help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Generate BED file for the blocks")
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # BED output is built from base coordinates, so --bed implies --coords
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # Column order follows the rows written below: score, then orientation
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Score|Orientation"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                  "SizeA|SizeB|Size|Block"

    nblocks = 0  # counted explicitly so an empty anchor file is reported as 0
    atotalbase = btotalbase = 0
    # The context manager closes the file even if a block fails to process
    with open(simplefile, "w") as fws:
        if header:
            print >> fws, "\t".join(h.split("|"))

        for i, block in enumerate(blocks):
            nblocks += 1
            a, b, _ = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, _ = zip(*a)
            ib, _ = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Orientation is the sign of the slope of a linear fit of
            # subject order against query order
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            # Score is the geometric mean of the two gene spans
            score = int((aspan * bspan)**.5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                    get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                    get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [
                    block_id, aseqid, astartbase, aendbase, abase, astart,
                    aend, aspan, "+"
                ]
                bargs = [
                    block_id, bseqid, bstartbase, bendbase, bbase, bstart,
                    bend, bspan, orientation
                ]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in \
                               (bseqid, bstartbase - 1, bendbase,
                               "{}:{}-{}".format(aseqid, astartbase, aendbase),
                               size, orientation))))

                for row in (aargs, bargs):
                    print >> fws, "\t".join(str(x) for x in row)
                continue

            row = [astart, aend, bstart, bend, score, orientation]
            if additional:
                row += [
                    astarti, aendi, bstarti, bendi, sizeA, sizeB, size,
                    block_id
                ]
            print >> fws, "\t".join(str(x) for x in row)

    logging.debug("A total of {0} blocks written to `{1}`.".format(
        nblocks, simplefile))

    if coords:
        print >> sys.stderr, "Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2))
        print >> sys.stderr, "Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2))
        # The ratio is undefined when either side has no span (e.g. no blocks)
        smaller = min(atotalbase, btotalbase)
        if smaller > 0:
            print >> sys.stderr, "Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / smaller)

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def __init__(self,
             ax,
             ext,
             layout,
             bed,
             scale,
             switch=None,
             chr_label=True,
             pad=.04,
             vpad=.012):
    """
    Draw a single region on `ax`: chromosome line, gene glyphs and labels.

    `ext` unpacks to (start, end, si, ei, chr, orientation, span); genes are
    bed[si:ei + 1]. `layout` provides x, y, ratio, rotation, hidden, ha, va
    and color. `scale` is divided by layout.ratio and then used to turn bp
    distances into x offsets.

    Sets self.y, self.xstart, self.xend, self.genes and self.gg, which maps
    each gene accn to its two end points after applying the rotated
    transform followed by the inverse of ax.transAxes.
    """
    first, last, si, ei, seqid, orientation, span = ext
    cx, cy = layout.x, layout.y
    angle = layout.rotation
    hidden = layout.hidden
    scale /= layout.ratio
    self.y = cy

    rotated = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, angle) \
        + ax.transAxes
    to_axes = ax.transAxes.inverted()

    half = span / scale / 2
    xstart, xend = cx - half, cx + half
    self.xstart, self.xend = xstart, xend

    # Chromosome
    if not hidden:
        ax.plot((xstart, xend), (cy, cy), color="gray", transform=rotated,
                lw=2, zorder=1)

    self.genes = genes = bed[si:ei + 1]
    flipped = orientation == '-'
    startbp, endbp = first.start, last.end
    if flipped:
        startbp, endbp = endbp, startbp

    def cv(t):
        # bp position -> x coordinate, measured from the region's first base
        return xstart + abs(t - startbp) / scale

    if switch:
        seqid = switch.get(seqid, seqid)
    label = "-".join((human_size(startbp, target="Mb")[:-2],
                      human_size(endbp, target="Mb")))

    height = .012
    self.gg = {}
    # Genes
    for gene in genes:
        gstart, gend, strand = gene.start, gene.end, gene.strand
        if strand == '-':
            gstart, gend = gend, gstart
        if flipped:
            strand = "+" if strand == "-" else "-"

        x1, x2 = cv(gstart), cv(gend)
        a = to_axes.transform(rotated.transform((x1, cy)))
        b = to_axes.transform(rotated.transform((x2, cy)))
        self.gg[gene.accn] = (a, b)

        color = "b" if strand == "+" else "g"
        if hidden:
            continue
        glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=color,
                      zorder=3)
        glyph.set_transform(rotated)

    # A label requested on the left sits outside the left end and is
    # therefore right-aligned, and vice versa
    hpad = .02
    va = layout.va
    if layout.ha == "left":
        xx, ha = xstart - hpad, "right"
    elif layout.ha == "right":
        xx, ha = xend + hpad, "left"
    else:
        xx, ha = cx, "center"

    # Tentative solution to labels stick into glyph
    magic = 40.
    cc = 1
    if abs(angle) > magic:
        cc = abs(angle) / magic

    if va == "top":
        yy = cy + cc * pad
    elif va == "bottom":
        yy = cy - cc * pad
    else:
        yy = cy

    anchor = np.array((xx, yy))
    trans_angle = ax.transAxes.transform_angles(
        np.array((angle, )), anchor.reshape((1, 2)))[0]
    lx, ly = anchor

    if hidden or not chr_label:
        return

    text_kw = dict(ha=ha, va="center", rotation=trans_angle)
    ax.text(lx, ly + vpad, markup(seqid), color=layout.color, **text_kw)
    ax.text(lx, ly - vpad, label, color="k", **text_kw)