def print_stats(qrycovered, refcovered, id_pct):
    """Write a three-line coverage/identity summary to stderr.

    Args:
        qrycovered: Query coverage in bp; rendered through `thousands`.
        refcovered: Reference coverage in bp; rendered through `thousands`.
        id_pct: Identity value, printed with one decimal place and a
            trailing "%".
    """
    from jcvi.utils.cbook import thousands

    m1 = "Reference coverage: {0} bp".format(thousands(refcovered))
    m2 = "Query coverage: {0} bp".format(thousands(qrycovered))
    m3 = "Identity: {0:.1f}%".format(id_pct)
    # Use the print() function: under Python 3 the old `print >> stream, ...`
    # statement form evaluates `print >> sys.stderr` and raises TypeError.
    print("\n".join((m1, m2, m3)), file=sys.stderr)
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    # `thousands`, `Bed`, `OptionParser` and `sys` are not imported in this
    # function and are expected to come from the enclosing module.
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not p.print_help()` matches the idiom used by the other commands in
        # this file; assuming print_help() returns None (as optparse's does),
        # this exits with status 1 on bad usage instead of 0.
        sys.exit(not p.print_help())

    (bedfile,) = args
    bed = Bed(bedfile)
    stats = SummaryStats([x.span for x in bed])
    # print() function form: `print >> sys.stderr, ...` is a TypeError at
    # runtime under Python 3.
    print("Total seqids: {0}".format(len(bed.seqids)), file=sys.stderr)
    print("Total ranges: {0}".format(len(bed)), file=sys.stderr)

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    print(
        "Total unique bases: {0} bp".format(thousands(unique_bases)),
        file=sys.stderr,
    )
    print("Total bases: {0} bp".format(thousands(total_bases)), file=sys.stderr)
    # Ratio of summed interval lengths to the unique footprint.
    print(
        "Estimated coverage: {0:.1f}x".format(total_bases * 1.0 / unique_bases),
        file=sys.stderr,
    )
    print(stats, file=sys.stderr)
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # NOTE: the docstring doubles as the command's usage text (it is passed to
    # OptionParser), so it must stay as-is.
    from jcvi.utils.cbook import SummaryStats

    def emit(message):
        # All report lines go to stderr. The print() function replaces the
        # Python 2 `print >> sys.stderr, ...` statement, which raises
        # TypeError when executed by Python 3.
        print(message, file=sys.stderr)

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Same bad-usage idiom as the other commands in this file: assuming
        # print_help() returns None, `not None` gives exit status 1 (the
        # previous `sys.exit(p.print_help())` exited with status 0).
        sys.exit(not p.print_help())

    (bedfile,) = args
    bed = Bed(bedfile)
    stats = SummaryStats([x.span for x in bed])
    emit("Total seqids: {0}".format(len(bed.seqids)))
    emit("Total ranges: {0}".format(len(bed)))

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    # `thousands` is not imported locally; it is expected at module scope.
    emit("Total unique bases: {0} bp".format(thousands(unique_bases)))
    emit("Total bases: {0} bp".format(thousands(total_bases)))
    emit("Estimated coverage: {0:.1f}x".format(total_bases * 1.0 / unique_bases))
    emit(stats)
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # The docstring is used as the OptionParser usage string; keep it verbatim.
    import numpy as np

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on bad usage (print_help() presumably returns None, as
        # in optparse), matching the other commands in this file.
        sys.exit(not p.print_help())

    (bedfile,) = args
    bed = Bed(bedfile)

    # Mean and standard deviation of the interval spans, truncated to ints.
    spans = np.array([x.span for x in bed])
    avg = int(np.average(spans))
    std = int(np.std(spans))

    # print() function form throughout: `print >> sys.stderr, ...` raises
    # TypeError at runtime under Python 3.
    print("Total seqids: {0}".format(len(bed.seqids)), file=sys.stderr)
    print("Total ranges: {0}".format(len(bed)), file=sys.stderr)

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    # `thousands` is expected to be available at module scope.
    print(
        "Total unique bases: {0} bp".format(thousands(unique_bases)),
        file=sys.stderr,
    )
    print("Total bases: {0} bp".format(thousands(total_bases)), file=sys.stderr)
    print(
        "Estimated coverage: {0:.1f}x".format(total_bases * 1.0 / unique_bases),
        file=sys.stderr,
    )
    print("Average spans: {0}, stdev: {1}".format(avg, std), file=sys.stderr)
def report(self):
    """Print the interval summary stored on this instance to stderr.

    Reads `nseqids`, `nfeats`, `unique_bases`, `total_bases`, `coverage`,
    `stats` and `mspans` from `self`. `mspans` is unpacked as
    (span, accession) pairs; the largest and smallest pair are reported as
    the longest and shortest features.
    """
    # print() function form: the Python 2 `print >> sys.stderr, ...` statement
    # raises TypeError when run under Python 3.
    print("Total seqids: {0}".format(self.nseqids), file=sys.stderr)
    print("Total ranges: {0}".format(self.nfeats), file=sys.stderr)
    print(
        "Total unique bases: {0} bp".format(thousands(self.unique_bases)),
        file=sys.stderr,
    )
    print(
        "Total bases: {0} bp".format(thousands(self.total_bases)),
        file=sys.stderr,
    )
    print("Estimated coverage: {0:.1f}x".format(self.coverage), file=sys.stderr)
    print(self.stats, file=sys.stderr)

    # Tuples compare by span first, so max/min pick the extreme spans.
    maxspan, maxaccn = max(self.mspans)
    minspan, minaccn = min(self.mspans)
    print("Longest: {0} ({1})".format(maxaccn, maxspan), file=sys.stderr)
    print("Shortest: {0} ({1})".format(minaccn, minspan), file=sys.stderr)
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # The docstring is the OptionParser usage text; keep it verbatim.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fillfile,) = args
    scaffolds = 0
    gaps = []
    # Context manager so the handle is closed even if FillLine() raises
    # (the handle was previously never closed).
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":  # header line: one per scaffold
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    # Partition gaps by whether they were closed; sizes use `before`.
    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    print(
        "Closed gaps: {0} size: {1} bp".format(
            percentage(len(closed), totalgaps), thousands(closedbp)
        ),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print(
        "Remaining gaps: {0} size: {1} bp".format(
            percentage(len(notClosed), totalgaps), thousands(notClosedbp)
        ),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
def report(self):
    """Write this object's interval statistics to stderr.

    Uses the attributes `nseqids`, `nfeats`, `unique_bases`, `total_bases`,
    `coverage`, `stats` and `mspans`. Each `mspans` entry is unpacked as a
    (span, accession) pair.
    """

    def emit(*fields):
        # print() function instead of the Python 2 `print >> sys.stderr, ...`
        # statement, which fails with TypeError under Python 3.
        print(*fields, file=sys.stderr)

    emit("Total seqids: {0}".format(self.nseqids))
    emit("Total ranges: {0}".format(self.nfeats))
    emit("Total unique bases: {0} bp".format(thousands(self.unique_bases)))
    emit("Total bases: {0} bp".format(thousands(self.total_bases)))
    emit("Estimated coverage: {0:.1f}x".format(self.coverage))
    emit(self.stats)

    # Pairs are ordered by span first, so max()/min() give the longest and
    # shortest features.
    maxspan, maxaccn = max(self.mspans)
    minspan, minaccn = min(self.mspans)
    emit("Longest: {0} ({1})".format(maxaccn, maxspan))
    emit("Shortest: {0} ({1})".format(minaccn, minspan))
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # The docstring is the OptionParser usage text; keep it verbatim.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fillfile,) = args
    scaffolds = 0
    gaps = []
    # `with` guarantees the file is closed; the handle used to be leaked.
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":  # header line: counts one scaffold
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    # print() function form throughout: `print >> sys.stderr, ...` raises
    # TypeError at runtime under Python 3.
    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    # (The unused `totalbp` local was dropped.)
    totalgaps = len(closed) + len(notClosed)

    print(
        "Closed gaps: {0} size: {1} bp".format(
            percentage(len(closed), totalgaps), thousands(closedbp)
        ),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print(
        "Remaining gaps: {0} size: {1} bp".format(
            percentage(len(notClosed), totalgaps), thousands(notClosedbp)
        ),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # The docstring is the OptionParser usage text; keep it verbatim.
    # `thousands`, `Bed`, `OptionParser`, `logging` and `sys` are expected at
    # module scope (they are not imported in this function).
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    p.add_option("--sizes", default=False, action="store_true",
                 help="Write .sizes file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile,) = args
    bed = Bed(bedfile)
    # (span, accession) pairs; ordering by span drives longest/shortest below.
    mspans = [(x.span, x.accn) for x in bed]
    if opts.sizes:
        sizesfile = bedfile + ".sizes"
        # One span per line; `with` closes the file even if a write fails.
        with open(sizesfile, "w") as fw:
            for span, accn in mspans:
                print(span, file=fw)
        logging.debug("Spans written to `{0}`.".format(sizesfile))

    spans, accns = zip(*mspans)
    stats = SummaryStats(spans)
    # print() function form: `print >> sys.stderr, ...` raises TypeError when
    # executed by Python 3.
    print("Total seqids: {0}".format(len(bed.seqids)), file=sys.stderr)
    print("Total ranges: {0}".format(len(bed)), file=sys.stderr)

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    print(
        "Total unique bases: {0} bp".format(thousands(unique_bases)),
        file=sys.stderr,
    )
    print("Total bases: {0} bp".format(thousands(total_bases)), file=sys.stderr)
    print(
        "Estimated coverage: {0:.1f}x".format(total_bases * 1.0 / unique_bases),
        file=sys.stderr,
    )
    print(stats, file=sys.stderr)

    maxspan, maxaccn = max(mspans)
    minspan, minaccn = min(mspans)
    print("Longest: {0} ({1})".format(maxaccn, maxspan), file=sys.stderr)
    print("Shortest: {0} ({1})".format(minaccn, minspan), file=sys.stderr)
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # The docstring is the OptionParser usage text; keep its wording as-is.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold file followed by one or more (blast, sizes, bed) trios.
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Write a one-record sizes file for this scaffold, load it, then let
        # Sizes.close(clean=True) dispose of it.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Without --highlights `subhighlights` used to be unbound, so the call
        # below raised NameError; fall back to None, which is the default of
        # plot_one_scaffold(highlights=...).
        subhighlights = list(hlbed.sub_bed(scaffoldID)) if highlights else None

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def plot_one_scaffold(scaffoldID, ssizes, sbed, trios, imagename, iopts, highlights=None):
    """Render one figure for `scaffoldID` with one panel per evidence trio.

    Each element of `trios` is unpacked as (blastf, qsizes, qbed) and drawn
    by `scaffolding` on its own subplot. A title carrying the scaffold name
    and size is placed on a full-figure root axes, and the figure is saved to
    `imagename` with `iopts.dpi`.
    """
    panel_count = len(trios)
    figure = plt.figure(1, (14, 8))
    # Reset figure 1, which is reused across calls.
    plt.cla()
    plt.clf()

    canvas = figure.add_axes([0, 0, 1, 1])
    panels = []
    for position in range(1, panel_count + 1):
        panels.append(figure.add_subplot(1, panel_count, position))

    scaffold_length = ssizes.get_size(scaffoldID)
    for (blastf, qsizes, qbed), panel in zip(trios, panels):
        scaffolding(panel, scaffoldID, blastf, qsizes, ssizes, qbed, sbed,
                    highlights=highlights)

    banner = "{0} (size={1})".format(scaffoldID, thousands(scaffold_length))
    canvas.text(.5, .95, banner, size=18, ha="center", color='b')

    # The root axes only carries the banner: unit square, no frame.
    canvas.set_xlim(0, 1)
    canvas.set_ylim(0, 1)
    canvas.set_axis_off()

    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def plot_one_scaffold(scaffoldID, ssizes, sbed, trios, imagename, iopts, highlights=None):
    """Draw every evidence panel for a single scaffold and save the image.

    `trios` holds (blastf, qsizes, qbed) tuples; each one is plotted by
    `scaffolding` into its own column of a 1 x len(trios) subplot grid. The
    scaffold name and size are written as a header on a root axes spanning
    the whole figure, and the result is saved to `imagename`.
    """
    n_columns = len(trios)
    fig = plt.figure(1, (14, 8))
    # Clear whatever an earlier call left on figure 1.
    plt.cla()
    plt.clf()

    root = fig.add_axes([0, 0, 1, 1])
    columns = [fig.add_subplot(1, n_columns, col + 1) for col in range(n_columns)]

    size_bp = ssizes.get_size(scaffoldID)
    for col_ax, trio in zip(columns, trios):
        blastf, qsizes, qbed = trio
        scaffolding(
            col_ax,
            scaffoldID,
            blastf,
            qsizes,
            ssizes,
            qbed,
            sbed,
            highlights=highlights,
        )

    header = "{0} (size={1})".format(scaffoldID, thousands(size_bp))
    root.text(0.5, 0.95, header, size=18, ha="center", color="b")

    # Root axes is a frameless unit square used only for the header.
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def analyze_nbinom(self, K=23, maxiter=100):
    """Analyze the K-mer histogram using negative binomial distribution.

    The histogram in `self.data` (pairs of coverage, count) is fitted with a
    sum of negative binomial components, one group per copy-number bin, by
    optimizing one variable at a time (each bin weight, then lambda, then
    rho) with bounded scalar minimization.

    Side effects: sets `self.ploidy`, `self.ploidy_message`,
    `self.copy_messages`, `self.repetitive`, `self.snprate` and
    `self.lambda_`, and prints progress and the final report (mostly to
    stderr). Reads `self.totalKmers`.

    Args:
        K (int, optional): K-mer size used when generating the histogram.
            Defaults to 23.
        maxiter (int, optional): Maximum number of optimization rounds.
            Defaults to 100.

    Returns:
        dict: keys "generative_model" (callable taking G, lambda_, rho),
        "Gbins", "lambda", "rho" and "kf_range".
    """
    from scipy.stats import nbinom
    from scipy.optimize import minimize_scalar
    from functools import lru_cache

    method, xopt = "bounded", "xatol"
    MAX_1CN_SIZE = 1e10
    MAX_OPTIMIZED_SIZE = 9.9e9

    # Generate bins for the decomposed negative binomial distributions
    bins = [(i, i) for i in range(1, 9)
            ]  # The first 8 CN are critical, they often determine ploidy
    for i in (8, 16, 32, 64, 128, 256, 512):  # 14 geometrically sized bins
        a, b = i + 1, int(round(i * 2**0.5))
        bins.append((a, b))
        a, b = b + 1, i * 2
        bins.append((a, b))

    # Convert histogram to np array so we can index by CN
    kf_ceil = max([cov for cov, _ in self.data])
    N = kf_ceil + 1
    # Builtin `int` instead of the `np.int` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
    hist = np.zeros(N, dtype=int)
    for cov, count in self.data:
        hist[cov] = count

    # min1: find first minimum
    _kf_min1 = 10
    while _kf_min1 - 1 >= 2 and hist[_kf_min1 - 1] < hist[_kf_min1]:
        _kf_min1 -= 1
    # Strict `<`: `hist` has kf_ceil + 1 entries, so reading
    # hist[_kf_min1 + 1] with _kf_min1 == kf_ceil would be out of bounds.
    while _kf_min1 < kf_ceil and hist[_kf_min1 + 1] < hist[_kf_min1]:
        _kf_min1 += 1

    # max2: find absolute maximum mx2 above first minimum min1
    _kf_max2 = _kf_min1
    for kf in range(_kf_min1 + 1, int(0.8 * kf_ceil)):
        if hist[kf] > hist[_kf_max2]:
            _kf_max2 = kf

    # Discard the last entry as that is usually an inflated number
    hist = hist[:-1]
    kf_range = np.arange(_kf_min1, len(hist), dtype=int)
    P = hist[kf_range] * kf_range  # Target distribution
    print("==> Start nbinom method on range ({}, {})".format(
        _kf_min1, len(hist)))

    # Below is the optimization schemes, we optimize one variable at a time
    @lru_cache(maxsize=None)
    def nbinom_pmf_range(lambda_: int, rho: int, bin_id: int):
        # lambda_ and rho arrive as integers scaled by 100 so that calls are
        # hashable and cacheable at 2-digit precision.
        stacked = np.zeros(len(kf_range), dtype=np.float64)
        lambda_ /= 100  # 2-digit precision
        rho /= 100  # 2-digit precision
        n = lambda_ / (rho - 1)
        p = 1 / rho
        start, end = bins[bin_id]
        for i in range(start, end + 1):
            stacked += nbinom.pmf(kf_range, n * i, p)
        return stacked

    def generative_model(G, lambda_, rho):
        # Weighted sum of the per-bin pmfs, multiplied by kf_range so it is
        # comparable with the target distribution P.
        stacked = np.zeros(len(kf_range), dtype=np.float64)
        lambda_ = int(round(lambda_ * 100))
        rho = int(round(rho * 100))
        for bin_id, g in enumerate(G):
            stacked += g * nbinom_pmf_range(lambda_, rho, bin_id)
        stacked *= kf_range
        return stacked

    def func(lambda_, rho, G):
        stacked = generative_model(G, lambda_, rho)
        return np.sum((P - stacked)**2)  # L2 norm

    def optimize_func(lambda_, rho, G):
        # Iterate over all G
        for i, g in enumerate(G):
            G_i = optimize_func_Gi(lambda_, rho, G, i)
            if (not 1 < G_i < MAX_OPTIMIZED_SIZE
                ):  # Optimizer did not optimize this G_i
                break
        # Also remove the last bin since it is subject to marginal effect
        G[i - 1] = 0
        lambda_ = optimize_func_lambda_(lambda_, rho, G)
        rho = optimize_func_rho(lambda_, rho, G)
        score = func(lambda_, rho, G)
        return lambda_, rho, G, score

    def optimize_func_lambda_(lambda_, rho, G):
        def f(arg):
            return func(arg, rho, G)

        res = minimize_scalar(f,
                              bounds=(_kf_min1, 100),
                              method=method,
                              options={xopt: 0.01})
        return res.x

    def optimize_func_rho(lambda_, rho, G):
        def f(arg):
            return func(lambda_, arg, G)

        res = minimize_scalar(f,
                              bounds=(1.001, 5),
                              method=method,
                              options={xopt: 0.01})
        return res.x

    def optimize_func_Gi(lambda_, rho, G, i):
        # Iterate a single G_i
        def f(arg):
            # G is updated in place, so it keeps the last value the optimizer
            # evaluated.
            G[i] = arg
            return func(lambda_, rho, G)

        res = minimize_scalar(f,
                              bounds=(0, MAX_1CN_SIZE),
                              method=method,
                              options={xopt: 100})
        return res.x

    def run_optimization(termination=0.999, maxiter=100):
        ll, rr, GG = l0, r0, G0
        prev_score = np.inf
        for i in range(maxiter):
            print("Iteration", i + 1, file=sys.stderr)
            ll, rr, GG, score = optimize_func(ll, rr, GG)
            # Stop once a round improves the score by less than
            # (1 - termination).
            if score / prev_score > termination:
                break
            prev_score = score
            if i % 10 == 0:
                print(ll, rr, GG, score, file=sys.stderr)
        print("Success!", file=sys.stderr)
        # Remove bogus values that are close to the bounds
        final_GG = [g for g in GG if 1 < g < MAX_OPTIMIZED_SIZE]
        return ll, rr, final_GG

    # Optimization - very slow
    G0 = np.zeros(len(bins))
    l0 = _kf_max2
    r0 = 1.5
    print(l0, r0, G0, file=sys.stderr)
    ll, rr, GG = run_optimization(maxiter=maxiter)
    print(ll, rr, GG, file=sys.stderr)

    # Ready for genome summary
    m = "\n==> Kmer (K={0}) Spectrum Analysis\n".format(K)

    genome_size = int(round(self.totalKmers / ll))
    inferred_genome_size = 0
    for i, g in enumerate(GG):
        start, end = bins[i]
        mid = (start + end) / 2
        inferred_genome_size += g * mid * (end - start + 1)
    inferred_genome_size = int(round(inferred_genome_size))
    genome_size = max(genome_size, inferred_genome_size)
    m += "Genome size estimate = {0}\n".format(thousands(genome_size))
    copy_series = []
    copy_messages = []
    for i, g in enumerate(GG):
        start, end = bins[i]
        mid = (start + end) / 2
        copy_num = start if start == end else "{}-{}".format(start, end)
        g_copies = int(round(g * mid * (end - start + 1)))
        copy_series.append((mid, copy_num, g_copies, g))
        copy_message = "CN {}: {:.1f} Mb ({:.1f} percent)".format(
            copy_num, g_copies / 1e6, g_copies * 100 / genome_size)
        copy_messages.append(copy_message)
        m += copy_message + "\n"

    # Whatever the bins do not account for is reported as one open-ended
    # "<last end + 1>+" copy-number class.
    if genome_size > inferred_genome_size:
        g_copies = genome_size - inferred_genome_size
        copy_num = "{}+".format(end + 1)
        copy_series.append(
            (end + 1, copy_num, g_copies, g_copies / (end + 1)))
        m += "CN {}: {:.1f} Mb ({:.1f} percent)\n".format(
            copy_num, g_copies / 1e6, g_copies * 100 / genome_size)

    # Determine ploidy
    def determine_ploidy(copy_series, threshold=0.15):
        # Walk the copy-number series and stop at the first class whose size,
        # relative to everything accumulated so far, is below `threshold`.
        counts_so_far = 1
        ploidy_so_far = 0
        for mid, copy_num, g_copies, g in copy_series:
            if g_copies / counts_so_far < threshold:
                break
            counts_so_far += g_copies
            ploidy_so_far = mid
        return int(ploidy_so_far)

    ploidy = determine_ploidy(copy_series)
    self.ploidy = ploidy
    self.ploidy_message = "Ploidy: {}".format(ploidy)
    m += self.ploidy_message + "\n"
    self.copy_messages = copy_messages[:ploidy]

    # Repeat content
    def calc_repeats(copy_series, ploidy, genome_size):
        # Everything with copy number above the ploidy counts as repeats.
        unique = 0
        for mid, copy_num, g_copies, g in copy_series:
            if mid <= ploidy:
                unique += g_copies
            else:
                break
        return 1 - unique / genome_size

    repeats = calc_repeats(copy_series, ploidy, genome_size)
    self.repetitive = "Repeats: {:.1f} percent".format(repeats * 100)
    m += self.repetitive + "\n"

    # SNP rate
    def calc_snp_rate(copy_series, ploidy, genome_size, K):
        # We can calculate the SNP rate s, assuming K-mer of length K:
        # s = 1-(1-L/G)^(1/K)
        # L: # of unique K-mers under 'het' peak
        # G: genome size
        # K: K-mer length
        L = 0
        for mid, copy_num, g_copies, g in copy_series:
            if mid < ploidy:
                L += g
            else:
                break
        return 1 - (1 - L / genome_size)**(1 / K)

    snp_rate = calc_snp_rate(copy_series, ploidy, genome_size, K)
    self.snprate = "SNP rate: {:.2f} percent".format(snp_rate * 100)
    m += self.snprate + "\n"
    print(m, file=sys.stderr)

    self.lambda_ = ll
    return {
        "generative_model": generative_model,
        "Gbins": GG,
        "lambda": ll,
        "rho": rr,
        "kf_range": kf_range,
    }
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # The docstring is the OptionParser usage text and is kept verbatim.
    # Returns the estimated genome size (int) in both ASCII and PDF modes.
    parser = OptionParser(histogram.__doc__)
    parser.add_option(
        "--vmin", dest="vmin", default=1, type="int",
        help="minimum value, inclusive",
    )
    parser.add_option(
        "--vmax", dest="vmax", default=100, type="int",
        help="maximum value, inclusive",
    )
    parser.add_option(
        "--pdf", default=False, action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    parser.add_option(
        "--method", choices=("nbinom", "allpaths"), default="nbinom",
        help="'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous enomes",
    )
    parser.add_option(
        "--maxiter", default=100, type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    parser.add_option(
        "--coverage", default=0, type="int",
        help="Kmer coverage [default: auto]",
    )
    parser.add_option(
        "--nopeaks", default=False, action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = parser.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, kmer_arg = args
    method = opts.method
    use_nbinom = method == "nbinom"
    vmin, vmax = opts.vmin, opts.vmax
    text_mode = not opts.pdf
    # Peak annotation relies on attributes only read for method 'allpaths'.
    annotate_peaks = method == "allpaths" and not opts.nopeaks
    ksize = int(kmer_arg)

    # A meryl index is converted to a histogram file first.
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    spectrum = KmerSpectrum(histfile)
    fit = spectrum.analyze(K=ksize, maxiter=opts.maxiter, method=method)

    total_kmers = int(spectrum.totalKmers)
    # A user-supplied --coverage overrides the fitted lambda.
    kmer_coverage = opts.coverage or spectrum.lambda_
    genome_size = int(round(total_kmers * 1.0 / kmer_coverage))

    total_msg = "Total {0}-mers: {1}".format(ksize, thousands(total_kmers))
    coverage_msg = "{0}-mer coverage: {1:.1f}x".format(ksize, kmer_coverage)
    size_msg = "Estimated genome size: {0:.1f} Mb".format(genome_size / 1e6)
    repeats_msg = spectrum.repetitive
    snp_msg = spectrum.snprate

    print("\n".join((total_msg, coverage_msg, size_msg)), file=sys.stderr)

    xs, ys = spectrum.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, ksize)

    if text_mode:
        asciiplot(xs, ys, title=title)
        return genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(xs, ys, fc="#b2df8a", lw=0)

    # Plot the negative binomial fit
    if use_nbinom:
        model = fit["generative_model"]
        fitted = model(fit["Gbins"], fit["lambda"], fit["rho"])
        plt.plot(fit["kf_range"], fitted, ":", color="#6a3d9a", lw=2)

    ax = plt.gca()

    if annotate_peaks:  # Only works for method 'allpaths'
        landmarks = (
            spectrum.min1,
            spectrum.max1,
            spectrum.min2,
            spectrum.max2,
            spectrum.min3,
        )
        hits = [(kf, n) for kf, n in spectrum.counts if kf in landmarks]
        if hits:
            peak_x, peak_y = zip(*hits)
            lookup = dict(hits)
            plt.plot(peak_x, peak_y, "ko", lw=3, mec="k", mfc="w")
            ax.text(spectrum.max1, lookup[spectrum.max1], "SNP peak")
            ax.text(spectrum.max2, lookup[spectrum.max2], "Main peak")

    # Leave headroom above the tallest bar for the annotations.
    _, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    if use_nbinom:
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for copy_number in range(1, spectrum.ploidy + 1):
            xpos = copy_number * spectrum.lambda_
            plt.plot((xpos, xpos), (0, ymax), "-.", color=cn_color)
            plt.text(
                xpos,
                ymax * 0.95,
                "CN{}".format(copy_number),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [total_msg, coverage_msg, size_msg, repeats_msg, snp_msg]
    if use_nbinom:
        messages += [spectrum.ploidy_message] + spectrum.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    # Image name: text before the first "." of histfile plus the format.
    imagename = histfile.split(".")[0] + "." + iopts.format
    savefig(imagename, dpi=100)

    return genome_size
def analyze_allpaths(self, ploidy=2, K=23, covmax=1000000):
    """
    Analyze Kmer spectrum, calculations derived from
    allpathslg/src/kmers/KmerSpectra.cc

    Reads `self.data` (pairs of k-mer frequency, count) and
    `self.totalKmers`. Sets `self.min1`, `self.max1`, `self.min2`,
    `self.max2`, `self.min3`, `self.lambda_`, `self.repetitive`,
    `self.snprate` and `self.genomesize`, and prints a report to stderr.

    Args:
        ploidy: The SNP rate is only computed when this equals 2.
        K: K-mer length, used in the report and the SNP-rate formula.
        covmax: Frequencies above this value are ignored.

    Returns:
        dict: always an empty dict.
    """
    from math import sqrt

    data = self.data
    # Generator variables are named `kf` so they do not shadow the K argument.
    kf_ceil = max(kf for (kf, c) in data)
    if kf_ceil > covmax:
        exceeds = sum(1 for (kf, c) in data if kf > covmax)
        logging.debug("A total of {0} distinct K-mers appear > "
                      "{1} times. Ignored ...".format(exceeds, covmax))
        kf_ceil = covmax

    nkf = kf_ceil + 1
    a = [0] * nkf
    for kf, c in data:
        if kf > kf_ceil:
            continue
        a[kf] = c

    ndk = a  # number of distinct kmers
    nk = [k * c for k, c in enumerate(a)]  # number of kmers
    cndk = [0] * nkf  # cumulative number of distinct kmers
    cnk = [0] * nkf  # cumulative number of kmers
    # Trapezoidal accumulation of both series.
    for kf in range(1, nkf):
        cndk[kf] = cndk[kf - 1] + 0.5 * (ndk[kf - 1] + ndk[kf])
        cnk[kf] = cnk[kf - 1] + 0.5 * (nk[kf - 1] + nk[kf])

    # Separate kmer spectrum in 5 regions based on the kf
    # 1        ... kf_min1    : bad kmers with low frequency
    # kf_min1  ... kf_min2    : good kmers CN = 1/2 (SNPs)
    # kf_min2  ... kf_min3    : good kmers CN = 1
    # kf_min3  ... kf_hi      : good kmers CN > 1 (repetitive)
    # kf_hi    ... inf        : bad kmers with high frequency

    # min1: find first minimum
    _kf_min1 = 10
    while _kf_min1 - 1 >= 2 and nk[_kf_min1 - 1] < nk[_kf_min1]:
        _kf_min1 -= 1
    # Strict `<`: `nk` has kf_ceil + 1 entries, so reading nk[_kf_min1 + 1]
    # with _kf_min1 == kf_ceil would be out of range.
    while _kf_min1 < kf_ceil and nk[_kf_min1 + 1] < nk[_kf_min1]:
        _kf_min1 += 1

    # max2: find absolute maximum mx2 above first minimum min1
    _kf_max2 = _kf_min1
    for kf in range(_kf_min1 + 1, int(0.8 * kf_ceil)):
        if nk[kf] > nk[_kf_max2]:
            _kf_max2 = kf

    # max2: resetting max2 for cases of very high polymorphism
    if ploidy == 2:
        ndk_half = ndk[_kf_max2 // 2]
        ndk_double = ndk[_kf_max2 * 2]
        if ndk_double > ndk_half:
            _kf_max2 *= 2

    # max1: SNPs local maximum max1 as half global maximum max2
    _kf_max1 = _kf_max2 // 2

    # min2: SNPs local minimum min2 between max1 and max2
    _kf_min2 = (_kf_max1 * (2 * ndk[_kf_max1] + ndk[_kf_max2]) //
                (ndk[_kf_max1] + ndk[_kf_max2]))

    # min1: refine between min1 and max2/2
    for kf in range(_kf_min1 + 1, _kf_max1):
        if nk[kf] < nk[_kf_min1]:
            _kf_min1 = kf

    # min3: not a minimum, really. upper edge of main peak
    _kf_min3 = _kf_max2 * 3 // 2

    print("kfs:", _kf_min1, _kf_max1, _kf_min2, _kf_max2, _kf_min3,
          file=sys.stderr)
    self.min1 = _kf_min1
    self.max1 = _kf_max1
    self.min2 = _kf_min2
    self.max2 = _kf_max2
    self.min3 = _kf_min3
    self.lambda_ = self.max2  # Main peak

    # Define maximum kf above which we neglect data
    _kf_hi = (_kf_max2 * sqrt(4 * ndk[2 * _kf_max2] * _kf_max2)
              if 2 * _kf_max2 < len(ndk) else
              _kf_max2 * sqrt(4 * ndk[len(ndk) - 1] * _kf_max2))
    _kf_hi = int(_kf_hi)

    if _kf_hi > kf_ceil:
        _kf_hi = kf_ceil

    _nk_total = cnk[len(cnk) - 1]
    _nk_bad_low_kf = cnk[_kf_min1]
    _nk_good_uniq = cnk[_kf_min3] - cnk[_kf_min2]
    _nk_bad_high_kf = _nk_total - cnk[_kf_hi]
    _ndk_good_snp = cndk[_kf_min2] - cndk[_kf_min1]
    _ndk_good_uniq = cndk[_kf_min3] - cndk[_kf_min2]

    # kmer coverage C_k
    _kf_ave_uniq = _nk_good_uniq * 1.0 / _ndk_good_uniq
    _genome_size = (_nk_total - _nk_bad_low_kf - _nk_bad_high_kf) / _kf_ave_uniq
    _genome_size_unique = _ndk_good_uniq + _ndk_good_snp / 2
    _genome_size_repetitive = _genome_size - _genome_size_unique
    _coverage = _nk_total / _genome_size if _genome_size else 0

    # SNP rate estimation, assumes uniform distribution of SNPs over the
    # genome and accounts for the reduction in SNP kmer counts when
    # polymorphism is very high
    if ploidy == 2:
        _d_SNP = (1.0 / (1.0 - (1.0 - 0.5 * _ndk_good_snp / _genome_size)
                         **(1.0 / K)) if _ndk_good_snp > 0 else 1000000)

    G = int(_genome_size)
    G1 = int(_genome_size_unique)
    GR = int(_genome_size_repetitive)
    coverage = int(_coverage)

    m = "Kmer (K={0}) Spectrum Analysis\n".format(K)
    m += "Genome size estimate = {0}\n".format(thousands(G))
    m += "Genome size estimate CN = 1 = {0} ({1})\n".format(
        thousands(G1), percentage(G1, G))
    m += "Genome size estimate CN > 1 = {0} ({1})\n".format(
        thousands(GR), percentage(GR, G))
    m += "Coverage estimate: {0} x\n".format(coverage)
    self.repetitive = "Repeats: {0} percent".format(GR * 100 // G)

    if ploidy == 2:
        d_SNP = int(_d_SNP)
        self.snprate = "SNP rate ~= 1/{0}".format(d_SNP)
    else:
        self.snprate = "SNP rate not computed (Ploidy = {0})".format(
            ploidy)
    m += self.snprate + "\n"

    # Genome size from the total k-mer count and the main-peak position.
    self.genomesize = int(round(self.totalKmers * 1.0 / self.max2))

    print(m, file=sys.stderr)
    return {}
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # The docstring is the OptionParser usage text and is kept verbatim.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    parser = OptionParser(histogram.__doc__)
    parser.set_histogram(
        vmax=50000, bins=100, xlabel="Read length", title="Read length distribution"
    )
    parser.add_option(
        "--ylabel1", default="Counts", help="Label of y-axis on the left"
    )
    parser.add_option(
        "--color",
        default="0",
        choices=[str(idx) for idx in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = parser.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not parser.print_help())

    # fasta() returns a (fasta, qual) pair; only the fasta file is used here.
    fastafile, qualfile = fasta([args[0], "--seqtk"])
    sizes = Sizes(fastafile)
    lengths = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(lengths, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    count_ax = plt.gca()

    # Bars are drawn at half the bin width.
    width = (xmax - xmin) * 0.5 / bins
    bar_color = set2[int(opts.color)]
    count_ax.bar(left, height, width=width, linewidth=0, fc=bar_color, align="center")
    count_ax.set_xlabel(markup(opts.xlabel))
    count_ax.set_ylabel(opts.ylabel1)

    cum_ax = count_ax.twinx()
    total_size, l50, n50 = sizes.summary
    hsize = human_size(total_size)
    # The last two characters of the human-readable size select the unit.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    # For each distinct length: bases held in reads at least that long.
    bases_above = {}
    consumed = 0
    for length in lengths:
        if length not in bases_above:
            bases_above[length] = (total_size - consumed) * 1.0 / unit
        consumed += length
    curve_x, curve_y = zip(*sorted(bases_above.items()))
    cum_ax.plot(curve_x, curve_y, "-", color="darkslategray")
    cum_ax.set_ylabel("{0} above read length".format(tag))

    for axis in (count_ax, cum_ax):
        set_ticklabels_helvetica(axis)
        axis.set_xlim((xmin - width / 2, xmax + width / 2))

    annotations = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(lengths))),
        "Median read length: {0}bp".format(thousands(np.median(lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    # Right-aligned text block starting at the top-right, one row per line.
    text_x, text_y = 0.95, 0.95
    for line in annotations:
        print(line, file=sys.stderr)
        count_ax.text(
            text_x, text_y, line, color="gray", transform=count_ax.transAxes, ha="right"
        )
        text_y -= 0.05

    count_ax.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    count_ax.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    count_ax.xaxis.set_major_formatter(human_base_formatter)
    count_ax.yaxis.set_major_formatter(human_formatter)

    savefig(sizes.filename + ".pdf")
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # The docstring is the OptionParser usage text; keep it verbatim.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    # fasta() returns a (fasta, qual) pair; only the fasta file is used here.
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Bars are drawn at half the bin width.
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # The last two characters of the human-readable size select the unit.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    # For each distinct length: bases held in reads at least that long.
    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # print() function: `print >> sys.stderr, t` raises TypeError when
        # executed under Python 3.
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)

    figname = sizes.filename + ".pdf"
    savefig(figname)
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, cmap="copper",
            genomenames=None, sample_number=10000, minfont=5, palette=None,
            chrlw=.1, title=None, sep=True, sepcolor="g", stdpf=True):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Each data row holds a query and a subject name in its first two columns;
    the last column is read as a value when `cmap_text` is set. Rows starting
    with "#" open a new block.

    anchorfile    : path of the anchors file
    qbed, sbed    : query/subject beds; .order, .filename, .get_breaks() and
                    len() are used
    fig, root, ax : figure, canvas axes (title, legends) and plotting axes
    vmin, vmax    : with `cmap_text`, rows whose value lies outside this
                    range are skipped; unparseable values count as `vmax`
    is_self       : also plot the mirror image of each pair plus a diagonal
    synteny       : draw boxes around the clusters found by batch_scan()
    cmap_text     : colormap legend label; turns on value-based coloring
    cmap          : colormap used when no palette is given
    genomenames   : "x_y" axis names; default to the bed file basenames
    sample_number : forwarded to downsample()
    palette       : optional object with .get(block_id, default) and .colors
                    that assigns one color per block
    title         : plot title; None generates one with the pair count
    minfont, chrlw, sep, sepcolor, stdpf :
                    forwarded to plot_breaks_and_labels()
    """
    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    gx, gy = markup(gx), markup(gy)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"
                      .format(vmin, vmax))

    block_id = 0
    # The block color must survive from a "#" header to the rows beneath it.
    # Resetting it on every row (as before) meant the palette color was
    # always discarded before any data row could use it.
    block_color = None
    with open(anchorfile) as fp:  # closes the handle once parsing is done
        for row in fp:
            atoms = row.split()
            if row.startswith("#"):
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = downsample(data, sample_number=sample_number)
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    qbreaks = qbed.get_breaks()
    sbreaks = sbed.get_breaks()
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks, sep=sep,
                                        chrlw=chrlw, sepcolor=sepcolor,
                                        minfont=minfont, stdpf=stdpf)

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title is None:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was stored twice (mirror image). Floor division
            # keeps the count an integer on Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    if title:
        logging.debug("Dot plot title: {}".format(title))
    normalize_axes(root)
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id    intra- genotype    inter- genotype
    """
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of genes that show A-B pattern
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)

    bedfw = open(opts.bed, "w") if opts.bed else None
    try:
        # `with` guarantees the input handle is released; it was previously
        # left open for the lifetime of the process.
        with open(txtfile) as fp:
            header = next(fp).split()  # Header
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                    "Only three-column file is supported"
                locus, intra, inter = atoms
                # Locus ids look like "<contig>.<position>"
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1
                # Tabulate all possible combinations
                intra = ref + "-" + intra
                inter = alt + "-" + inter
                combinations[(intra, inter)] += 1

                if bedfw:
                    bedfw.write("\t".join(
                        str(x) for x in (ctg, pos - 1, pos, locus)) + "\n")
    finally:
        # Close the bed output even when a malformed row aborts the parse
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize
    snprate = lambda a: a * 1000. / bpsize  # events per Kb of the dataset
    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
        format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
        format(nsites, len(snps),
               thousands(sum(sizes.mapping[x] for x in snps)))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
        format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    sys.stderr.write(m + "\n")
    sys.stderr.write(str(tabulate(combinations)) + "\n")
    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    sys.stderr.write(leg + "\n")
    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    sys.stderr.write("A total of {0} disparate {1} SNPs in {2} contigs.".
                     format(distinctSNPs, tag, len(distinctSet)) + "\n")

    if not opts.counts:
        return

    # Sanity checks on the per-contig tallies before they are written out
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    snpcountsfile = opts.counts
    with open(snpcountsfile, "w") as fw:
        fw.write("\t".join(("Contig", "#_SNPs", "#_AB_SNP")) + "\n")
        for ctg in sorted(snps):
            snpcount = snpcounts[ctg]
            goodsnpcount = goodsnpcounts[ctg]
            fw.write("\t".join(
                str(x) for x in (ctg, snpcount, goodsnpcount)) + "\n")

    logging.debug("SNP counts per contig is written to `{0}`.".
                  format(snpcountsfile))
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from itertools import cycle

    from jcvi.formats.bed import mates, bedpe

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int",
                 help="Limit ymax [default: %default]")
    p.add_option("--spans", default=False, action="store_true",
                 help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    yy = .9
    # Cycle through the six colors: a plain zip() against "rgbcky" stops at
    # the shorter input and silently dropped every BED file after the sixth.
    for bedfile, c in zip(bedfiles, cycle("rgbcky")):
        if not opts.spans:
            # Derive the clone-span BED, rebuilding intermediate files only
            # when need_update() says they are out of date
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, _ = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                _, bedspanfile = bedpe([bedfile, "--span",
                                        "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend,
                                               thousands(notcoveredbases))
        sys.stderr.write(msg + "\n")
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    # Floor division keeps the bin width a whole number of Kb on Python 3,
    # matching what "/" yields for integers on Python 2.
    ylabel = "Average depth per {0}Kb".format(size // bins // 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, cmap="copper",
            genomenames=None, sample_number=10000, minfont=5, palette=None,
            chrlw=.01, title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Each data row holds a query and a subject name in its first two columns;
    the last column is read as a value when `cmap_text` is set. Rows starting
    with "#" open a new block.

    anchorfile    : path of the anchors file
    qbed, sbed    : query/subject beds; .order, .filename, .get_breaks() and
                    len() are used
    fig, root, ax : figure, canvas axes (labels, title) and plotting axes
    vmin, vmax    : with `cmap_text`, rows whose value lies outside this
                    range are skipped; unparseable values count as `vmax`
    is_self       : also plot the mirror image of each pair plus a diagonal
    synteny       : draw boxes around the clusters found by batch_scan()
    cmap_text     : colormap legend label; turns on value-based coloring
    cmap          : colormap used when no palette is given
    genomenames   : "x_y" axis names; default to the bed file basenames
    sample_number : plot at most this many randomly chosen points
    minfont       : chromosome labels with a smaller font size are skipped
    palette       : optional object with .get(block_id, default) and .colors
                    that assigns one color per block
    chrlw         : line width of the chromosome separators
    title         : plot title; an empty value generates one
    sepcolor      : color of the chromosome separators
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"
                      .format(vmin, vmax))

    block_id = 0
    # The block color must survive from a "#" header to the rows beneath it.
    # Resetting it on every row (as before) meant the palette color was
    # always discarded before any data row could use it.
    block_color = None
    with open(anchorfile) as fp:  # closes the handle once parsing is done
        for row in fp:
            atoms = row.split()
            if row.startswith("#"):
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) "
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was stored twice (mirror image). Floor division
            # keeps the count an integer on Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive [default: %default]")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive [default: %default]")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot [default: %default]")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, kmer = args
    kmer = int(kmer)

    # An index file is first turned into a histogram file by meryl()
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=kmer)

    total_kmers = int(ks.totalKmers)
    # A user-supplied coverage wins; 0 (the default) falls back to ks.max2
    kmer_coverage = opts.coverage or ks.max2
    genome_size = int(round(total_kmers * 1.0 / kmer_coverage))

    total_msg = "Total {0}-mers: {1}".format(kmer, thousands(total_kmers))
    coverage_msg = "{0}-mer coverage: {1}".format(kmer, kmer_coverage)
    size_msg = "Estimated genome size: {0:.1f}Mb".format(genome_size / 1e6)
    repetitive_msg = ks.repetitive
    snprate_msg = ks.snprate

    for line in (total_msg, coverage_msg, size_msg):
        sys.stderr.write(line + "\n")

    xs, ys = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, kmer)

    if not opts.pdf:
        # Text mode: draw in the terminal and stop here
        asciiplot(xs, ys, title=title)
        return genome_size

    plt.figure(1, (6, 6))
    plt.plot(xs, ys, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if not opts.nopeaks:
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marked = [(kf, count) for kf, count in ks.counts if kf in landmarks]
        if marked:
            peak_x, peak_y = zip(*marked)
            count_at = dict(marked)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, count_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, count_at[ks.max2], "Main peak")

    write_messages(ax, [total_msg, coverage_msg, size_msg,
                        repetitive_msg, snprate_msg])

    # Stretch the y-axis to leave head room above the curve
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))

    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)
    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied coverage wins; 0 (the default) falls back to ks.max2
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        # Text mode: draw in the terminal and stop here
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(kf, count) for kf, count in ks.counts if kf in t]
        # Guard against an empty selection: zip(*[]) cannot be unpacked into
        # two names and used to abort the plot with a ValueError.
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Stack the summary lines in the top-right corner of the axes
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Stretch the y-axis to leave head room above the curve
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))

    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, kmer = args
    kmer = int(kmer)

    # An index file is first turned into a histogram file
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=kmer)

    total_kmers = int(ks.totalKmers)
    # A user-supplied coverage wins; 0 (the default) falls back to ks.max2
    kmer_coverage = opts.coverage or ks.max2
    genome_size = int(round(total_kmers * 1.0 / kmer_coverage))

    total_msg = "Total {0}-mers: {1}".format(kmer, thousands(total_kmers))
    coverage_msg = "{0}-mer coverage: {1}".format(kmer, kmer_coverage)
    size_msg = "Estimated genome size: {0:.1f}Mb".format(genome_size / 1e6)
    repetitive_msg = ks.repetitive
    snprate_msg = ks.snprate

    for line in (total_msg, coverage_msg, size_msg):
        print(line, file=sys.stderr)

    xs, ys = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, kmer)

    if not opts.pdf:
        # Text mode: draw in the terminal and stop here
        asciiplot(xs, ys, title=title)
        return genome_size

    plt.figure(1, (6, 6))
    plt.plot(xs, ys, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if not opts.nopeaks:
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marked = [(kf, count) for kf, count in ks.counts if kf in landmarks]
        if marked:
            peak_x, peak_y = zip(*marked)
            count_at = dict(marked)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, count_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, count_at[ks.max2], "Main peak")

    write_messages(ax, [total_msg, coverage_msg, size_msg,
                        repetitive_msg, snprate_msg])

    # Stretch the y-axis to leave head room above the curve
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))

    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return genome_size
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, genomenames=None,
            sample_number=10000, minfont=5, palette=None, chrlw=.01,
            title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Each data row holds a query and a subject name in its first two columns
    and a value in its last column. Rows starting with "#" open a new block.

    anchorfile    : path of the anchors file
    qbed, sbed    : query/subject beds; .order, .filename, .get_breaks() and
                    len() are used
    fig, root, ax : figure, canvas axes (labels, title) and plotting axes
    vmin, vmax    : values are clamped into this range; unparseable values
                    count as `vmax`. Dots are colored by `vmax - value`
    is_self       : also plot the mirror image of each pair plus a diagonal
    synteny       : draw boxes around the clusters found by batch_scan()
    cmap_text     : colormap legend label; draws the legend when set
    genomenames   : "x_y" axis names; default to the bed file basenames
    sample_number : plot at most this many randomly chosen points
    minfont       : chromosome labels with a smaller font size are skipped
    palette       : optional object with .get(block_id, default) and .colors
                    that assigns one color per block
    chrlw         : line width of the chromosome separators
    title         : plot title; an empty value generates one
    sepcolor      : color of the chromosome separators
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    block_id = 0
    # The block color must survive from a "#" header to the rows beneath it.
    # Resetting it on every row (as before) meant the palette color was
    # always discarded before any data row could use it.
    block_color = None
    with open(anchorfile) as fp:  # closes the handle once parsing is done
        for row in fp:
            atoms = row.split()
            if row.startswith("#"):
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            try:
                value = float(value)
            except ValueError:
                value = vmax

            # Clamp into [vmin, vmax]
            if value < vmin:
                value = vmin
            if value > vmax:
                value = vmax

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) "
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=default_cm,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                      ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                      va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was stored twice (mirror image). Floor division
            # keeps the count an integer on Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def analyze(self, ploidy=2, K=23, covmax=1000000):
    """
    Analyze Kmer spectrum, calculations derived from
    allpathslg/src/kmers/KmerSpectra.cc

    Reads `self.data`, an iterable of (multiplicity, count) pairs, and
    `self.totalKmers`. Locates the landmark multiplicities of the spectrum
    and stores them as self.min1, self.max1, self.min2, self.max2 and
    self.min3; stores the text summaries self.repetitive and self.snprate
    and the estimate self.genomesize. A report is printed to stderr.

    ploidy : the max2 adjustment and the SNP rate are only computed when
             this is 2
    K      : K-mer length, used in the SNP rate formula and the report
    covmax : multiplicities above this value are ignored

    Returns nothing.
    """
    from math import sqrt

    data = self.data
    # The generator expressions below have their own `K`; the K parameter is
    # left untouched.
    kf_ceil = max(K for (K, c) in data)
    if kf_ceil > covmax:
        exceeds = sum(1 for (K, c) in data if K > covmax)
        logging.debug("A total of {0} distinct K-mers appear > "
                      "{1} times. Ignored ...".format(exceeds, covmax))
        kf_ceil = covmax

    # Dense table indexed by multiplicity: a[kf] is the count at kf
    nkf = kf_ceil + 1
    a = [0] * nkf
    for kf, c in data:
        if kf > kf_ceil:
            continue
        a[kf] = c

    ndk = a  # number of distinct kmers
    nk = [k * c for k, c in enumerate(a)]  # number of kmers
    cndk = [0] * nkf  # cumulative number of distinct kmers
    cnk = [0] * nkf  # cumulative number of kmers
    # Cumulative sums use the average of neighbouring entries (trapezoid)
    for kf in xrange(1, nkf):
        cndk[kf] = cndk[kf - 1] + 0.5 * (ndk[kf - 1] + ndk[kf])
        cnk[kf] = cnk[kf - 1] + 0.5 * (nk[kf - 1] + nk[kf])

    # Separate kmer spectrum in 5 regions based on the kf
    # 1        ... kf_min1    : bad kmers with low frequency
    # kf_min1  ... kf_min2    : good kmers CN = 1/2 (SNPs)
    # kf_min2  ... kf_min3    : good kmers CN = 1
    # kf_min3  ... kf_hi      : good kmers CN > 1 (repetitive)
    # kf_hi    ... inf        : bad kmers with high frequency

    # min1: find first minimum
    # NOTE(review): the search starts at index 10, so nk needs at least 11
    # entries (kf_ceil >= 10) or the lookups below raise IndexError.
    _kf_min1 = 10
    while _kf_min1 - 1 >= 2 and nk[_kf_min1 - 1] < nk[_kf_min1]:
        _kf_min1 -= 1
    # NOTE(review): with `<= kf_ceil` the lookup nk[_kf_min1 + 1] reaches
    # index kf_ceil + 1, one past the end of nk, if the walk gets that far.
    while _kf_min1 <= kf_ceil and nk[_kf_min1 + 1] < nk[_kf_min1]:
        _kf_min1 += 1

    # max2: find absolute maximum mx2 above first minimum min1
    _kf_max2 = _kf_min1
    for kf in xrange(_kf_min1 + 1, int(0.8 * kf_ceil)):
        if nk[kf] > nk[_kf_max2]:
            _kf_max2 = kf

    # max2: resetting max2 for cases of very high polymorphism
    # NOTE(review): `/` is used to build list indices here and below, which
    # relies on Python 2 integer division. ndk[_kf_max2 * 2] is not
    # range-checked, unlike the _kf_hi computation further down.
    if ploidy == 2:
        ndk_half = ndk[_kf_max2 / 2]
        ndk_double = ndk[_kf_max2 * 2]
        if ndk_double > ndk_half:
            _kf_max2 *= 2

    # max1: SNPs local maximum max1 as half global maximum max2
    _kf_max1 = _kf_max2 / 2

    # min2: SNPs local minimum min2 between max1 and max2
    _kf_min2 = _kf_max1 * (2 * ndk[_kf_max1] + ndk[_kf_max2]) / (ndk[_kf_max1] + ndk[_kf_max2])

    # min1: refine between min1 and max2/2
    for kf in xrange(_kf_min1 + 1, _kf_max1):
        if nk[kf] < nk[_kf_min1]:
            _kf_min1 = kf

    # min3: not a minimum, really. upper edge of main peak
    _kf_min3 = _kf_max2 * 3 / 2

    print >> sys.stderr, "kfs:", _kf_min1, _kf_max1, _kf_min2, _kf_max2, _kf_min3
    self.min1 = _kf_min1
    self.max1 = _kf_max1
    self.min2 = _kf_min2
    self.max2 = _kf_max2
    self.min3 = _kf_min3

    # Define maximum kf above which we neglect data
    # (falls back to the last table entry when 2 * max2 is out of range)
    _kf_hi = (
        _kf_max2 * sqrt(4 * ndk[2 * _kf_max2] * _kf_max2)
        if 2 * _kf_max2 < len(ndk)
        else _kf_max2 * sqrt(4 * ndk[len(ndk) - 1] * _kf_max2)
    )
    _kf_hi = int(_kf_hi)

    if _kf_hi > kf_ceil:
        _kf_hi = kf_ceil

    # NOTE(review): _kf_min3 (1.5 * max2) is used as an index without a
    # range check - confirm it always stays below nkf.
    _nk_total = cnk[len(cnk) - 1]
    _nk_bad_low_kf = cnk[_kf_min1]
    _nk_good_uniq = cnk[_kf_min3] - cnk[_kf_min2]
    _nk_bad_high_kf = _nk_total - cnk[_kf_hi]
    _ndk_good_snp = cndk[_kf_min2] - cndk[_kf_min1]
    _ndk_good_uniq = cndk[_kf_min3] - cndk[_kf_min2]

    # kmer coverage C_k
    _kf_ave_uniq = _nk_good_uniq * 1.0 / _ndk_good_uniq
    _genome_size = (_nk_total - _nk_bad_low_kf - _nk_bad_high_kf) / _kf_ave_uniq
    _genome_size_unique = _ndk_good_uniq + _ndk_good_snp / 2
    _genome_size_repetitive = _genome_size - _genome_size_unique
    _coverage = _nk_total / _genome_size if _genome_size else 0

    # SNP rate estimation, assumes uniform distribution of SNPs over the
    # genome and accounts for the reduction in SNP kmer counts when
    # polymorphism is very high
    # (_d_SNP is only defined, and only read, when ploidy == 2)
    if ploidy == 2:
        _d_SNP = (
            1.0 / (1.0 - (1.0 - 0.5 * _ndk_good_snp / _genome_size) ** (1.0 / K))
            if _ndk_good_snp > 0
            else 1000000
        )

    G = int(_genome_size)
    G1 = int(_genome_size_unique)
    GR = int(_genome_size_repetitive)
    coverage = int(_coverage)

    m = "Kmer (K={0}) Spectrum Analysis\n".format(K)
    m += "Genome size estimate = {0}\n".format(thousands(G))
    m += "Genome size estimate CN = 1 = {0} ({1})\n".format(thousands(G1), percentage(G1, G))
    m += "Genome size estimate CN > 1 = {0} ({1})\n".format(thousands(GR), percentage(GR, G))
    m += "Coverage estimate: {0} x\n".format(coverage)
    # NOTE(review): divides by G; a genome size estimate below 1 makes G
    # zero and raises ZeroDivisionError here.
    self.repetitive = "Repeats: {0} percent".format(GR * 100 / G)

    if ploidy == 2:
        d_SNP = int(_d_SNP)
        self.snprate = "SNP rate ~= 1/{0}".format(d_SNP)
    else:
        self.snprate = "SNP rate not computed (Ploidy = {0})".format(ploidy)
    m += self.snprate + "\n"

    # Separate, simpler estimate: total kmers divided by the main peak
    self.genomesize = int(round(self.totalKmers * 1.0 / self.max2))

    print >> sys.stderr, m
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from itertools import cycle

    from jcvi.formats.bed import mates, bedpe

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int",
                 help="Limit ymax [default: %default]")
    p.add_option("--spans", default=False, action="store_true",
                 help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    yy = .9
    # Cycle through the six colors: a plain zip() against "rgbcky" stops at
    # the shorter input and silently dropped every BED file after the sixth.
    for bedfile, c in zip(bedfiles, cycle("rgbcky")):
        if not opts.spans:
            # Derive the clone-span BED, rebuilding intermediate files only
            # when need_update() says they are out of date
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, _ = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                _, bedspanfile = bedpe([bedfile, "--span",
                                        "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend,
                                               thousands(notcoveredbases))
        sys.stderr.write(msg + "\n")
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    # Floor division keeps the bin width a whole number of Kb on Python 3,
    # matching what "/" yields for integers on Python 2.
    ylabel = "Average depth per {0}Kb".format(size // bins // 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)