def allelefreq(args):
    """
    %prog allelefreq HD,DM1,SCA1,SCA17

    Plot the allele frequencies of some STRs.
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(allelefreq.__doc__)
    opts, args, iopts = parser.set_image_options(args, figsize="10x10")

    # Exactly one positional argument: a comma-separated list of loci.
    if len(args) != 1:
        sys.exit(not parser.print_help())
    (loci,) = args

    # 2x2 grid; panels are filled in reading order (A, B, C, D).
    fig, (top_row, bottom_row) = plt.subplots(
        ncols=2, nrows=2, figsize=(iopts.w, iopts.h)
    )
    (ax1, ax2), (ax3, ax4) = top_row, bottom_row
    plt.tight_layout(pad=4)

    # Only the data frame returned by read_treds() is needed here.
    _, table = read_treds()
    table = table.set_index(["abbreviation"])

    # zip() stops at the shorter input, so extra loci beyond four are ignored
    # and missing ones simply leave panels empty.
    panels = (ax1, ax2, ax3, ax4)
    for panel, locus in zip(panels, loci.split(",")):
        plot_allelefreq(panel, table, locus)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    left, top, middle = pad / 2, 1 - pad, 1 / 2.
    panel_labels(
        root,
        (
            (left, top, "A"),
            (middle, top, "B"),
            (left, middle, "C"),
            (middle, middle, "D"),
        ),
    )
    normalize_axes(root)

    savefig("allelefreq." + iopts.format, dpi=iopts.dpi, iopts=iopts)
def compare2(args):
    """
    %prog compare2

    Compare performances of various variant callers on simulated STR datasets.
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(compare2.__doc__)
    parser.add_option('--maxinsert', default=300, type="int",
                      help="Maximum number of repeats")
    add_simulate_options(parser)
    opts, args, iopts = parser.set_image_options(args, figsize="10x5")

    # This command takes no positional arguments.
    if len(args) != 0:
        sys.exit(not parser.print_help())

    max_insert = opts.maxinsert
    # Title suffix shared by both panels, built from the simulation options.
    suffix = r" ($D=%s\times, L=%dbp, V=%dbp$)" % (
        opts.depth, opts.readlen, opts.distance)

    fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1,
                                   figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=2)

    # (axes, title prefix, lobSTR file, TREDPARSE file, parse_results kwargs)
    # Panel A: haploid model; panel B: diploid model.
    panels = (
        (ax1, SIMULATED_HAPLOID,
         "lobstr_results_homo.txt", "tredparse_results_homo.txt", {}),
        (ax2, SIMULATED_DIPLOID,
         "lobstr_results_het.txt", "tredparse_results_het.txt",
         {"exclude": 20}),
    )
    for ax, prefix, lobstr_file, tredparse_file, kwargs in panels:
        lobstr_results = parse_results(lobstr_file, **kwargs)
        tredparse_results = parse_results(tredparse_file, **kwargs)
        plot_compare(ax, prefix + suffix, tredparse_results, lobstr_results,
                     max_insert=max_insert)

    # Same square range on both panels.
    for ax in (ax1, ax2):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    top = 1 - pad
    panel_labels(root, ((pad / 2, top, "A"), (1 / 2., top, "B")))
    normalize_axes(root)

    savefig("tredparse." + iopts.format, dpi=iopts.dpi, iopts=iopts)
def compare(args):
    """
    %prog compare Evaluation.csv

    Compare performances of various variant callers on simulated STR datasets.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(compare.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 1:
        sys.exit(not p.print_help())

    datafile, = args
    # Output image is named after the input file, minus its extension.
    pf = datafile.rsplit(".", 1)[0]
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    bbox = {'facecolor': 'tomato', 'alpha': .2, 'ec': 'w'}
    pad = 2

    # Read benchmark data from the file given on the command line (previously
    # the name "Evaluation.csv" was hard-coded and the argument was ignored).
    df = pd.read_csv(datafile)
    truth = df["Truth"]
    # Right edge for the threshold annotations, hoisted out of the loop.
    text_x = max(truth) - pad

    axes = (ax1, ax2, ax3, ax4)
    progs = ("Manta", "Isaac", "GATK", "lobSTR")
    markers = ("bx-", "yo-", "md-", "c+-")

    # One panel per caller: called repeat count against the truth.
    # NOTE(review): infected_thr and ref_thr are not defined in this function;
    # presumably module-level constants - confirm they exist in this file.
    for ax, prog, marker in zip(axes, progs, markers):
        ax.plot(truth, df[prog], marker)
        ax.plot(truth, truth, 'k--')  # to show diagonal
        ax.axhline(infected_thr, color='tomato')
        ax.text(text_x, infected_thr + pad, 'Risk threshold',
                bbox=bbox, ha="right")
        ax.axhline(ref_thr, color='tomato')
        ax.text(text_x, ref_thr - pad, 'Reference repeat count',
                bbox=bbox, ha="right", va="top")
        ax.set_title(SIMULATED_HAPLOID)
        ax.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
        ax.set_ylabel('Num of CAG repeats called')
        ax.legend([prog, 'Truth'], loc='best')

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def multilineplot(args):
    """
    %prog multilineplot fastafile chr1

    Combine multiple line plots in one vertical stack
    Inputs must be BED-formatted.

    --lines: traditional line plots, useful for plotting feature freq
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(multilineplot.__doc__)
    p.add_option("--lines",
                 help="Features to plot in lineplot [default: %default]")
    p.add_option("--colors",
                 help="List of colors matching number of input bed files")
    p.add_option("--mode", default="span", choices=("span", "count", "score"),
                 help="Accumulate feature based on [default: %default]")
    p.add_option("--binned", default=False, action="store_true",
                 help="Specify whether the input is already binned; " +
                 "if True, input files are considered to be binfiles")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) != 2:
        sys.exit(not p.print_help())

    # Everything below needs both options. Without this guard a missing
    # --lines surfaced as a NameError and a missing --colors as a TypeError.
    if not opts.lines or not opts.colors:
        sys.stderr.write("Both --lines and --colors must be specified\n")
        sys.exit(not p.print_help())

    fastafile, seqid = args
    window, shift, _subtract = check_window_options(opts)

    # colors is indexed per input below (colors[i]).
    # NOTE(review): presumably a string of one-letter color codes, one per
    # input bed file - confirm against lineplot().
    colors = opts.colors
    lines = opts.lines.split(",")
    assert len(colors) == len(lines), "Number of chosen colors must match" + \
        " number of input bed files"
    linebeds = get_beds(lines, binned=opts.binned)
    linebins = get_binfiles(linebeds, fastafile, shift, mode=opts.mode,
                            binned=opts.binned)

    clen = Sizes(fastafile).mapping[seqid]
    nbins = get_nbins(clen, shift)

    # Hide tick marks and apply the requested figure size globally.
    plt.rcParams["xtick.major.size"] = 0
    plt.rcParams["ytick.major.size"] = 0
    plt.rcParams["figure.figsize"] = iopts.w, iopts.h

    fig, axarr = plt.subplots(nrows=len(lines))
    # With a single row plt.subplots returns a bare Axes, not a sequence.
    if len(linebeds) == 1:
        axarr = (axarr, )
    fig.suptitle(seqid, color="darkslategray")

    # One stacked panel per input track.
    for i, ax in enumerate(axarr):
        lineplot(ax, [linebins[i]], nbins, seqid, window, shift,
                 color="{0}{1}".format(colors[i], 'r'))

    plt.subplots_adjust(hspace=0.5)

    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def compare(args):
    """
    %prog compare Evaluation.csv

    Compare performances of various variant callers on simulated STR datasets.
    """
    # Use this function's docstring as usage text, like the other commands
    # (previously the module-level __doc__ was passed).
    p = OptionParser(compare.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="15x5")

    if len(args) != 1:
        sys.exit(not p.print_help())

    datafile, = args
    # Output image is named after the input file, minus its extension.
    pf = datafile.rsplit(".", 1)[0]
    fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1,
                                        figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=2)

    # Huntington risk allele
    infected_thr = 40
    ref_thr = 19

    # ax1: Multiple callers at lower range
    # Read the file given on the command line (previously the name
    # "Evaluation.csv" was hard-coded and the argument was ignored).
    df = pd.read_csv(datafile)
    truth = df["Truth"]

    ax1.plot(truth, df["Manta"], 'bx-')
    ax1.plot(truth, df["Isaac"], 'yo-')
    ax1.plot(truth, df["GATK"], 'md-')
    ax1.plot(truth, df["lobSTR"], 'c+-')
    ax1.plot(truth, truth, 'k--')  # to show diagonal

    bbox = {'facecolor': 'tomato', 'alpha': .2, 'ec': 'w'}
    pad = 2
    ax1.axhline(infected_thr, color='tomato')
    ax1.text(max(truth) - pad, infected_thr + pad, 'Risk threshold',
             bbox=bbox, ha="right")
    ax1.axhline(ref_thr, color='tomato')
    ax1.text(max(truth) - pad, ref_thr - pad, 'Reference repeat count',
             bbox=bbox, ha="right", va="top")
    ax1.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax1.set_ylabel('Num of CAG repeats called')
    ax1.set_title(r'Simulated haploid $\mathit{h}$')
    ax1.legend(['Manta', 'Isaac', 'GATK', 'lobSTR', 'Truth'], loc='best')

    max_insert = 120
    # ax2: lobSTR vs TREDPARSE with haploid model
    lobstr_results = parse_results("lobstr_results_homo.txt")
    tredparse_results = parse_results("tredparse_results_homo.txt")
    truth = range(10, max_insert + 1)
    lx, ly = zip(*lobstr_results)
    tx, ty = zip(*tredparse_results)
    ax2.plot(lx, ly, 'c+-')
    ax2.plot(tx, ty, 'gx-')
    ax2.plot(truth, truth, 'k--')
    ax2.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax2.set_ylabel('Num of CAG repeats called')
    ax2.set_title(r'Simulated haploid $\mathit{h}$')
    ax2.legend(['lobSTR', 'TREDPARSE', 'Truth'], loc='best')
    # Larger annotation offset for the wider x-range of panels B and C.
    pad *= 2
    ax2.axhline(infected_thr, color='tomato')
    ax2.text(max(truth) - pad, infected_thr + pad, 'Risk threshold',
             bbox=bbox, ha="right")
    ax2.set_xlim(10, max_insert)

    # ax3: lobSTR vs TREDPARSE with diploid model
    lobstr_results = parse_results("lobstr_results_het.txt", exclude=20)
    tredparse_results = parse_results("tredparse_results_het.txt", exclude=20)
    truth = range(10, max_insert + 1)
    lx, ly = zip(*lobstr_results)
    tx, ty = zip(*tredparse_results)
    ax3.plot(lx, ly, 'c+-')
    ax3.plot(tx, ty, 'gx-')
    ax3.plot(truth, truth, 'k--')
    ax3.set_xlabel(r'Num of CAG repeats inserted ($\mathit{h}$)')
    ax3.set_ylabel('Num of CAG repeats called')
    ax3.set_title(r'Simulated diploid $\mathit{20/h}$')
    ax3.legend(['lobSTR', 'TREDPARSE', 'Truth'], loc='best')
    ax3.axhline(infected_thr, color='tomato')
    ax3.text(max(truth) - pad, infected_thr + pad, 'Risk threshold',
             bbox=bbox, ha="right")
    ax3.set_xlim(10, max_insert)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 3., 1 - pad, "B"),
                        (2 / 3., 1 - pad, "C")))
    normalize_axes(root)

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to crop image.
    """
    # The docstring above doubles as the command-line usage text.
    # Returns the list of Seed objects built for the detected regions.
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pngfile, = args
    # Output prefix: explicit --prefix, else the image basename sans extension.
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != '.':
        mkdir(outdir)
    if calib:
        # Calibration file is JSON carrying these two keys.
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)

    resizefile, mainfile, labelfile, exif = \
        convert_image(pngfile, pf, outdir=outdir,
                      rotate=opts.rotate,
                      rows=rows, cols=cols,
                      labelrows=labelrows, labelcols=labelcols)

    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, nrows=1,
                                             figsize=(iopts.w, iopts.h))

    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    # NOTE(review): any other filter name leaves `edges` unbound; presumably
    # the option parser restricts the choices - confirm.
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    # A kernel of 0 skips morphological closing.
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance, threshold_rel=.05, indices=False)
        coordinates = peak_local_max(distance, threshold_rel=.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    # w and h are the first and second dimensions of the gray image; they
    # are used as the y and x extents respectively when setting limits below.
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug("Find objects with pixels between {0} ({1}%) and {2} ({3}%)"
                  .format(min_size, opts.minsize, max_size, opts.maxsize))

    # Plotting
    ax1.set_title('Original picture')
    ax1.imshow(oimg)

    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    params = r"{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title('Edge detection\n({0})'.format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], 'g.')
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title('Object detection')
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))
    objects = []
    # Seeds are numbered from 1; stop once --count objects were processed.
    for i, props in enumerate(rp, start=1):
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        # Draw the major and minor axes through the centroid.
        ax2.plot((x0 - major_dx, x0 + major_dx),
                 (y0 + major_dy, y0 - major_dy), 'r-')
        ax2.plot((x0 - minor_dx, x0 + minor_dx),
                 (y0 - minor_dy, y0 + minor_dy), 'r-')

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * .35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d):(y0d + d), (x0d - d):(x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug("Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%"
                      .format(i, npixels, len(pixels),
                              100. * npixels / canvas_size))

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        # Outline and number the object on the detection panel.
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle((minc, minr), maxc - minc, maxr - minr,
                         fill=False, ec='w', lw=1)
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc, mr, "{0}".format(i), color='w',
                 ha="center", va="center", size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(.1, .92, "File: {0}".format(latex(filename)), color='g')
    ax4.text(.1, .86, "Label: {0}".format(latex(accession)), color='m')
    yy = .8
    # NOTE(review): fw is never closed here; must_open may hand back a
    # standard stream, so it is deliberately left open - confirm.
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)
    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        # Every seed is written to the outfile; only the first 7 are drawn.
        if i > 7:
            continue
        ax4.text(.01, yy, str(i), va="center", bbox=dict(fc='none', ec='k'))
        ax4.text(.1, yy, o.pixeltag, va="center")
        yy -= .04
        ax4.add_patch(Rectangle((.1, yy - .025), .12, .05, lw=0,
                                fc=rgb_to_hex(o.rgb)))
        ax4.text(.27, yy, o.hashtag, va="center")
        yy -= .06
    ax4.text(.1, yy, "(A total of {0} objects displayed)".format(nb_labels),
             color="darkslategrey")
    normalize_axes(ax4)

    # Integer tick labels on the three image panels.
    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family='Helvetica', size=8)
        ax.set_yticklabels(yticklabels, family='Helvetica', size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to crop image.
    """
    # The docstring above doubles as the command-line usage text.
    # Returns the list of Seed objects built for the detected regions.
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (pngfile,) = args
    # Output prefix: explicit --prefix, else the image basename sans extension.
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != ".":
        mkdir(outdir)
    if calib:
        # Calibration file is JSON carrying these two keys.
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)

    # From here on pngfile is whatever convert_background returns.
    nbcolor = opts.changeBackground
    pngfile = convert_background(pngfile, nbcolor)

    resizefile, mainfile, labelfile, exif = convert_image(
        pngfile,
        pf,
        outdir=outdir,
        rotate=opts.rotate,
        rows=rows,
        cols=cols,
        labelrows=labelrows,
        labelcols=labelcols,
    )

    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(
        ncols=4, nrows=1, figsize=(iopts.w, iopts.h)
    )

    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    # NOTE(review): any other filter name leaves `edges` unbound; presumably
    # the option parser restricts the choices - confirm.
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    # A kernel of 0 skips morphological closing.
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance, threshold_rel=0.05, indices=False)
        coordinates = peak_local_max(distance, threshold_rel=0.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    # w and h are the first and second dimensions of the gray image; they
    # are used as the y and x extents respectively when setting limits below.
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug(
        "Find objects with pixels between {0} ({1}%) and {2} ({3}%)".format(
            min_size, opts.minsize, max_size, opts.maxsize
        )
    )

    # Plotting
    ax1.set_title("Original picture")
    ax1.imshow(oimg)

    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    params = r"{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title("Edge detection\n({0})".format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], "g.")
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title("Object detection")
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))
    objects = []
    # Seeds are numbered from 1; stop once --count objects were processed.
    for i, props in enumerate(rp, start=1):
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        # Draw the major and minor axes through the centroid.
        ax2.plot((x0 - major_dx, x0 + major_dx),
                 (y0 + major_dy, y0 - major_dy), "r-")
        ax2.plot((x0 - minor_dx, x0 + minor_dx),
                 (y0 - minor_dy, y0 + minor_dy), "r-")

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * 0.35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d):(y0d + d), (x0d - d):(x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug(
            "Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%".format(
                i, npixels, len(pixels), 100.0 * npixels / canvas_size
            )
        )

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        # Outline and number the object on the detection panel.
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle(
            (minc, minr), maxc - minc, maxr - minr, fill=False, ec="w", lw=1
        )
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc, mr, "{0}".format(i), color="w",
                 ha="center", va="center", size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(0.1, 0.92, "File: {0}".format(latex(filename)), color="g")
    ax4.text(0.1, 0.86, "Label: {0}".format(latex(accession)), color="m")
    yy = 0.8
    # NOTE(review): fw is never closed here; must_open may hand back a
    # standard stream, so it is deliberately left open - confirm.
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)
    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        # Every seed is written to the outfile; only the first 7 are drawn.
        if i > 7:
            continue
        ax4.text(0.01, yy, str(i), va="center", bbox=dict(fc="none", ec="k"))
        ax4.text(0.1, yy, o.pixeltag, va="center")
        yy -= 0.04
        ax4.add_patch(
            Rectangle((0.1, yy - 0.025), 0.12, 0.05, lw=0, fc=rgb_to_hex(o.rgb))
        )
        ax4.text(0.27, yy, o.hashtag, va="center")
        yy -= 0.06
    ax4.text(
        0.1,
        yy,
        "(A total of {0} objects displayed)".format(nb_labels),
        color="darkslategray",
    )
    normalize_axes(ax4)

    # Integer tick labels on the three image panels.
    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family="Helvetica", size=8)
        ax.set_yticklabels(yticklabels, family="Helvetica", size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
def compare4(args):
    """
    %prog compare4

    Compare performances of various variant callers on simulated STR datasets.
    Adds coverage comparisons as panel C and D.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(compare4.__doc__)
    p.add_option('--maxinsert', default=300, type="int",
                 help="Maximum number of repeats")
    add_simulate_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 0:
        sys.exit(not p.print_help())

    depth = opts.depth
    max_insert = opts.maxinsert
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    # NOTE(review): panels A and B label the depth from --depth, but the
    # input file names are fixed at 20x - confirm this is intended.

    # ax1: lobSTR vs TREDPARSE with haploid model
    lobstr_results = parse_results("lobstr_results_homo-20x-150bp-500bp.txt")
    tredparse_results = parse_results(
        "tredparse_results_homo-20x-150bp-500bp.txt")
    # Closing "$" added: the math segment was left unterminated, unlike the
    # titles of the other three panels.
    title = SIMULATED_HAPLOID + r" ($Depth=%s\times$)" % depth
    plot_compare(ax1, title, tredparse_results, lobstr_results,
                 max_insert=max_insert)

    # ax2: lobSTR vs TREDPARSE with diploid model (depth=20x)
    lobstr_results = parse_results("lobstr_results_het-20x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-20x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % depth
    plot_compare(ax2, title, tredparse_results, lobstr_results,
                 max_insert=max_insert)

    # ax3: lobSTR vs TREDPARSE with diploid model (depth=5x)
    lobstr_results = parse_results("lobstr_results_het-5x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-5x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % 5
    plot_compare(ax3, title, tredparse_results, lobstr_results,
                 max_insert=max_insert)

    # ax4: lobSTR vs TREDPARSE with diploid model (depth=80x)
    lobstr_results = parse_results("lobstr_results_het-80x-150bp-500bp.txt",
                                   exclude=20)
    tredparse_results = parse_results(
        "tredparse_results_het-80x-150bp-500bp.txt", exclude=20)
    title = SIMULATED_DIPLOID + r" ($Depth=%s\times$)" % 80
    plot_compare(ax4, title, tredparse_results, lobstr_results,
                 max_insert=max_insert)

    # Same square range on every panel.
    for ax in (ax1, ax2, ax3, ax4):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = "tredparse." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def compare3(args):
    """
    %prog compare3

    Compare performances of various variant callers on simulated STR datasets.
    This compares the power of various evidence types.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(compare3.__doc__)
    p.add_option('--maxinsert', default=300, type="int",
                 help="Maximum number of repeats")
    add_simulate_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="10x10")

    if len(args) != 0:
        sys.exit(not p.print_help())

    max_insert = opts.maxinsert
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2,
                                                 figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=3)

    color = "lightslategray"
    # ax1: Spanning
    # NOTE(review): unlike the other three panels, this one is parsed without
    # exclude=20 - confirm that is intentional.
    tredparse_results = parse_results("tredparse_results_het-spanning.txt")
    # Title fixed from "( Sub-model 1..." so the space sits before the
    # parenthesis, matching panels B-D.
    title = SIMULATED_DIPLOID + " (Sub-model 1: Spanning reads)"
    plot_compare(ax1, title, tredparse_results, None, color=color,
                 max_insert=max_insert, risk=False)

    # ax2: Partial
    tredparse_results = parse_results("tredparse_results_het-partial.txt",
                                      exclude=20)
    title = SIMULATED_DIPLOID + " (Sub-model 2: Partial reads)"
    plot_compare(ax2, title, tredparse_results, None, color=color,
                 max_insert=max_insert, risk=False)

    # ax3: Repeat
    tredparse_results = parse_results("tredparse_results_het-repeat.txt",
                                      exclude=20)
    # HACK (repeat reads won't work under 50)
    tredparse_results = [x for x in tredparse_results if x[0] > 50]
    title = SIMULATED_DIPLOID + " (Sub-model 3: Repeat-only reads)"
    plot_compare(ax3, title, tredparse_results, None, color=color,
                 max_insert=max_insert, risk=False)

    # ax4: Pair
    tredparse_results = parse_results("tredparse_results_het-pair.txt",
                                      exclude=20)
    title = SIMULATED_DIPLOID + " (Sub-model 4: Paired-end reads)"
    plot_compare(ax4, title, tredparse_results, None, color=color,
                 max_insert=max_insert, risk=False)

    # Same square range on every panel.
    for ax in (ax1, ax2, ax3, ax4):
        ax.set_xlim(0, max_insert)
        ax.set_ylim(0, max_insert)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .03
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B"),
                        (pad / 2, 1 / 2., "C"), (1 / 2., 1 / 2., "D")))
    normalize_axes(root)

    image_name = "tredparse." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    # Reduce each block to the extreme order entries of its members on the
    # query side and on the subject side.
    for ib in blocks:
        q, s, _ = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        # Self comparison: both sides of a block lie on the same genome.
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        # Close the depth file before the early return; it used to be left
        # open on this path.
        if depthfile:
            fw.close()
            logging.debug("Depth written to `{0}`.".format(depthfile))
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    # NOTE(review): plt.subplots() creates its own figure, so the (6, 3) size
    # requested on the next line may not apply to the saved plot - confirm.
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    # Key views cannot be concatenated with "+" on Python 3; build lists first.
    xmax = opts.xmax or max(4, max(list(dsq.keys()) + list(dss.keys())))
    if opts.quota:
        # Quota is given as "subject:query".
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
        .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    # partition() tolerates a user-supplied --title without a newline, which
    # made the previous two-way split() unpacking raise ValueError.
    vs, _, pattern = title.partition('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
              color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
def depth(args):
    """
    %prog depth anchorfile --qbed qbedfile --sbed sbedfile

    Calculate the depths in the two genomes in comparison, given in --qbed and
    --sbed. The synteny blocks will be layered on the genomes, and the
    multiplicity will be summarized to stderr.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.utils.range import range_depth

    p = OptionParser(depth.__doc__)
    p.add_option("--depthfile",
                 help="Generate file with gene and depth [default: %default]")
    p.add_option("--histogram", default=False, action="store_true",
                 help="Plot histograms in PDF")
    p.add_option("--xmax", type="int", help="x-axis maximum to display in plot")
    p.add_option("--title", default=None, help="Title to display in plot")
    p.add_option("--quota", help="Force to use this quota, e.g. 1:1, 1:2 ...")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    depthfile = opts.depthfile
    ac = AnchorFile(anchorfile)
    qranges = []
    sranges = []
    blocks = ac.blocks
    # Reduce each block to the extreme order entries of its members on the
    # query side and on the subject side.
    for ib in blocks:
        q, s, _ = zip(*ib)
        q = [qorder[x] for x in q]
        s = [sorder[x] for x in s]
        qrange = (min(q)[0], max(q)[0])
        srange = (min(s)[0], max(s)[0])
        qranges.append(qrange)
        sranges.append(srange)
        # Self comparison: both sides of a block lie on the same genome.
        if is_self:
            qranges.append(srange)

    qgenome = op.basename(qbed.filename).split(".")[0]
    sgenome = op.basename(sbed.filename).split(".")[0]
    qtag = "Genome {0} depths".format(qgenome)
    # print() function form replaces the Python 2 "print >>" statement, which
    # fails wherever print is a function.
    print("{}:".format(qtag), file=sys.stderr)
    dsq, details = range_depth(qranges, len(qbed))
    if depthfile:
        fw = open(depthfile, "w")
        write_details(fw, details, qbed)

    if is_self:
        # Close the depth file before the early return; it used to be left
        # open on this path.
        if depthfile:
            fw.close()
            logging.debug("Depth written to `{0}`.".format(depthfile))
        return

    stag = "Genome {0} depths".format(sgenome)
    print("{}:".format(stag), file=sys.stderr)
    dss, details = range_depth(sranges, len(sbed))
    if depthfile:
        write_details(fw, details, sbed)
        fw.close()
        logging.debug("Depth written to `{0}`.".format(depthfile))

    if not opts.histogram:
        return

    from jcvi.graphics.base import plt, quickplot_ax, savefig, normalize_axes

    # Plot two histograms one for query genome, one for subject genome
    # NOTE(review): plt.subplots() creates its own figure, so the (6, 3) size
    # requested on the next line may not apply to the saved plot - confirm.
    plt.figure(1, (6, 3))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    # Key views cannot be concatenated with "+" on Python 3; build lists first.
    xmax = opts.xmax or max(4, max(list(dsq.keys()) + list(dss.keys())))
    if opts.quota:
        # Quota is given as "subject:query".
        speak, qpeak = opts.quota.split(":")
        qpeak, speak = int(qpeak), int(speak)
    else:
        qpeak = find_peak(dsq)
        speak = find_peak(dss)

    qtag = "# of {} blocks per {} gene".format(sgenome, qgenome)
    stag = "# of {} blocks per {} gene".format(qgenome, sgenome)
    quickplot_ax(ax1, dss, 0, xmax, stag, ylabel="Percentage of genome",
                 highlight=range(1, speak + 1))
    quickplot_ax(ax2, dsq, 0, xmax, qtag, ylabel=None,
                 highlight=range(1, qpeak + 1))

    title = opts.title or "{} vs {} syntenic depths\n{}:{} pattern"\
        .format(qgenome, sgenome, speak, qpeak)
    root = f.add_axes([0, 0, 1, 1])
    # partition() tolerates a user-supplied --title without a newline, which
    # made the previous two-way split() unpacking raise ValueError.
    vs, _, pattern = title.partition('\n')
    root.text(.5, .97, vs, ha="center", va="center", color="darkslategray")
    root.text(.5, .925, pattern, ha="center", va="center",
              color="tomato", size=16)
    print(title, file=sys.stderr)

    normalize_axes(root)

    pf = anchorfile.rsplit(".", 1)[0] + ".depth"
    image_name = pf + ".pdf"
    savefig(image_name)
def likelihood(args):
    """
    %prog likelihood

    Plot likelihood surface. Look for two files in the current folder:
    - 100_100.log, haploid model
    - 100_20.log, diploid model
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(likelihood.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="10x5",
                                            style="white", cmap="coolwarm")

    if len(args) != 0:
        sys.exit(not p.print_help())

    fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1,
                                   figsize=(iopts.w, iopts.h))
    plt.tight_layout(pad=4)

    # Haploid model
    LL, CI_h1, CI_h2, MLE = parse_log("100_100.log")
    # Keep only the first key component, paired with its log-likelihood,
    # sorted for plotting as a curve.
    data = sorted((k[0], v) for k, v in LL.items())
    x, y = zip(*data)
    x = np.array(x)
    curve, = ax1.plot(x, y, "-", color=lsg, lw=2)
    ax1.set_title("Simulated haploid ($h^{truth}=100$)")

    # Points with the highest and the lowest log-likelihood.
    h_hat, max_LL = max(data, key=lambda kv: kv[-1])
    _, min_LL = min(data, key=lambda kv: kv[-1])
    ymin, ymax = ax1.get_ylim()
    # Extra headroom for the annotation above the peak.
    ax1.set_ylim([ymin, ymax + 30])

    LL_label = "log(Likelihood)"
    ax1.plot([h_hat, h_hat], [ymin, max_LL], ":", color=lsg, lw=2)
    # Label the peak with the computed estimate (previously hard-coded as 93).
    ax1.text(h_hat, max_LL + 10, r"$\hat{h}=%s$" % (h_hat,), color=lsg)
    ax1.set_xlabel(r"$h$")
    ax1.set_ylabel(LL_label)

    # Shade the area under the curve between the bounds in CI_h1.
    a, b = CI_h1
    ci = ax1.fill_between(x, [ymin] * len(x), y, where=(x >= a) & (x <= b),
                          color=lsg, alpha=.5)
    ax1.legend([curve, ci], ["Likelihood curve", r'95$\%$ CI'], loc='best')

    # Diploid model
    LL, CI_h1, CI_h2, MLE = parse_log("100_20.log")
    # NOTE(review): cells without a value are filled with min_LL from the
    # HAPLOID curve above, exactly as before (the original recomputed it from
    # the same haploid data here) - confirm the diploid minimum was not meant.
    data = np.ones((301, 301)) * min_LL
    # Fill the matrix symmetrically from the (a, b) keyed log-likelihoods.
    for k, v in LL.items():
        a, b = k
        data[a, b] = v
        data[b, a] = v

    data = mask_upper_triangle(data)
    ax_imshow(ax2, data, opts.cmap, LL_label, 20, 104)

    # Full-figure overlay axes that carries the panel letters.
    root = fig.add_axes([0, 0, 1, 1])
    pad = .04
    panel_labels(root, ((pad / 2, 1 - pad, "A"), (1 / 2., 1 - pad, "B")))
    normalize_axes(root)

    image_name = "likelihood." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)