def main(args=None):
    """Plot, per sample, the cumulative fraction of reads over ranked bins.

    Arguments are parsed by ``process_args`` and reads are counted per bin
    with ``countR.CountReadsPerBin``. Each column of the resulting matrix
    (one per label in ``args.labels``) is sorted, accumulated and divided by
    its total, then plotted against the bin rank scaled to [0, 1). The
    figure is saved to ``args.plotFile.name``. When ``args.outRawCounts`` is
    set, the count matrix is written to it as tab-separated integers below a
    header line holding the quoted labels.

    :param args: value handed unchanged to ``process_args``.

    Exits with status 1 when the count matrix sums to zero.
    """
    # Imported at the top of the function so ``sys`` is bound for every
    # statement below.
    import sys

    args = process_args(args)
    cr = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    for idx, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        # float() forces true division so that integer counts are not
        # truncated to 0 when normalizing y from 0 to 1
        count = count / float(count[-1])
        plt.plot(x, count, label=args.labels[idx])

    # the axis labels are the same for every sample: set them once
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
def main(args=None):
    """Plot, per sample, the cumulative fraction of reads over ranked bins.

    Arguments are parsed by ``process_args`` and reads are counted per bin
    with ``countR.CountReadsPerBin``. Each column of the resulting matrix
    (one per label in ``args.labels``) is sorted, accumulated and divided by
    its total, then plotted against the bin rank scaled to [0, 1). The
    figure is saved to ``args.plotFile.name`` and then closed. When
    ``args.outRawCounts`` is set, the count matrix is written to it as
    tab-separated integers below a header line holding the quoted labels.

    :param args: value handed unchanged to ``process_args``.

    Exits with status 1 when the count matrix sums to zero.
    """
    # Imported at the top of the function so ``sys`` is bound for every
    # statement below.
    import sys

    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 args.binSize,
                                 args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    for idx, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        # float() forces true division so that integer counts are not
        # truncated to 0 when normalizing y from 0 to 1
        count = count / float(count[-1])
        plt.plot(x, count, label=args.labels[idx])

    # the axis labels are the same for every sample: set them once
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    # release the figure; otherwise it stays registered with pyplot and any
    # later plotting in the same process draws on top of it
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
def main(args=None):
    """Sample per-base read coverage and report it as text, table and plot.

    Arguments are parsed by ``process_args``. Coverage is counted with
    ``countR.CountReadsPerBin`` using ``binLength=1`` (optionally limited to
    ``args.BED`` regions). Depending on the options this function then:

    * writes, for every value in ``args.coverageThresholds``, the percentage
      of sampled positions with coverage >= that value to
      ``args.outCoverageMetrics``;
    * prepends a label header to the file named by ``args.outRawCounts``
      (the same name is passed to ``CountReadsPerBin`` as
      ``out_file_for_raw_data``);
    * prints mean, std, min, quartiles and max per sample to stdout;
    * draws the coverage distribution and its reverse cumulative
      distribution to ``args.plotFile``, with plotly when
      ``args.plotFileFormat == 'plotly'`` and with matplotlib otherwise.

    :param args: value handed unchanged to ``process_args``.

    Exits with an error message when none of the three outputs was requested
    or when fewer than two bins were returned.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n")

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        args.coverageThresholds.sort()  # Galaxy in particular tends to give things in a weird order
        nbins = float(num_reads_per_bin.shape[0])
        # ``with`` guarantees the file is closed even if a write fails
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            for thresh in args.coverageThresholds:
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. * val / nbins))

    if args.outRawCounts:
        # prepend the labels to the already generated file
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.plotFile:
        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'}
            fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'}
            fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'}
            fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'}
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # Choosing a sensible upper limit for the y axis of the first plot
    # (fraction of bases sampled vs. coverage) matters because, depending on
    # the data, the lines can become very hard to see. If the coverage of a
    # sample is a nice gaussian curve with a large mean of 50, a sensible y
    # range is (0, 0.02). If instead the coverage is very poor and centers
    # close to 1, a good y range is (0, 1).
    # For each sample we take the fraction of bases at the largest coverage x
    # for which more than 50% of the bases have coverage >= x; the maximum
    # over all samples becomes the y axis limit.
    y_max = []
    data = []
    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
    plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])

    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # number of sampled positions at each coverage value (computed once)
            bases_per_coverage = np.bincount(col.astype(int))
            frac_reads_per_coverage = bases_per_coverage.astype(float) / num_reads_per_bin.shape[0]
            csum = bases_per_coverage[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()

            if args.plotFileFormat == 'plotly':
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                # Build x from the actual number of y values: the index of
                # each y value is its coverage, so x and y must have the same
                # length (previously x was one element short).
                y_frac = frac_reads_per_coverage[:int(x_max)]
                y_csum = csum_frac[:int(x_max)]
                trace = go.Scatter(x=np.arange(len(y_frac)),
                                   y=y_frac,
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(len(y_csum)),
                                   y=y_csum,
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])

            # largest coverage x for which more than half of the sampled
            # bases have coverage >= x; keep the fraction of bases at that x
            y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])

        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    if args.plotFile:
        # Don't clip plots
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
def main(args=None):
    """Compute fingerprint curves and write plot, raw counts and metrics.

    Arguments are parsed by ``process_args`` and coverage is summed per bin
    with ``sumR.SumCoveragePerBin``. Depending on the options this function:

    * plots, per sample, the cumulative sum of the sorted bin values divided
      by its total against the bin rank scaled to [0, 1), using plotly when
      ``args.plotFileFormat == 'plotly'`` and matplotlib otherwise;
    * writes the bin matrix as tab-separated integers to
      ``args.outRawCounts``;
    * writes AUC, X-intercept, elbow point and the values returned by
      ``getExpected``, ``getSyntheticJSD`` (and, when ``args.JSDsample`` is
      set, ``getJSD`` and ``getCHANCE``) to ``args.outQualityMetrics``.

    :param args: value handed unchanged to ``process_args``.

    Exits with status 1 when no output was requested or when the bin matrix
    sums to zero.
    """
    # ``sys`` must be bound before its first use. A function-local
    # ``import sys`` placed after that use makes the name local for the
    # whole function and the early error path dies with UnboundLocalError.
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    if args.plotFile is not None:
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
        plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
        use_plotly = args.plotFileFormat == 'plotly'
        data = []
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            # float() forces true division when normalizing y from 0 to 1
            count = count / float(count[-1])
            if use_plotly:
                trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
                trace['line'].update(dash=plotly_line_styles[i % len(plotly_line_styles)],
                                     color=plotly_colors[i % len(plotly_colors)])
                data.append(trace)
            else:
                # wrap on the real list length; a fixed modulus larger than
                # the list raised IndexError for high sample indexes
                j = i % len(pyplot_line_styles)
                plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])
                plt.xlabel('rank')
                plt.ylabel('fraction w.r.t. bin with highest coverage')

        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['data'] = data
            fig['layout'].update(title=args.plotTitle)
            fig['layout']['xaxis1'].update(title="rank")
            fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            plt.legend(loc='upper left')
            plt.suptitle(args.plotTitle)
            plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
            plt.close()

    if args.outRawCounts is not None:
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        # ``with`` guarantees the file is closed even if a write fails
        with open(args.outRawCounts, "w") as of:
            of.write("#plotFingerprint --outRawCounts\n")
            of.write("'" + "'\t'".join(args.labels) + "'\n")
            for row in num_reads_per_bin:
                of.write(fmt % tuple(row))

    if args.outQualityMetrics is not None:
        with open(args.outQualityMetrics, "w") as of:
            of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
            if args.JSDsample:
                of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
            else:
                of.write("\tSynthetic JS Distance")
            of.write("\n")

            # straight line from 0 to 1 over the bin ranks; the elbow is the
            # rank at which a sample's curve lies furthest below it
            line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
                else:
                    syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                    of.write("\t{0}".format(syntheticJSD))
                of.write("\n")
def main(args=None):
    """Sample per-base read coverage and report it as text, table and plot.

    Arguments are parsed by ``process_args``. Coverage is counted with
    ``countR.CountReadsPerBin`` using ``binLength=1`` (optionally limited to
    ``args.BED`` regions). Depending on the options this function then:

    * writes, for every value in ``args.coverageThresholds``, the percentage
      of sampled positions with coverage >= that value to
      ``args.outCoverageMetrics``;
    * prepends a label header to the file named by ``args.outRawCounts``
      (the same name is passed to ``CountReadsPerBin`` as
      ``out_file_for_raw_data``);
    * prints mean, std, min, quartiles and max per sample to stdout;
    * draws the coverage distribution and its reverse cumulative
      distribution to ``args.plotFile``, with plotly when
      ``args.plotFileFormat == 'plotly'`` and with matplotlib otherwise.

    :param args: value handed unchanged to ``process_args``.

    Exits with an error message when none of the three outputs was requested
    or when fewer than two bins were returned.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n")

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        args.coverageThresholds.sort()  # Galaxy in particular tends to give things in a weird order
        nbins = float(num_reads_per_bin.shape[0])
        # ``with`` guarantees the file is closed even if a write fails
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            for thresh in args.coverageThresholds:
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. * val / nbins))

    if args.outRawCounts:
        # prepend the labels to the already generated file
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.plotFile:
        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'}
            fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'}
            fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'}
            fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'}
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # Choosing a sensible upper limit for the y axis of the first plot
    # (fraction of bases sampled vs. coverage) matters because, depending on
    # the data, the lines can become very hard to see. If the coverage of a
    # sample is a nice gaussian curve with a large mean of 50, a sensible y
    # range is (0, 0.02). If instead the coverage is very poor and centers
    # close to 1, a good y range is (0, 1).
    # For each sample we take the fraction of bases at the largest coverage x
    # for which more than 50% of the bases have coverage >= x; the maximum
    # over all samples becomes the y axis limit.
    y_max = []
    data = []
    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
    plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])

    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # number of sampled positions at each coverage value (computed once)
            bases_per_coverage = np.bincount(col.astype(int))
            frac_reads_per_coverage = bases_per_coverage.astype(float) / num_reads_per_bin.shape[0]
            csum = bases_per_coverage[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()

            if args.plotFileFormat == 'plotly':
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                # Build x from the actual number of y values: the index of
                # each y value is its coverage, so x and y must have the same
                # length (previously x was one element short).
                y_frac = frac_reads_per_coverage[:int(x_max)]
                y_csum = csum_frac[:int(x_max)]
                trace = go.Scatter(x=np.arange(len(y_frac)),
                                   y=y_frac,
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(len(y_csum)),
                                   y=y_csum,
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])

            # largest coverage x for which more than half of the sampled
            # bases have coverage >= x; keep the fraction of bases at that x
            y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])

        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    if args.plotFile:
        # Don't clip plots
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
def main(args=None):
    """Sample per-base read coverage, print summary statistics and plot it.

    Arguments are parsed by ``process_args`` and coverage is counted with
    ``countR.CountReadsPerBin`` using ``binLength=1``. The function then:

    * reports the number of bins used on stderr;
    * prepends a label header to the file named by ``args.outRawCounts``
      (the same name is passed to ``CountReadsPerBin`` as
      ``out_file_for_raw_data``) when that option is set;
    * prints mean, std, min, quartiles and max per sample to stdout;
    * plots the coverage distribution and its reverse cumulative
      distribution side by side and saves the figure to ``args.plotFile``.

    :param args: value handed unchanged to ``process_args``.

    Exits with an error message when fewer than two bins were returned.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend the labels to the already generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # ``with`` guarantees the file is closed even if a read/write fails
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # Choosing a sensible upper limit for the y axis of the first plot
    # (fraction of bases sampled vs. coverage) matters because, depending on
    # the data, the lines can become very hard to see. If the coverage of a
    # sample is a nice gaussian curve with a large mean of 50, a sensible y
    # range is (0, 0.02). If instead the coverage is very poor and centers
    # close to 1, a good y range is (0, 1).
    # For each sample we take the fraction of bases at the largest coverage x
    # for which more than 50% of the bases have coverage >= x.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # number of sampled positions at each coverage value (computed once)
        bases_per_coverage = np.bincount(col.astype(int))
        frac_reads_per_coverage = bases_per_coverage.astype(float) / num_reads_per_bin.shape[0]
        axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))

        csum = bases_per_coverage[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])

        # largest coverage x for which more than half of the sampled bases
        # have coverage >= x; keep the fraction of bases at that x
        y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])

        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    # A y-axis limit candidate was computed for each sample. The lowest one is
    # used so that distributions with a wider x-range can be seen better.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
def main(args=None):
    """Compute fingerprint curves and write plot, raw counts and metrics.

    Arguments are parsed by ``process_args`` and coverage is summed per bin
    with ``sumR.SumCoveragePerBin``. Depending on the options this function:

    * plots, per sample, the cumulative sum of the sorted bin values divided
      by its total against the bin rank scaled to [0, 1) and saves the
      figure to ``args.plotFile.name``;
    * writes the bin matrix as tab-separated integers to the
      ``args.outRawCounts`` file object and closes it;
    * writes AUC, X-intercept, elbow point and the values returned by
      ``getExpected`` (plus ``getJSD``, ``getSyntheticJSD`` and
      ``getCHANCE`` when ``args.JSDsample`` is set) to the
      ``args.outQualityMetrics`` file object and closes it.

    :param args: value handed unchanged to ``process_args``.

    Exits with status 1 when no output was requested or when the bin matrix
    sums to zero.
    """
    # ``sys`` must be bound before its first use. A function-local
    # ``import sys`` placed after that use makes the name local for the
    # whole function and the early error path dies with UnboundLocalError.
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(args.bamfiles,
                                args.binSize,
                                args.numberOfSamples,
                                blackListFileName=args.blackListFileName,
                                numberOfProcessors=args.numberOfProcessors,
                                verbose=args.verbose,
                                region=args.region,
                                extendReads=args.extendReads,
                                minMappingQuality=args.minMappingQuality,
                                ignoreDuplicates=args.ignoreDuplicates,
                                center_read=args.centerReads,
                                samFlag_include=args.samFlagInclude,
                                samFlag_exclude=args.samFlagExclude,
                                minFragmentLength=args.minFragmentLength,
                                maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    if args.plotFile:
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            # float() forces true division when normalizing y from 0 to 1
            count = count / float(count[-1])
            # wrap on the real list length; a fixed modulus larger than the
            # list raised IndexError for high sample indexes
            j = i % len(pyplot_line_styles)
            plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])
            plt.xlabel('rank')
            plt.ylabel('fraction w.r.t. bin with highest coverage')
        plt.legend(loc='upper left')
        plt.suptitle(args.plotTitle)

        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
        plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
        args.outRawCounts.close()

    if args.outQualityMetrics:
        args.outQualityMetrics.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
        if args.JSDsample:
            args.outQualityMetrics.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
        args.outQualityMetrics.write("\n")

        # straight line from 0 to 1 over the bin ranks; the elbow is the
        # rank at which a sample's curve lies furthest below it
        line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
        for idx, reads in enumerate(num_reads_per_bin.T):
            counts = np.cumsum(np.sort(reads))
            counts = counts / float(counts[-1])
            AUC = np.sum(counts) / float(len(counts))
            XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
            elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
            expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
            args.outQualityMetrics.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
            if args.JSDsample:
                JSD = getJSD(args, idx, num_reads_per_bin)
                syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                args.outQualityMetrics.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
            args.outQualityMetrics.write("\n")
        args.outQualityMetrics.close()
def main(args=None):
    """Sample per-base read coverage, print summary statistics and plot it.

    Arguments are parsed by ``process_args`` and coverage is counted with
    ``countR.CountReadsPerBin`` using ``binLength=1``. The function then:

    * reports the number of bins used on stderr;
    * prepends a label header to the file named by ``args.outRawCounts``
      (the same name is passed to ``CountReadsPerBin`` as
      ``out_file_for_raw_data``) when that option is set;
    * prints mean, std, min, quartiles and max per sample to stdout;
    * plots the coverage distribution and its reverse cumulative
      distribution side by side and saves the figure to ``args.plotFile``.

    :param args: value handed unchanged to ``process_args``.

    Exits with an error message when fewer than two bins were returned.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend the labels to the already generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # ``with`` guarantees the file is closed even if a read/write fails
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # Choosing a sensible upper limit for the y axis of the first plot
    # (fraction of bases sampled vs. coverage) matters because, depending on
    # the data, the lines can become very hard to see. If the coverage of a
    # sample is a nice gaussian curve with a large mean of 50, a sensible y
    # range is (0, 0.02). If instead the coverage is very poor and centers
    # close to 1, a good y range is (0, 1).
    # For each sample we take the fraction of bases at the largest coverage x
    # for which more than 50% of the bases have coverage >= x.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # number of sampled positions at each coverage value (computed once)
        bases_per_coverage = np.bincount(col.astype(int))
        frac_reads_per_coverage = bases_per_coverage.astype(float) / num_reads_per_bin.shape[0]
        axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))

        csum = bases_per_coverage[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])

        # largest coverage x for which more than half of the sampled bases
        # have coverage >= x; keep the fraction of bases at that x
        y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])

        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    # A y-axis limit candidate was computed for each sample. The lowest one is
    # used so that distributions with a wider x-range can be seen better.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
def main(args=None):
    """Plot the fingerprint of the input BAM files and export optional tables.

    The per-bin coverage matrix returned by ``sumR.SumCoveragePerBin.run()``
    is treated as one row per sampled region and one column per entry of
    ``args.labels``.  For every column the sorted, cumulative and normalised
    coverage is drawn against the normalised rank.  When requested, the raw
    counts and a table of quality metrics are written to the already opened
    ``args.outRawCounts`` / ``args.outQualityMetrics`` handles, which are
    closed afterwards.

    :param args: argument list forwarded to ``process_args``; ``None`` lets
                 ``process_args`` decide where the arguments come from.
    :raises SystemExit: with status 1 when no reads were found in any of the
                        sampled regions.
    """
    import sys

    args = process_args(args)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        # sys.exit rather than the interactive-only ``exit`` helper
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    # matplotlib won't iterate through line styles by itself.  Only real
    # line styles are listed: "." is a marker, not a line style, and is
    # rejected by matplotlib when passed as ``linestyle``.
    pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
    for i, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        # to normalize y from 0 to 1; float() matches the metrics section
        # below and avoids integer division
        count = count / float(count[-1])
        # wrap around so any number of samples gets a valid style
        j = i % len(pyplot_line_styles)
        plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])
    # axis labels are the same for every sample, so set them once
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)
    # set the plotFileFormat explicitly to None to trigger the
    # format from the file-extension
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
        args.outRawCounts.close()

    if args.outQualityMetrics:
        args.outQualityMetrics.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
        if args.JSDsample:
            args.outQualityMetrics.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
        args.outQualityMetrics.write("\n")
        # diagonal from 0 to 1; the elbow is where the curve is farthest below it
        line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
        for idx, reads in enumerate(num_reads_per_bin.T):
            counts = np.cumsum(np.sort(reads))
            counts = counts / float(counts[-1])
            AUC = np.sum(counts) / float(len(counts))
            # first rank (as a fraction) at which the cumulative curve is non-zero
            XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
            elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
            expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
            args.outQualityMetrics.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
            if args.JSDsample:
                JSD = getJSD(args, idx, num_reads_per_bin)
                syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                args.outQualityMetrics.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
            args.outQualityMetrics.write("\n")
        args.outQualityMetrics.close()
def main(args=None):
    """Compute fingerprints for the input BAM files and write the outputs.

    The per-bin coverage matrix returned by ``sumR.SumCoveragePerBin.run()``
    is treated as one row per sampled region and one column per entry of
    ``args.labels``.  Depending on the arguments, this writes a fingerprint
    plot (matplotlib, or plotly when ``args.plotFileFormat == 'plotly'``),
    a table of raw counts and/or a table of quality metrics.  Here
    ``args.plotFile``, ``args.outRawCounts`` and ``args.outQualityMetrics``
    are used as file names (they are passed to ``open`` / ``savefig``).

    :param args: argument list forwarded to ``process_args``; ``None`` lets
                 ``process_args`` decide where the arguments come from.
    :raises SystemExit: with status 1 when no output was requested or when
                        no reads were found in any of the sampled regions.
    """
    # Imported once at function scope.  A conditional ``import sys`` further
    # down would make ``sys`` a local name for the whole function and the
    # argument check below would fail with UnboundLocalError.
    import sys

    args = process_args(args)

    if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
        sys.exit(1)

    cr = sumR.SumCoveragePerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "For small genomes, decrease the --numberOfSamples.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        # sys.exit rather than the interactive-only ``exit`` helper
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    if args.plotFile is not None:
        # matplotlib won't iterate through line styles by itself
        pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
        plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
        # each dash style is repeated once per colour so that the
        # (colour, dash) combination only repeats after 36 samples
        plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
        use_plotly = args.plotFileFormat == 'plotly'
        data = []
        for i, reads in enumerate(num_reads_per_bin.T):
            count = np.cumsum(np.sort(reads))
            # to normalize y from 0 to 1; float() matches the metrics section
            # below and avoids integer division
            count = count / float(count[-1])
            if use_plotly:
                trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
                trace['line'].update(dash=plotly_line_styles[i % len(plotly_line_styles)],
                                     color=plotly_colors[i % len(plotly_colors)])
                data.append(trace)
            else:
                j = i % len(pyplot_line_styles)
                plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])

        # set the plotFileFormat explicitly to None to trigger the
        # format from the file-extension
        if not args.plotFileFormat:
            args.plotFileFormat = None

        if use_plotly:
            fig = go.Figure()
            fig['data'] = data
            fig['layout'].update(title=args.plotTitle)
            fig['layout']['xaxis1'].update(title="rank")
            fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            # axis labels are the same for every sample, so set them once
            plt.xlabel('rank')
            plt.ylabel('fraction w.r.t. bin with highest coverage')
            plt.legend(loc='upper left')
            plt.suptitle(args.plotTitle)
            plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
            plt.close()

    if args.outRawCounts is not None:
        # context manager: the file is closed even if a write fails
        with open(args.outRawCounts, "w") as of:
            of.write("#plotFingerprint --outRawCounts\n")
            of.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
            for row in num_reads_per_bin:
                of.write(fmt % tuple(row))

    if args.outQualityMetrics is not None:
        with open(args.outQualityMetrics, "w") as of:
            of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
            if args.JSDsample:
                of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
            else:
                of.write("\tSynthetic JS Distance")
            of.write("\n")
            # diagonal from 0 to 1; the elbow is where the curve is farthest below it
            line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
            for idx, reads in enumerate(num_reads_per_bin.T):
                counts = np.cumsum(np.sort(reads))
                counts = counts / float(counts[-1])
                AUC = np.sum(counts) / float(len(counts))
                # first rank (as a fraction) at which the cumulative curve is non-zero
                XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
                elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
                expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
                of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
                # the synthetic JSD is reported in both modes
                syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                if args.JSDsample:
                    JSD = getJSD(args, idx, num_reads_per_bin)
                    CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                    of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
                else:
                    of.write("\t{0}".format(syntheticJSD))
                of.write("\n")
def main(args=None):
    """Plot per-base coverage histograms for the input BAM files.

    Coverage is sampled with ``countR.CountReadsPerBin`` using a bin length
    of 1.  The returned matrix is treated as one row per sampled position
    and one column per entry of ``args.labels``.  Two panels are drawn: the
    fraction of sampled bases at each coverage value, and the fraction of
    sampled bases with at least that coverage.  Raw counts are written to
    ``args.outRawCounts`` when it is given.

    :param args: argument list forwarded to ``process_args``; ``None`` lets
                 ``process_args`` decide where the arguments come from.
    :raises SystemExit: when fewer than two bins were returned.
    """
    args = process_args(args)
    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # sys.exit rather than the interactive-only ``exit`` helper; a string
        # argument is printed to stderr and the exit status is 1
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # limit the x-axis (coverage) to the largest sample mean plus three
    # times the largest per-sample standard deviation
    sample_mean = num_reads_per_bin.mean(axis=0)
    std = max(num_reads_per_bin.std(axis=0))
    x_max = max(sample_mean) + 3 * std

    # plot coverage
    for idx, col in enumerate(num_reads_per_bin.T):
        # number of sampled bases observed at each integer coverage value;
        # computed once and shared by both panels
        coverage_hist = np.bincount(col.astype(int))
        axs[0].plot(coverage_hist.astype(float) / num_reads_per_bin.shape[0],
                    label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
        # reversed cumulative sum: bases with coverage >= x
        csum = coverage_hist[::-1].cumsum()
        axs[1].plot(csum.astype(float)[::-1] / csum.max(), label=args.labels[idx])

    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage')
    axs[0].legend()
    axs[0].set_ylabel('fraction of bases sampled')

    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend()

    plt.savefig(args.plotFile.name, format=args.plotFileFormat)
    # release the figure so repeated calls do not accumulate open figures
    plt.close(fig)