def setUp(self):
    """
    Build the CRAM-backed fixtures used by the tests.

    Same layout as the BAM fixtures, but the alignments are read from
    CRAM files. The reads in the two files cover 200 bp and are
    distributed roughly as in this schematic::

          0                              100                           200
          |------------------------------------------------------------|
        A                                ==============>
                                                        <==============

        B                 <==============               ==============>
                                         ==============>
                                                        ==============>
    """
    root = ROOT
    self.root = root
    self.bamFile1 = root + "testA.cram"
    self.bamFile2 = root + "testB.cram"
    self.bamFile_PE = root + "test_paired2.cram"
    self.chrom = '3R'

    # 25 bp bins, one every 50 bp.
    self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2],
                                 stepSize=50,
                                 binLength=25)
def main(args=None):
    """
    Plot the cumulative read-count "fingerprint" of each BAM file.

    Reads are counted in sampled bins, and for every sample the sorted,
    cumulative counts (normalized to 1) are plotted against the normalized
    bin rank. Optionally the raw per-bin counts are written as a
    tab-separated table.

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Exits with status 1 when no reads at all were found in the sampled
    bins. The figure and the raw-counts handle are closed once written.
    """
    import sys

    args = process_args(args)

    cr = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    # One curve per sample (columns of the count matrix).
    for idx, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        # Normalize y from 0 to 1; float() guards against integer division.
        count = count / float(count[-1])
        plt.plot(x, count, label=args.labels[idx])

    # Axis labels are the same for every curve, so set them once.
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # Set plotFileFormat explicitly to None so that the format is taken
    # from the file extension.
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
        args.outRawCounts.close()
def test_count_reads_in_region_extension_1(self):
    """
    Counting with an extension shorter than the read length.

    In that situation read extension is switched off and a warning is
    printed, so the counts must equal the non-extended ones.
    """
    expected = np.array([[0, 0.],
                         [0, 1.],
                         [1, 1.],
                         [1, 2.]])

    self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2],
                                 extendReads=25,
                                 stepSize=50,
                                 binLength=1)

    counts, _ = self.c.count_reads_in_region(self.chrom, 0, 200)
    nt.assert_equal(counts, expected)
def test_bed_file(self):
    """
    Count reads of the second fixture file over three BED intervals.

    The BED file is written to a temporary path which is always removed,
    even when counting or the assertion fails.
    """
    import os
    import tempfile

    bed = "chr3R\t0\t10\nchr3R\t110\t120\nchr3R\t160\t180"
    bed_file = tempfile.NamedTemporaryFile(suffix=".bed", delete=False, mode="w")
    try:
        bed_file.write(bed)
        # Close first so the content is flushed before it is read by name.
        bed_file.close()

        self.c = cr.CountReadsPerBin([self.bamFile2], bedFile=[bed_file.name])
        resp = self.c.run()
        nt.assert_equal(resp, np.array([[0.], [1.], [2.]]))
    finally:
        bed_file.close()  # no-op when already closed
        os.unlink(bed_file.name)
def calculate_frip(bam, peakfile):
    '''Calculate the fraction of reads in peaks (FRiP) for a BAM file.

    Parameters
    ----------
    bam : str
        Path of the alignment file.
    peakfile : str
        Path of the peak (BED) file.

    Returns
    -------
    float or str
        Reads counted in the peaks divided by the number of mapped reads,
        or the string "NA" when the peak file has fewer than 10 lines.
    '''
    # Count the lines with a context manager so the handle is closed.
    with open(peakfile) as peaks:
        num_lines = sum(1 for _ in peaks)

    if num_lines < 10:
        return "NA"

    # access deeptools function to get reads in peaks
    cr = crpb.CountReadsPerBin([bam], bedFile=peakfile, numberOfProcessors=12)
    total = cr.run().sum(axis=0)

    # read alignments with pysam; only the mapped-read count is needed
    with pysam.AlignmentFile(bam) as alignments:
        mapped = alignments.mapped

    # calculate fraction of reads in peaks
    return float(total[0]) / mapped
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', blackListFileName=None,
                        numberOfProcessors=1, verbose=False,
                        chrsToSkip=None):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize. Exactly two files are required.
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found.
        NOTE(review): this argument is not used by the function body.
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    blackListFileName : str
        BED file containing blacklisted regions
    numberOfProcessors : int
        passed through to CountReadsPerBin
    verbose : bool
        passed through to CountReadsPerBin
    chrsToSkip : list
        name of the chromosomes to be excluded from the
        scale estimation. Usually the chrX is included.
        ``None`` (the default) means no chromosome is skipped.

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'

    Notes
    -----
    The process exits when counting fails or when the median coverage
    of a sample is zero.

    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples,  1)
    >>> _dict['size_factors']
    array([ 1. ,  0.5])
    >>> _dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """

    assert len(bamFilesList) == 2, \
        "SES scale factors are only defined for 2 files"

    # Avoid a shared mutable default: build a fresh list per call.
    if chrsToSkip is None:
        chrsToSkip = []

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    # The file with the fewest mapped reads gets factor 1.
    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()

    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input
    sorted_p = np.sort(num_reads_per_bin[0, :])
    sorted_q = np.sort(num_reads_per_bin[1, :])
    p = sorted_p.cumsum()
    q = sorted_q.cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(sorted_p[:maxIndex]),
               np.mean(sorted_q[:maxIndex])]

    # the maxIndex may be too close to the signal regions
    # so a more conservative approach is taken by using a close number
    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                # The two halves of this message used to be separate
                # statements, so only the first half was printed.
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
def main(args=None):
    """
    Compute per-base coverage statistics and, optionally, plot them.

    Coverage is counted with a bin length of 1. Depending on the options
    this writes a coverage-threshold table (``--outCoverageMetrics``),
    prepends a header to the raw counts file (``--outRawCounts``), prints
    summary statistics per sample to stdout and saves a two-panel plot
    (matplotlib or plotly) to ``--plotFile``.

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Exits when none of the three outputs is requested or when fewer than
    two bins were found.
    """
    args = process_args(args)

    if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics:
        sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n")

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 bedFile=bed_regions,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 bed_and_bin=True,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    if args.outCoverageMetrics and args.coverageThresholds:
        # Galaxy in particular tends to give things in a weird order
        args.coverageThresholds.sort()
        nbins = float(num_reads_per_bin.shape[0])
        with open(args.outCoverageMetrics, "w") as of:
            of.write("Sample\tThreshold\tPercent\n")
            for thresh in args.coverageThresholds:
                # Number of bins per sample with coverage >= threshold.
                vals = np.sum(num_reads_per_bin >= thresh, axis=0)
                for lab, val in zip(args.labels, vals):
                    of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. * val / nbins))

    if args.outRawCounts:
        # prepend the labels to the generated file
        header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.plotFile:
        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'}
            fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'}
            fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'}
            fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'}
            fig['layout'].update(title=args.plotTitle)
        else:
            fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
            plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # The determination of a sensible value for y_max of the first plot
    # (fraction of bases sampled vs. coverage) is important because,
    # depending on the data, it becomes very difficult to see the lines in
    # the plot. For example, if the coverage of a sample is a nice gaussian
    # curve with a large mean of 50, then a sensible range for the y axis
    # (fraction of reads having coverage=x) is (0, 0.02), which nicely shows
    # the coverage curve. If instead the coverage is very poor and centers
    # close to 1, then a good y axis range is (0, 1).
    # The current implementation finds, per sample, the y value at the
    # largest coverage for which more than 50% of the bases are >= that
    # coverage, and uses it to set the y axis range.
    y_max = []
    data = []

    # We need to manually set the line colors so they're shared between the two plots.
    plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
    plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])

    for idx, col in enumerate(num_reads_per_bin.T):
        if args.plotFile:
            # Histogram of coverage values, computed once and reused.
            coverage_hist = np.bincount(col.astype(int))
            frac_reads_per_coverage = coverage_hist.astype(float) / num_reads_per_bin.shape[0]
            # Reverse cumulative sum: fraction of bases with coverage >= x.
            csum = coverage_hist[::-1].cumsum()
            csum_frac = csum.astype(float)[::-1] / csum.max()
            if args.plotFileFormat == 'plotly':
                color = plotly_colors[idx % len(plotly_colors)]
                dash = plotly_styles[idx % len(plotly_styles)]
                trace = go.Scatter(x=np.arange(0, int(x_max) - 1),
                                   y=frac_reads_per_coverage[:int(x_max)],
                                   mode='lines',
                                   xaxis='x1',
                                   yaxis='y1',
                                   line=dict(color=color, dash=dash),
                                   name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]),
                                   legendgroup="{}".format(idx))
                data.append(trace)
                trace = go.Scatter(x=np.arange(0, int(x_max) - 1),
                                   y=csum_frac[:int(x_max)],
                                   mode='lines',
                                   xaxis='x2',
                                   yaxis='y2',
                                   line=dict(color=color, dash=dash),
                                   name=args.labels[idx],
                                   showlegend=False,
                                   legendgroup="{}".format(idx))
                data.append(trace)
            else:
                axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
                axs[1].plot(csum_frac, label=args.labels[idx])
            # find the indexes (i.e. the x values) for which the cumulative
            # distribution 'fraction of bases sampled >= coverage' is above
            # 50%: `np.flatnonzero(csum_frac > 0.5)`, then take the fraction
            # of bases sampled at the largest such x
            y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    if args.plotFile:
        # Don't clip plots
        y_max = max(y_max)
        if args.plotFileFormat == "plotly":
            fig['data'] = data
            fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))])
            fig['layout']['yaxis2'].update(range=[0.0, 1.0])
            py.plot(fig, filename=args.plotFile, auto_open=False)
        else:
            axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
            axs[0].set_xlim(0, x_max)
            axs[0].set_xlabel('coverage (#reads per bp)')
            axs[0].legend(fancybox=True, framealpha=0.5)
            axs[0].set_ylabel('fraction of bases sampled')
            # plot cumulative coverage
            axs[1].set_xlim(0, x_max)
            axs[1].set_xlabel('coverage (#reads per bp)')
            axs[1].set_ylabel('fraction of bases sampled >= coverage')
            axs[1].legend(fancybox=True, framealpha=0.5)
            plt.savefig(args.plotFile, format=args.plotFileFormat)
            plt.close()
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. save data for further plotting

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Exits with status 1 when fewer than two bam files are given, and
    with an error message when fewer than two bins were found.
    """
    args = process_args(args)

    if len(args.bamfiles) < 2:
        # Function-call form works on both Python 2 and Python 3.
        print("Please input at least two bam files to compare")
        sys.exit(1)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    stepsize = args.binSize + args.distanceBetweenBins
    c = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        numberOfSamples=None,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        stepSize=stepsize,
        zerosToNans=False,
        out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = c.run()

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # prepend the labels to the generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # The file is reopened by name to rewrite it with the header first.
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
        # Closed only when a raw-counts handle was actually given.
        args.outRawCounts.close()
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. save data for further plotting

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Writes a compressed numpy archive (keys ``matrix`` and ``labels``) to
    ``args.outFileName`` and, when requested, prepends a header line to
    the raw counts file. Exits when fewer than two bins were found.
    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bamfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single BAM file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    stepsize = args.binSize + args.distanceBetweenBins
    c = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        numberOfSamples=None,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        blackListFileName=args.blackListFileName,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength,
        stepSize=stepsize,
        zerosToNans=False,
        out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = c.run(allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    # numpy will append .npz to the file name if we don't do this...
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f,
                            matrix=num_reads_per_bin,
                            labels=args.labels)

    if args.outRawCounts:
        # prepend the labels to the generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
def main(args=None):
    """
    Compute per-base coverage statistics and plot them.

    Coverage is counted with a bin length of 1. Summary statistics per
    sample are printed to stdout, and a two-panel matplotlib figure
    (coverage distribution and its reverse cumulative) is saved to
    ``args.plotFile``. When ``--outRawCounts`` is given, a header line
    with the labels is prepended to that file.

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Exits with an error message when fewer than two bins were found.
    """
    args = process_args(args)

    cr = countR.CountReadsPerBin(args.bamfiles,
                                 binLength=1,
                                 numberOfSamples=args.numberOfSamples,
                                 numberOfProcessors=args.numberOfProcessors,
                                 verbose=args.verbose,
                                 region=args.region,
                                 blackListFileName=args.blackListFileName,
                                 extendReads=args.extendReads,
                                 minMappingQuality=args.minMappingQuality,
                                 ignoreDuplicates=args.ignoreDuplicates,
                                 center_read=args.centerReads,
                                 samFlag_include=args.samFlagInclude,
                                 samFlag_exclude=args.samFlagExclude,
                                 minFragmentLength=args.minFragmentLength,
                                 maxFragmentLength=args.maxFragmentLength,
                                 out_file_for_raw_data=args.outRawCounts)

    num_reads_per_bin = cr.run()

    sys.stderr.write("Number of non zero bins "
                     "used: {}\n".format(num_reads_per_bin.shape[0]))

    if args.outRawCounts:
        # prepend the labels to the generated file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    if num_reads_per_bin.shape[0] < 2:
        sys.exit("ERROR: too few non-zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # per-sample summary statistics (one value per column)
    num_reads_per_bin = num_reads_per_bin.astype(int)
    sample_mean = num_reads_per_bin.mean(axis=0)
    sample_std = num_reads_per_bin.std(axis=0)
    sample_max = num_reads_per_bin.max(axis=0)
    sample_min = num_reads_per_bin.min(axis=0)
    sample_25 = np.percentile(num_reads_per_bin, 25, axis=0)
    sample_50 = np.percentile(num_reads_per_bin, 50, axis=0)
    sample_75 = np.percentile(num_reads_per_bin, 75, axis=0)

    # use the largest 99th percentile from all samples to set the x_max value
    x_max = np.max(np.percentile(num_reads_per_bin, 99, axis=0))

    # print headers for text output
    print("sample\tmean\tstd\tmin\t25%\t50%\t75%\tmax")

    # The determination of a sensible value for y_max of the first plot
    # (fraction of bases sampled vs. coverage) is important because,
    # depending on the data, it becomes very difficult to see the lines in
    # the plot. For example, if the coverage of a sample is a nice gaussian
    # curve with a large mean of 50, then a sensible range for the y axis
    # (fraction of reads having coverage=x) is (0, 0.02), which nicely shows
    # the coverage curve. If instead the coverage is very poor and centers
    # close to 1, then a good y axis range is (0, 1).
    # The current implementation finds, per sample, the y value at the
    # largest coverage for which more than 50% of the bases are >= that
    # coverage, and uses it to set the y axis range.
    y_max = []
    for idx, col in enumerate(num_reads_per_bin.T):
        # Histogram of coverage values, computed once and reused.
        coverage_hist = np.bincount(col.astype(int))
        frac_reads_per_coverage = coverage_hist.astype(float) / num_reads_per_bin.shape[0]
        axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]))
        # Reverse cumulative sum: fraction of bases with coverage >= x.
        csum = coverage_hist[::-1].cumsum()
        csum_frac = csum.astype(float)[::-1] / csum.max()
        axs[1].plot(csum_frac, label=args.labels[idx])
        # find the indexes (i.e. the x values) for which the cumulative
        # distribution 'fraction of bases sampled >= coverage' is above
        # 50%: `np.flatnonzero(csum_frac > 0.5)`, then take the fraction of
        # bases sampled at the largest such x
        y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))])
        print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx],
                                                                  sample_mean[idx],
                                                                  sample_std[idx],
                                                                  sample_min[idx],
                                                                  sample_25[idx],
                                                                  sample_50[idx],
                                                                  sample_75[idx],
                                                                  sample_max[idx],
                                                                  ))

    # The 'good' y-axis limit is computed for each sample. The lower value
    # is favored, in which distributions with a wider x-range can better
    # be seen.
    y_max = min(y_max)
    axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
    axs[0].set_xlim(0, x_max)
    axs[0].set_xlabel('coverage (#reads per bp)')
    axs[0].legend(fancybox=True, framealpha=0.5)
    axs[0].set_ylabel('fraction of bases sampled')
    # plot cumulative coverage
    axs[1].set_xlim(0, x_max)
    axs[1].set_xlabel('coverage (#reads per bp)')
    axs[1].set_ylabel('fraction of bases sampled >= coverage')
    axs[1].legend(fancybox=True, framealpha=0.5)
    plt.savefig(args.plotFile, format=args.plotFileFormat)
    plt.close()
# Fraction of reads in peaks (FRiP) per bam file, kept as strings.
# A bam file whose peak file is empty gets "0".
frips = []
for idx, bam_file in enumerate(bam_file_list):
    peak_path = peak_file_list[idx]

    # Init
    frip = 0

    # Read first line (None when the peak file is empty)
    with open(peak_path, "r") as peak_handle:
        first_line = next(peak_handle, None)

    if first_line is not None:
        print("Calculating " + bam_file + " using " + peak_path)
        cr = crpb.CountReadsPerBin([bam_file],
                                   bedFile=[peak_path],
                                   numberOfProcessors=int(args.threads))

        # Calc the total number of reads in peaks per bam file
        reads_at_peaks = cr.run()
        total = reads_at_peaks.sum(axis=0)

        # Load up bam file and get the total number of mapped reads.
        # The handle is closed right away so handles do not pile up
        # across loop iterations.
        with pysam.AlignmentFile(bam_file) as bam:
            # Calc frip
            frip = float(total[0]) / bam.mapped

    frips.append(str(frip))

    # Log
]
# NOTE(review): the "]" above closes a literal that starts outside this
# view — presumably the `files` list used below; confirm.
# file = "results/mapping/SRX4108929.1.control.final.bam"

# Collect, per file, element [1] of what openBam returns when called with
# returnStats=True — presumably the mapped-read count; verify against
# bamHandler.openBam.
mappedReads = []
for file in files:
    mappedReads.append(
        bamHandler.openBam(file, returnStats=True, nThreads=nThreads)[1])

# Scale factors relative to the smallest value: the minimum gets 1.0.
sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min(
) / sizeFactorBasedOnMappedReads

# Counter over 10000 sampled bins of 50 bp; it is only constructed here,
# run() is not called in the visible code.
cr = countR.CountReadsPerBin(files, binLength=50, numberOfSamples=10000, extendReads=False, numberOfProcessors=nThreads)

# Chromosome (name, length) pairs common to all files; the lengths are
# summed into genomeSize.
chromsizes, non_common = deeptools.utilities.getCommonChrNames(
    [bamHandler.openBam(file) for file in files])
chrNames, chrLengths = list(zip(*chromsizes))
genomeSize = sum(chrLengths)

# `file` is whatever the loop above left behind, i.e. the last entry of
# `files`.
bam = bamHandler.openBam(file)
# Print the unmapped flag of every read fetched from this fixed window.
for read in bam.fetch("chr3", 9998999, 9999999):
    print(read.is_unmapped)
def get_coverage(self, **kwargs):
    """
    Retrieve coverage for each region specified in a bed file using
    deeptools' CountReadsPerBin.

    Parameters
    ----------
    **kwargs
        Options forwarded to ``CountReadsPerBin``. Recognized names and
        the defaults used when a name is absent are listed in
        ``defaults`` below; any other keyword is ignored.
        ``out_file_for_raw_data`` names the file the raw counts are
        written to. When it is not given, a temporary file
        ``tmp_counts.count`` is used and removed afterwards.

    Returns
    -------
    pandas.DataFrame
        The raw counts table, with columns ``chr``, ``start``, ``end``
        followed by one column per bam file, named after the last
        ``/``-separated component of its path.
    """
    bamFilesList = _get_all_bams(self.extended_yml)

    keep_raw_counts = 'out_file_for_raw_data' in kwargs
    out_file_for_raw_data_tmp = kwargs.get('out_file_for_raw_data',
                                           "tmp_counts.count")

    # Default for every option that may be overridden through kwargs.
    defaults = {
        'binLength': 50,
        'numberOfSamples': None,
        'numberOfProcessors': 5,
        'verbose': False,
        'region': None,
        'bedFile': None,
        'extendReads': False,
        'genomeChunkSize': None,
        'blackListFileName': None,
        'minMappingQuality': None,
        'ignoreDuplicates': False,
        'chrsToSkip': [],
        'stepSize': None,
        'center_read': False,
        'samFlag_include': None,
        'samFlag_exclude': None,
        'zerosToNans': False,
        'skipZeroOverZero': False,
        'smoothLength': 0,
        'minFragmentLength': 0,
        'maxFragmentLength': 0,
        'bed_and_bin': False,
        'statsList': [],
        'mappedList': [],
    }
    options = {name: kwargs.get(name, default)
               for name, default in defaults.items()}

    cr = crpb.CountReadsPerBin(bamFilesList,
                               out_file_for_raw_data=out_file_for_raw_data_tmp,
                               **options)

    col_names = ["chr", "start", "end"] + \
        [sample.split("/")[-1] for sample in bamFilesList]

    try:
        # The counts are read back from the raw data file that run()
        # writes; its return value is not needed.
        cr.run()
        sequencing_depth_df = pd.read_csv(out_file_for_raw_data_tmp,
                                          sep="\t", header=None)
    finally:
        # Remove the temporary file even when counting or parsing failed.
        if not keep_raw_counts and os.path.exists(out_file_for_raw_data_tmp):
            os.remove(out_file_for_raw_data_tmp)

    sequencing_depth_df.columns = col_names
    return sequencing_depth_df
# default=1 keeps numberOfProcessors an int when -p is omitted.
# NOTE(review): a None value is assumed to be unsuitable for
# CountReadsPerBin — confirm.
parser.add_argument("-p", "--processors", help="number of processors",
                    type=int, default=1)
args = parser.parse_args()

# Do not calculate if the bedfile is empty
with open(args.bed, 'r') as f:
    num_lines = sum(1 for _ in f)

# Calculate Reads in bam file
if num_lines > 0:
    cr = crpb.CountReadsPerBin([args.bam],
                               bedFile=args.bed,
                               numberOfProcessors=args.processors)
    reads_at_peaks = cr.run()

    # Calculate total number of reads in peaks
    total = reads_at_peaks.sum(axis=0)

    # Calculate % of fragments in peaks; the bam handle is closed as soon
    # as the mapped-read count has been read.
    with pysam.AlignmentFile(args.bam) as bam:
        frip = float(total[0]) / bam.mapped
    print(str(frip * 100))
else:
    print('0')
def main(args=None):
    """
    Plot the per-base coverage distribution of each BAM file.

    Coverage is counted with a bin length of 1. The left panel shows the
    fraction of bases per coverage value, the right panel the fraction of
    bases with at least that coverage. Raw counts are optionally written
    as a tab-separated table.

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.
    """
    args = process_args(args)

    counter = countR.CountReadsPerBin(
        args.bamfiles,
        binLength=1,
        numberOfSamples=args.numberOfSamples,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
    )
    num_reads_per_bin = counter.run()

    bins_found = num_reads_per_bin.shape[0]
    sys.stderr.write("Number of non zero bins used: {}\n".format(bins_found))

    if bins_found < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this region is covered by reads.\n")

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    if args.outRawCounts:
        out = args.outRawCounts
        out.write("'" + "'\t'".join(args.labels) + "'\n")
        row_format = "\t".join(['%d'] * num_reads_per_bin.shape[1]) + "\n"
        for row in num_reads_per_bin:
            out.write(row_format % tuple(row))

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(args.plotTitle)

    # x-axis limit: largest sample mean plus three times the largest std
    sample_mean = num_reads_per_bin.mean(axis=0)
    largest_std = max(num_reads_per_bin.std(axis=0))
    x_limit = max(sample_mean) + 3 * largest_std

    n_rows = num_reads_per_bin.shape[0]
    for idx, col in enumerate(num_reads_per_bin.T):
        histogram = np.bincount(col.astype(int))

        # coverage distribution
        legend_text = "{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx])
        axs[0].plot(histogram.astype(float) / n_rows, label=legend_text)

        # reverse cumulative distribution
        csum = histogram[::-1].cumsum()
        axs[1].plot(csum.astype(float)[::-1] / csum.max(), label=args.labels[idx])

    y_labels = ('fraction of bases sampled',
                'fraction of bases sampled >= coverage')
    for ax, y_label in zip(axs, y_labels):
        ax.set_xlim(0, x_limit)
        ax.set_xlabel('coverage')
        ax.set_ylabel(y_label)
        ax.legend()

    plt.savefig(args.plotFile.name, format=args.plotFileFormat)
def main(args=None):
    """
    Plot the cumulative read-count "fingerprint" of each BAM file.

    Reads are counted in sampled bins, and for every sample the sorted,
    cumulative counts (normalized to 1) are plotted against the normalized
    bin rank. Optionally the raw per-bin counts and a table of quality
    metrics (AUC, X-intercept, elbow point and, with ``--JSDsample``, the
    Jensen-Shannon and CHANCE statistics) are written.

    Parameters
    ----------
    args : list, optional
        Command-line style arguments, handed to ``process_args``.

    Notes
    -----
    Exits with status 1 when no reads at all were found in the sampled
    bins.
    """
    import sys

    args = process_args(args)

    cr = countR.CountReadsPerBin(
        args.bamfiles,
        args.binSize,
        args.numberOfSamples,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        region=args.region,
        extendReads=args.extendReads,
        minMappingQuality=args.minMappingQuality,
        ignoreDuplicates=args.ignoreDuplicates,
        center_read=args.centerReads,
        samFlag_include=args.samFlagInclude,
        samFlag_exclude=args.samFlagExclude,
        minFragmentLength=args.minFragmentLength,
        maxFragmentLength=args.maxFragmentLength)

    num_reads_per_bin = cr.run()
    if num_reads_per_bin.sum() == 0:
        sys.stderr.write(
            "\nNo reads were found in {} regions sampled. Check that the\n"
            "min mapping quality is not overly high and that the \n"
            "chromosome names between bam files are consistant.\n"
            "\n".format(num_reads_per_bin.shape[0]))
        sys.exit(1)

    if args.skipZeros:
        num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

    total = len(num_reads_per_bin[:, 0])
    x = np.arange(total).astype('float') / total  # normalize from 0 to 1

    # matplotlib won't iterate through line styles by itself. Only valid
    # line styles are listed: "." is a marker, not a line style.
    pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
    for i, reads in enumerate(num_reads_per_bin.T):
        count = np.cumsum(np.sort(reads))
        count = count / count[-1]  # to normalize y from 0 to 1
        # Cycle through the styles when there are more samples than styles.
        j = i % len(pyplot_line_styles)
        plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])

    # Axis labels are the same for every curve, so set them once.
    plt.xlabel('rank')
    plt.ylabel('fraction w.r.t. bin with highest coverage')
    plt.legend(loc='upper left')
    plt.suptitle(args.plotTitle)

    # Set plotFileFormat explicitly to None so that the format is taken
    # from the file extension.
    if not args.plotFileFormat:
        args.plotFileFormat = None

    plt.savefig(args.plotFile.name, bbox_inches=0, format=args.plotFileFormat)
    plt.close()

    if args.outRawCounts:
        args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
        for row in num_reads_per_bin:
            args.outRawCounts.write(fmt % tuple(row))
        args.outRawCounts.close()

    if args.outQualityMetrics:
        args.outQualityMetrics.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
        if args.JSDsample:
            args.outQualityMetrics.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
        args.outQualityMetrics.write("\n")

        # Diagonal from 0 to 1, used as the reference for the elbow point.
        line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
        for idx, reads in enumerate(num_reads_per_bin.T):
            counts = np.cumsum(np.sort(reads))
            counts = counts / float(counts[-1])
            AUC = np.sum(counts) / float(len(counts))
            # Rank fraction at which the cumulative curve first leaves zero.
            XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
            # Rank fraction where the curve is furthest below the diagonal.
            elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
            expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
            args.outQualityMetrics.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
            if args.JSDsample:
                JSD = getJSD(args, idx, num_reads_per_bin)
                syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                CHANCE = getCHANCE(args, idx, num_reads_per_bin)
                args.outQualityMetrics.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
            args.outQualityMetrics.write("\n")
        args.outQualityMetrics.close()