def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                     args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))
            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads
        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile /
        # (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))

    return scale_factor
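# A worked example of the 1x-normalization arithmetic above (a sketch with
# made-up numbers; one_x_scale_factor is our illustrative name, not a deepTools
# function). Coverage = mapped reads * fragment length / effective genome size,
# and the scale factor is its reciprocal so the average coverage becomes 1x.
def one_x_scale_factor(bam_mapped, fragment_length, effective_genome_size):
    current_coverage = float(bam_mapped * fragment_length) / effective_genome_size
    return 1.0 / current_coverage

# e.g. 20e6 mapped reads, 200 bp fragments, 2.15e9 bp effective genome:
# coverage is about 1.86x, so the scale factor is 0.5375.
assert abs(one_x_scale_factor(20e6, 200, 2.15e9) - 0.5375) < 1e-4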
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    func_args = {'scaleFactor': get_scale_factor(args)}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            exit("*Error*: For the --MNase function a paired-end library is required.")

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            verbose=args.verbose,
                            )
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
def main(args=None):
    args = parse_arguments().parse_args(args)

    fragment_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)
    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")
        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))
        print("\nFragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))

    if args.histogram:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        plt.hist(fragment_len_dict['lengths'], 50,
                 range=(fragment_len_dict['min'], fragment_len_dict['mean'] * 2),
                 normed=True)
        plt.xlabel('Fragment Length')
        plt.ylabel('Frequency')
        plt.title(args.plotTitle)
        plt.savefig(args.histogram, bbox_inches=0)
        plt.close()
def getFragSize(bam, args):
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam,
                                                                    return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)
    print("\n\nBAM file : {}".format(bam))
    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")
        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))
        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    return fragment_len_dict
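# A sketch of how the summary statistics printed above can be derived from a
# list of sampled lengths (summarize_lengths is our own helper; the real
# internals of get_read_and_fragment_length may differ). Assumes numpy.
import numpy as np

def summarize_lengths(lengths):
    lengths = np.asarray(lengths)
    return {'sample_size': len(lengths),
            'min': lengths.min(),
            'qtile25': np.percentile(lengths, 25),
            'mean': lengths.mean(),
            'median': np.median(lengths),
            'qtile75': np.percentile(lengths, 75),
            'max': lengths.max(),
            'std': lengths.std()}

# summarize_lengths([180, 195, 200, 210, 240]) gives median 200, mean 205, ...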
def get_scale_factors(args):
    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("%s regions of size %s were used " %
                      (scalefactors_dict['sites_sampled'], args.sampleLength))
                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print("Size factors using total number "
                      "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then the normalization factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile, bamindex,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired-end data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile / (total reads (in millions) * tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    if args.normalizeTo1x or args.normalizeUsingRPKM:
        # if a normalization is required then compute the scale factors
        scale_factor = get_scale_factor(args)
    else:
        scale_factor = args.scaleFactor
    func_args = {'scaleFactor': scale_factor}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNase function a paired-end library is required.")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            verbose=args.verbose,
                            )
    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit("*Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")

        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    elif args.filterRNAstrand:
        wr = filterRnaStrand([args.bam],
                             binLength=args.binSize,
                             stepSize=args.binSize,
                             region=args.region,
                             numberOfProcessors=args.numberOfProcessors,
                             extendReads=args.extendReads,
                             minMappingQuality=args.minMappingQuality,
                             ignoreDuplicates=args.ignoreDuplicates,
                             center_read=args.centerReads,
                             zerosToNans=args.skipNonCoveredRegions,
                             samFlag_include=args.samFlagInclude,
                             samFlag_exclude=args.samFlagExclude,
                             minFragmentLength=args.minFragmentLength,
                             maxFragmentLength=args.maxFragmentLength,
                             verbose=args.verbose,
                             )
        wr.filter_strand = args.filterRNAstrand
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args)
    if args.normalizeTo1x:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x\n")
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))
            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads
        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")
        # the RPKM is the # reads per tile /
        # (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))
    else:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: depth\n")
        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
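# A worked RPKM example for the branch above (a sketch; rpkm_scale_factor is
# our illustrative name). With 20e6 mapped reads and 50 bp bins:
# 1 / (20 * 0.05) = 1.0, i.e. per-bin read counts happen to be left unscaled.
def rpkm_scale_factor(bam_mapped, bin_size):
    million_reads_mapped = float(bam_mapped) / 1e6
    tile_len_in_kb = float(bin_size) / 1000
    return 1.0 / (million_reads_mapped * tile_len_in_kb)

assert abs(rpkm_scale_factor(20e6, 50) - 1.0) < 1e-12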
def __init__(self, bamFilesList, binLength=50, numberOfSamples=None,
             numberOfProcessors=1, verbose=False, region=None,
             bedFile=None, extendReads=False, blackListFileName=None,
             minMappingQuality=None, ignoreDuplicates=False,
             chrsToSkip=[], stepSize=None, center_read=False,
             samFlag_include=None, samFlag_exclude=None, zerosToNans=False,
             skipZeroOverZero=False, smoothLength=0, minFragmentLength=0,
             maxFragmentLength=0, out_file_for_raw_data=None, bed_and_bin=False,
             statsList=[], mappedList=[]):
    self.bamFilesList = bamFilesList
    self.binLength = binLength
    self.numberOfSamples = numberOfSamples
    self.blackListFileName = blackListFileName
    self.statsList = statsList
    self.mappedList = mappedList
    self.skipZeroOverZero = skipZeroOverZero
    self.bed_and_bin = bed_and_bin

    if extendReads and len(bamFilesList):
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
                                                                    return_lengths=False,
                                                                    blackListFileName=blackListFileName,
                                                                    numberOfProcessors=numberOfProcessors,
                                                                    verbose=verbose)
        if extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                self.defaultFragmentLength = int(frag_len_dict['median'])
            else:
                exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {}".format(frag_len_dict['median']))
        elif extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            self.defaultFragmentLength = 'read length'
        elif extendReads > 2000:
            exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(extendReads))
        else:
            self.defaultFragmentLength = int(extendReads)
    else:
        self.defaultFragmentLength = 'read length'

    self.numberOfProcessors = numberOfProcessors
    self.verbose = verbose
    self.region = region
    self.bedFile = bedFile
    self.minMappingQuality = minMappingQuality
    self.ignoreDuplicates = ignoreDuplicates
    self.chrsToSkip = chrsToSkip
    self.stepSize = stepSize
    self.center_read = center_read
    self.samFlag_include = samFlag_include
    self.samFlag_exclude = samFlag_exclude
    self.minFragmentLength = minFragmentLength
    self.maxFragmentLength = maxFragmentLength
    self.zerosToNans = zerosToNans
    self.smoothLength = smoothLength

    if out_file_for_raw_data:
        self.save_data = True
        self.out_file_for_raw_data = out_file_for_raw_data
    else:
        self.save_data = False
        self.out_file_for_raw_data = None

    # check that either numberOfSamples or stepSize is set
    if numberOfSamples is None and stepSize is None and bedFile is None:
        raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")

    if self.defaultFragmentLength != 'read length':
        self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
    else:
        self.maxPairedFragmentLength = 1000
    if self.maxFragmentLength > 0:
        self.maxPairedFragmentLength = self.maxFragmentLength

    if len(self.mappedList) == 0:
        try:
            for fname in self.bamFilesList:
                bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True,
                                                                  nThreads=self.numberOfProcessors)
                self.mappedList.append(mapped)
                self.statsList.append(stats)
                bam.close()
        except:
            self.mappedList = []
            self.statsList = []
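# The paired-fragment length cap above, pulled out for illustration (our own
# helper, not a deepTools function): paired-end fragments longer than this
# cap are treated as outliers unless an explicit maxFragmentLength is given.
def max_paired_fragment_length(default_fragment_length, max_fragment_length=0):
    if default_fragment_length != 'read length':
        cap = 4 * default_fragment_length
    else:
        cap = 1000
    if max_fragment_length > 0:
        cap = max_fragment_length
    return cap

assert max_paired_fragment_length(200) == 800
assert max_paired_fragment_length('read length') == 1000
assert max_paired_fragment_length(200, 500) == 500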
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
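# A worked example of the Poisson read-count cutoffs above, with made-up
# numbers: 2e8 reads over a 2e9 bp effective genome (0.1 reads/bp) and a
# 200 bp median fragment length give an expected 20 reads per fragment window.
from scipy.stats import poisson

reads_per_bp = 2e8 / 2e9
median_fragment_len = 200
p = 1.0 / 5e7  # confidence_p_value, assuming a sampleSize of 5e7

# isf() returns the count exceeded with probability p under a 4x-inflated rate;
# ppf() returns the count under-run with probability p under a 4x-deflated rate.
max_reads = poisson(4 * reads_per_bp * median_fragment_len).isf(p)
min_reads = poisson(0.25 * reads_per_bp * median_fragment_len).ppf(p)
print(max_reads, min_reads)  # roughly 135 and 0 for these numbers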
def get_scale_factors(args):
    if args.ratio == 'subtract':
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s were used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
            bam1.close()
            bam2.close()
    elif args.scaleFactorsMethod == 'readCount':
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
        mapped_reads = [bam1_mapped, bam2_mapped]
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then the normalization factor for A to report RPKM read counts
        # is also applied to B.
        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM
        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                            return_lengths=False,
                                                                            blackListFileName=args.blackListFileName,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired-end data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile / (total reads (in millions) * tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
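# A worked example of the readCount size factors above: both samples are
# scaled down to the smaller library (a sketch; the numbers are made up).
import numpy as np

bam1_mapped, bam2_mapped = 30e6, 15e6
scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
print(scale_factors)  # [0.5 1.0]: sample 2 is the unscaled reference,
                      # which is why scale_factors[0] == 1 identifies sample 1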
def get_scale_factor(args, stats):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats)
    if args.normalizeUsing == 'RPGC':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))
            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads
        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.effectiveGenomeSize
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsing == 'RPKM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile /
        # (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))

    elif args.normalizeUsing == 'CPM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: CPM\n")

        # the CPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        million_reads_mapped = float(bam_mapped) / 1e6
        scale_factor *= 1.0 / (million_reads_mapped)
        if debug:
            print("scale factor using CPM is {0}".format(args.scaleFactor))

    elif args.normalizeUsing == 'BPM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: BPM\n")
        # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        # sampled_bins_sum = getSampledSum(args.bam)
        tile_len_in_kb = float(args.binSize) / 1000
        tpm_scaleFactor = (bam_mapped / tile_len_in_kb) / 1e6
        scale_factor *= 1 / (tpm_scaleFactor * tile_len_in_kb)
        if debug:
            print("scale factor using BPM is {0}".format(args.scaleFactor))

    else:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n")
        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
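# A worked CPM example for the branch above (a sketch; cpm_scale_factor is our
# illustrative name): with 25e6 post-filtering reads, every bin count is
# multiplied by 1/25 = 0.04 so values are reported per million mapped reads.
def cpm_scale_factor(bam_mapped):
    return 1.0 / (float(bam_mapped) / 1e6)

assert abs(cpm_scale_factor(25e6) - 0.04) < 1e-12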
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    func_args = {'scaleFactor': get_scale_factor(args)}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            exit("*Error*: For the --MNase function a paired-end library is required.")

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            verbose=args.verbose,
                            )
    elif args.filterRNAstrand:
        wr = filterRnaStrand([args.bam],
                             binLength=args.binSize,
                             stepSize=args.binSize,
                             region=args.region,
                             numberOfProcessors=args.numberOfProcessors,
                             extendReads=args.extendReads,
                             minMappingQuality=args.minMappingQuality,
                             ignoreDuplicates=args.ignoreDuplicates,
                             center_read=args.centerReads,
                             zerosToNans=args.skipNonCoveredRegions,
                             samFlag_include=args.samFlagInclude,
                             samFlag_exclude=args.samFlagExclude,
                             verbose=args.verbose,
                             )
        wr.filter_strand = args.filterRNAstrand
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor
    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNase function a paired-end library is required.")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )
    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit("*Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")

        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
def getFragSize(bam, args, idx, outRawFrags):
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam,
                                                                    return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)
    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]
        if fragment_len_dict:
            fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
            cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
            cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)
        # use a separate loop variable so that `idx` still indexes the sample below
        for length, v in enumerate(cnts):
            if v > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(length, v, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")
        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))
        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
        print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\n"
              "Len. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'],
                                                                                              fragment_len_dict['qtile10'],
                                                                                              fragment_len_dict['qtile20'],
                                                                                              fragment_len_dict['qtile30'],
                                                                                              fragment_len_dict['qtile40'],
                                                                                              fragment_len_dict['qtile60'],
                                                                                              fragment_len_dict['qtile70'],
                                                                                              fragment_len_dict['qtile80'],
                                                                                              fragment_len_dict['qtile90'],
                                                                                              fragment_len_dict['qtile99']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\n"
          "Len. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'],
                                                                                          read_len_dict['qtile10'],
                                                                                          read_len_dict['qtile20'],
                                                                                          read_len_dict['qtile30'],
                                                                                          read_len_dict['qtile40'],
                                                                                          read_len_dict['qtile60'],
                                                                                          read_len_dict['qtile70'],
                                                                                          read_len_dict['qtile80'],
                                                                                          read_len_dict['qtile90'],
                                                                                          read_len_dict['qtile99']))

    # The read and fragment lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)
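# np.bincount as used above: index i of the result is the number of fragments
# of length i, so writing only the non-zero entries yields a sparse length
# histogram (made-up lengths for illustration).
import numpy as np

lengths = [180, 200, 200, 210]
cnts = np.bincount(lengths, minlength=max(lengths) + 1)
assert cnts[200] == 2 and cnts[199] == 0 and len(cnts) == 211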
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()]

    # use poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        if args.plotFileFormat == "plotly":
            plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize)
        else:
            plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
def main(args=None):
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
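# How the per-chunk mapReduce results are merged above, with made-up data:
# each chunk contributes per-sample feature dicts (x[0]) and per-sample read
# totals (x[2]), which are summed feature-by-feature across chunks.
res = [([{'exon': 3, 'intron': 1}], None, [10]),
       ([{'exon': 2, 'intron': 4}], None, [15])]
featureCounts = [{'exon': 0, 'intron': 0}]
totalCounts = [0]
for x in res:
    for i, y in enumerate(x[2]):
        totalCounts[i] += y
    for i, y in enumerate(x[0]):
        for k, v in y.items():
            featureCounts[i][k] += v
assert featureCounts == [{'exon': 5, 'intron': 5}] and totalCounts == [25]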
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args)
    if args.normalizeTo1x:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.normalizeTo1x))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))
            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be at least 1. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads
        else:
            # set the read length as the fragment length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile /
        # (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))
    else:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: depth\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
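# Worked example (hypothetical numbers, not deepTools code) of the two
# normalizations computed above. For 1x coverage: 50e6 kept reads with a
# 200 bp fragment length over a 2e9 bp effective genome give a current
# coverage of 50e6 * 200 / 2e9 = 5x, so the scale factor is 1 / 5 = 0.2.
bam_mapped = 50e6
fragment_length = 200
effective_genome_size = 2e9  # what args.normalizeTo1x holds
print(1.0 / (bam_mapped * fragment_length / effective_genome_size))  # 0.2

# For RPKM with the same 50e6 reads and 50 bp bins:
# 1 / (50 million reads * 0.05 kb) = 0.4
bin_size = 50
print(1.0 / ((bam_mapped / 1e6) * (bin_size / 1000.0)))  # 0.4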
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if
    # --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that the library is paired-end,
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNase function a paired-end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit("*Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
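# A rough sketch (an assumption, not the actual OffsetFragment implementation)
# of the 1-based offset semantics validated above: positive offsets count from
# the 5' end, negative offsets from the 3' end, 0 is invalid, and [1, -1]
# therefore spans the entire alignment. offsets_to_slice is a hypothetical
# helper written only to make the convention concrete.
def offsets_to_slice(offset, read_length):
    left, right = offset
    left = left - 1 if left > 0 else read_length + left
    right = right if right > 0 else read_length + right + 1
    return slice(left, right)

positions = list(range(100))  # a hypothetical 100 bp alignment
assert positions[offsets_to_slice([1, -1], 100)] == positions  # whole read
assert positions[offsets_to_slice([1, 1], 100)] == [0]         # 5'-most base only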
def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
             verbose=False, region=None,
             bedFile=None, extendReads=False,
             blackListFileName=None,
             minMappingQuality=None,
             ignoreDuplicates=False,
             chrsToSkip=[],
             stepSize=None,
             center_read=False,
             samFlag_include=None,
             samFlag_exclude=None,
             zerosToNans=False,
             smoothLength=0,
             minFragmentLength=0,
             maxFragmentLength=0,
             out_file_for_raw_data=None):

    self.bamFilesList = bamFilesList
    self.binLength = binLength
    self.numberOfSamples = numberOfSamples
    self.blackListFileName = blackListFileName

    if extendReads and len(bamFilesList):
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
                                                                    return_lengths=False,
                                                                    blackListFileName=blackListFileName,
                                                                    numberOfProcessors=numberOfProcessors,
                                                                    verbose=verbose)
        if extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                self.defaultFragmentLength = int(frag_len_dict['median'])
            else:
                exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {}".format(frag_len_dict['median']))
        elif extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            self.defaultFragmentLength = 'read length'
        elif extendReads > 2000:
            exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(extendReads))
        else:
            self.defaultFragmentLength = int(extendReads)
    else:
        self.defaultFragmentLength = 'read length'

    self.numberOfProcessors = numberOfProcessors
    self.verbose = verbose
    self.region = region
    self.bedFile = bedFile
    self.minMappingQuality = minMappingQuality
    self.ignoreDuplicates = ignoreDuplicates
    self.chrsToSkip = chrsToSkip
    self.stepSize = stepSize
    self.center_read = center_read
    self.samFlag_include = samFlag_include
    self.samFlag_exclude = samFlag_exclude
    self.minFragmentLength = minFragmentLength
    self.maxFragmentLength = maxFragmentLength
    self.zerosToNans = zerosToNans
    self.smoothLength = smoothLength

    if out_file_for_raw_data:
        self.save_data = True
        self.out_file_for_raw_data = out_file_for_raw_data
    else:
        self.save_data = False
        self.out_file_for_raw_data = None

    # check that either numberOfSamples or stepSize is set
    if numberOfSamples is None and stepSize is None and bedFile is None:
        raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")

    if self.defaultFragmentLength != 'read length':
        self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
    else:
        self.maxPairedFragmentLength = 1000
    if self.maxFragmentLength > 0:
        self.maxPairedFragmentLength = self.maxFragmentLength
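# Quick illustration (made-up values, not deepTools code) of the
# maxPairedFragmentLength logic at the end of __init__: 4x the default
# fragment length when one is known, 1000 otherwise, and an explicit
# maxFragmentLength always wins.
def max_paired_fragment_length(defaultFragmentLength, maxFragmentLength=0):
    if defaultFragmentLength != 'read length':
        result = 4 * defaultFragmentLength
    else:
        result = 1000
    if maxFragmentLength > 0:
        result = maxFragmentLength
    return result

print(max_paired_fragment_length(200))            # 800
print(max_paired_fragment_length('read length'))  # 1000
print(max_paired_fragment_length(200, 550))       # 550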
def main(args=None):
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey, it's not empty
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
def get_scale_factors(args):
    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = [float(x) for x in args.scaleFactors.split(":")]
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("{} regions of size {} were used".format(scalefactors_dict['sites_sampled'],
                                                               args.sampleLength))
                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print("Size factors using total number "
                      "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # is normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize-to-1x uses that sample
        # as the reference. Since the other sample is scaled to match the
        # un-scaled one, the normalization factor for both samples should
        # be based on the unscaled one. For example, if sample A is unscaled
        # and sample B is scaled by 0.5, then the normalization factor for A
        # to report RPKM read counts is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired-end data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be at least 1. Value given: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set the read length as the fragment length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default, normalize using RPKM:
                # num reads per tile / (total reads (in millions) * tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def getFragSize(bam, args, idx, outRawFrags):
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam,
                                                                    return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)
    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]

        if fragment_len_dict:
            fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
            cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
            cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)

        # don't reuse idx as the loop variable; it is needed below to index samplesLabel
        for length, count in enumerate(cnts):
            if count > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(length, count, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")

        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))
        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
        print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\n"
              "Len. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\n"
              "Len. 99%: {}\n".format(fragment_len_dict['mad'],
                                      fragment_len_dict['qtile10'],
                                      fragment_len_dict['qtile20'],
                                      fragment_len_dict['qtile30'],
                                      fragment_len_dict['qtile40'],
                                      fragment_len_dict['qtile60'],
                                      fragment_len_dict['qtile70'],
                                      fragment_len_dict['qtile80'],
                                      fragment_len_dict['qtile90'],
                                      fragment_len_dict['qtile99']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\n"
          "Len. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\n"
          "Len. 99%: {}\n".format(read_len_dict['mad'],
                                  read_len_dict['qtile10'],
                                  read_len_dict['qtile20'],
                                  read_len_dict['qtile30'],
                                  read_len_dict['qtile40'],
                                  read_len_dict['qtile60'],
                                  read_len_dict['qtile70'],
                                  read_len_dict['qtile80'],
                                  read_len_dict['qtile90'],
                                  read_len_dict['qtile99']))

    # The read and fragment length lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)
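# Small sketch (fabricated lengths, not deepTools code) of the np.bincount
# step used above to build the raw fragment-length table: the array index is
# the length and the value is how often that length was observed.
import numpy as np

lengths = [180, 200, 200, 210]
cnts = np.bincount(lengths, minlength=max(lengths) + 1)
for length, count in enumerate(cnts):
    if count > 0:
        print("{}\t{}\t{}".format(length, count, "sample1"))
# 180  1  sample1
# 200  2  sample1
# 210  1  sample1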