def get_num_kept_reads(args, stats):
    """
    Subtracts from the total number of mapped reads in a bamfile
    the proportion of reads that fall into blacklisted regions
    or that are filtered

    :return: tuple of (number of kept reads, total number of mapped reads)
    """
    if stats is None:
        bam_handle, mapped, unmapped, stats = bamHandler.openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    else:
        bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads = bam_mapped_total - blacklisted
    else:
        num_kept_reads = bam_mapped_total
    ftk = fraction_kept(args, stats)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
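# Illustrative sketch (not part of deepTools): how the value returned above
# combines blacklist subtraction with the sampled filtering fraction.
# All numbers below are made up.
def _sketch_num_kept_reads():
    bam_mapped_total = 10000000  # all mapped alignments
    blacklisted = 250000         # alignments fully inside blacklist regions
    ftk = 0.92                   # fraction kept, as fraction_kept() would estimate
    num_kept_reads = (bam_mapped_total - blacklisted) * ftk
    return num_kept_reads, bam_mapped_total  # (8970000.0, 10000000)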
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                     args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))
            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads
        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile /
        #    (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))

    return scale_factor
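# Illustrative sketch (not part of deepTools): the two normalizations above
# reduce to small formulas. Variable names mirror get_scale_factor(), but the
# values are hypothetical.
def _sketch_scale_factors():
    bam_mapped = 20000000               # mapped reads after blacklist subtraction
    fragment_length = 200
    effective_genome_size = 2150570000  # e.g. a --normalizeTo1x value for GRCh37
    bin_size = 50

    # 1x (RPGC): scale so that the mean genome coverage becomes 1
    current_coverage = float(bam_mapped * fragment_length) / effective_genome_size
    rpgc_scale_factor = 1.0 / current_coverage  # ~0.538 for these numbers

    # RPKM: reads per bin / (millions of mapped reads * bin length in kb)
    rpkm_scale_factor = 1.0 / ((bam_mapped / 1e6) * (bin_size / 1000.0))  # 1.0 here
    return rpgc_scale_factor, rpkm_scale_factor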
def __init__(self):
    import os
    self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.mappability = self.root + "mappability.bw"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'
    bam = bamHandler.openBam(self.bamFile)
    tbit = py2bit.open(self.tbitFile)
    global debug
    debug = 0
    global global_vars
    global_vars = {'2bit': self.tbitFile,
                   'bam': self.bamFile,
                   'filter_out': None,
                   'mappability': self.mappability,
                   'extra_sampling_file': None,
                   'max_reads': 5,
                   'min_reads': 0,
                   'reads_per_bp': 0.3,
                   'total_reads': bam.mapped,
                   'genome_size': sum(tbit.chroms().values())
                   }
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if region extends over the chromosome end
        if tbit[chromNameBit].size < i + regionSize:
            break

        try:
            gc = getGC_content(tbit[chromNameBit].get(i, i + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
def __init__(self):
    import os
    self.root = os.path.dirname(
        os.path.abspath(__file__)) + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.mappability = self.root + "mappability.bw"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'
    self.samtools = cfg.config.get('external_tools', 'samtools')
    bam = bamHandler.openBam(self.bamFile)
    bit = twobit.TwoBitFile(open(self.tbitFile))
    global debug
    debug = 0

    global global_vars
    global_vars = {'2bit': self.tbitFile,
                   'bam': self.bamFile,
                   'filter_out': None,
                   'mappability': self.mappability,
                   'extra_sampling_file': None,
                   'max_reads': 5,
                   'min_reads': 0,
                   'reads_per_bp': 0.3,
                   'total_reads': bam.mapped,
                   'genome_size': sum([bit[x].size for x in bit.index])
                   }
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if region extends over the chromosome end
        if tbit.chroms(chromNameBit) < i + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
def testTabulateGCcontent(self):
    fragmentLength = {'median': 10}
    chrNameBitToBam = {'chr2L': '2L'}
    stepSize = 1
    bam = bamHandler.openBam(global_vars['bam'])
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    return (fragmentLength, chrNameBitToBam, stepSize, chromSizes, 1)
def testCountReadsPerGC(self):
    regionSize = 300
    chrNameBitToBam = {'chr2L': '2L'}
    stepSize = 1
    bam = bamHandler.openBam(global_vars['bam'])
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    return (regionSize, chrNameBitToBam, stepSize, chromSizes, 1)
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Blacklisted regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% or 100,000 alignments,
    whichever is smaller; if there are fewer than 100,000 alignments, we
    sample everything).

    The sampling works by dividing the genome into bins and only looking at
    the first 50000 bases. If this doesn't yield sufficient alignments then
    the bin size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0,
                               min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins //= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
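# Illustrative sketch (not part of deepTools): the sampling target used by
# fraction_kept() is easy to misread, so here it is evaluated on three
# hypothetical BAM sizes.
def _sketch_num_needed_to_sample(bam_mapped):
    return max(bam_mapped if bam_mapped <= 100000 else 0,
               min(100000, 0.01 * bam_mapped))

# _sketch_num_needed_to_sample(50000)    -> 50000   (few alignments: sample all)
# _sketch_num_needed_to_sample(5000000)  -> 50000.0 (1% is below the 100k cap)
# _sketch_num_needed_to_sample(50000000) -> 100000  (1% exceeds the cap)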
def bam_blacklisted_worker(args):
    bam, chrom, start, end = args
    fh = openBam(bam)
    blacklisted = 0
    for r in fh.fetch(reference=chrom, start=start, end=end):
        if r.reference_start >= start and r.reference_start + r.infer_query_length(always=False) - 1 <= end:
            blacklisted += 1
    fh.close()
    return blacklisted
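# Illustrative sketch (not part of deepTools): one way a caller might drive
# bam_blacklisted_worker over blacklist intervals. The BAM path, the interval
# list and the pool size are all hypothetical.
def _sketch_count_blacklisted(bam_path="sample.bam"):
    import multiprocessing
    intervals = [("chr1", 1000000, 1005000), ("chr2", 500000, 512000)]
    arglists = [(bam_path, chrom, s, e) for chrom, s, e in intervals]
    with multiprocessing.Pool(4) as pool:
        counts = pool.map(bam_blacklisted_worker, arglists)
    return sum(counts)  # alignments falling entirely within blacklist regions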
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)
        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    for f in args.bamfiles:
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)
        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            if args.minFragmentLength > 0 and abs(read.template_length) < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and abs(read.template_length) > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features
def getRead(self, readType):
    """ prepare arguments for test
    """
    bam = bamHandler.openBam(self.bamFile_PE)
    if readType == 'paired-reverse':
        read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0]
    elif readType == 'single-forward':
        read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0]
    elif readType == 'single-reverse':
        read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0]
    else:  # by default a forward paired read is returned
        read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0]
    return read
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where the first column is fragment length and the
        second is read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try without filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end) if not r.is_unmapped])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where the first column is fragment length and the
        second is read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try without filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end)])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
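# Illustrative sketch (not part of deepTools): both worker variants return an
# N x 2 array of (fragment length, read length); medians are then taken
# column-wise. The array below is fabricated.
def _sketch_fragment_medians():
    import numpy as np
    reads = np.array([[180, 50], [210, 50], [195, 50], [205, 50]])
    frag_len_median = np.median(reads[:, 0])  # 200.0
    read_len_median = np.median(reads[:, 1])  # 50.0
    return frag_len_median, read_len_median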
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("%s regions of size %s were used " %
                      (scalefactors_dict['sites_sampled'], args.sampleLength))
                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(
                    float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print("Size factors using total number "
                      "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then the normalization factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired-end data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def run(self, allArgs=None):
    bamFilesHandles = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except:
            y = pyBigWig.open(x)
        bamFilesHandles.append(y)

    chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromsizes))
    genomeSize = sum(chrLengths)

    if self.bedFile is None:
        chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
    else:
        chunkSize = None
    [bam_h.close() for bam_h in bamFilesHandles]

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromsizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        ofile = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, ofile)
                _foo.close()
                os.remove(tempFileName)

        ofile.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def openBam(bamFile, bamIndex=None):
    return bamHandler.openBam(bamFile, bamIndex)
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use the poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
def main(args=None):
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty
    if args.attributeKey and args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
def writeBedGraph_worker(
        chrom, start, end, tileSize, defaultFragmentLength,
        bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
        missingDataAsZero=False, fixed_step=False):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs and tileSize.
    """
    if start > end:
        raise NameError("start position ({0}) bigger than "
                        "end position ({1})".format(start, end))

    coverage = []
    for indexFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bamHandle = bamHandler.openBam(indexFile)
            coverage.append(getCoverageFromBam(
                bamHandle, chrom, start, end, tileSize,
                defaultFragmentLength, extendPairedEnds, True))
            bamHandle.close()
        elif fileFormat == 'bigwig':
            bigwigHandle = pyBigWig.open(indexFile)
            coverage.append(
                getCoverageFromBigwig(
                    bigwigHandle, chrom, start, end,
                    tileSize, missingDataAsZero))
            bigwigHandle.close()

    # is /dev/shm available?
    # working in this directory speeds the process
    try:
        _file = tempfile.NamedTemporaryFile(mode="w", dir="/dev/shm", delete=False)
    except OSError:
        _file = tempfile.NamedTemporaryFile(mode="w", delete=False)

    previousValue = None
    lengthCoverage = len(coverage[0])
    for tileIndex in range(lengthCoverage):
        tileCoverage = []
        for index in range(len(bamOrBwFileList)):
            if smoothLength > 0:
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage.append(
                    np.mean(coverage[index][vectorStart:vectorEnd]))
            else:
                try:
                    tileCoverage.append(coverage[index][tileIndex])
                except IndexError:
                    print("Chromosome {} probably not in one of the bigwig "
                          "files. Remove this chromosome from the bigwig file "
                          "to continue".format(chrom))
                    exit(0)

#        if zerosToNans == True and sum(tileCoverage) == 0.0:
#            continue

        value = func(tileCoverage, funcArgs)

        if fixed_step:
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            try:
                _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                    writeEnd, value))
            except TypeError:
                _file.write("{}\t{}\t{}\t{}\n".format(chrom, writeStart,
                                                      writeEnd, value))
        else:
            if previousValue is None:
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                writeEnd = min(writeEnd + tileSize, end)

            elif previousValue != value:
                if not np.isnan(previousValue):
                    _file.write(
                        "%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

    if not fixed_step:
        # write remaining value if not a nan
        if previousValue and writeStart != end and not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previousValue))

    tempFileName = _file.name
    _file.close()
    return tempFileName
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        defaultFragmentLength, normalizationLength,
                        avg_method='median', numberOfProcessors=1,
                        verbose=False, chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collects and integrates the results.

    The arguments are:
        'bamFilesList', list of bam files to normalize
        'binLength', the window size in bp, where reads are going to be
                     counted.
        'numberOfSamples', Number of sites to sample.
        'defaultFragmentLength', if the reads are not paired, this value
                     is used to extend the reads.
        'normalizationLength', length, in bp, to normalize the data.
                     For a value of 1, the factors are computed such that
                     on average 1 fragment per base pair is found
        'avg_method', defines how the different values are to be summarized.
                     The options are 'mean' and 'median'
        'chrsToSkip', name of the chromosomes to be excluded from the
                     scale estimation. Usually the chrX is included.

    For example, to test about 1 million regions of length 500 bp,
    the binLength will be 500 and the numberOfSamples is going to be
    the size of the genome divided by the 1 million. This number is
    not exact because regions in which all counts are 0 are not taken
    into account.

    The test data contains reads for 200 bp.

    >>> test = Tester()
    >>> dict = estimateScaleFactor([test.bamFile1, test.bamFile2], 50, 4, 0, 1)
    >>> dict['size_factors']
    array([ 1. ,  0.5])
    >>> dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    if len(bamFilesList) > 2:
        raise NameError("SES scale factors are only defined for 2 files")

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    num_reads_per_bin = getNumReadsPerBin(
        bamFilesList, binLength, numberOfSamples,
        defaultFragmentLength,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose,
        chrsToSkip=chrsToSkip)

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # np.savetxt("/home/ramirez/tmp/test.num_reads", num_reads_per_bin)
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in the Diaz paper
    # p refers to ChIP, q to input
    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])

    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]

    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # the maxIndex may be too close to the signal regions
    # so I take a more conservative approach by taking a close number
    sizeFactorsSES = cumSum.min() / cumSum

    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
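# Illustrative sketch (not part of deepTools): the core of the SES statistic
# is a maximum gap between two normalized cumulative sums. The per-bin counts
# below are fabricated, and the 20% rank back-off used above is omitted.
def _sketch_ses():
    import numpy as np
    chip = np.array([0, 1, 1, 2, 2, 3, 20, 50])  # a few enriched bins
    inpt = np.array([1, 1, 2, 2, 2, 3, 3, 4])    # roughly flat background
    p = np.sort(chip).cumsum()
    q = np.sort(inpt).cumsum()
    diff = np.abs(p / p[-1] - q / q[-1])
    maxIndex = np.flatnonzero(diff == diff.max())[0]  # 5 for these numbers
    cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
    return cumSum.min() / cumSum  # SES-style size factors, [1.0, ~0.818] here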
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of list of tuples of the form (start, end)
        corresponding to bed regions to be processed.
        If no bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    numpy array
        The result is a numpy array that has as rows each bin
        and as columns each bam file.

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 1.,  1.],
           [ 1.,  2.]])
    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    if self.stepSize is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handlers = []
    for fname in self.bamFilesList:
        try:
            bam_handlers.append(bamHandler.openBam(fname))
        except:
            bam_handlers.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples
    transcriptsToConsider = []
    if bed_regions_list is not None:
        transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handlers:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region(bam, chrom, trans)
            if bed_regions_list is not None:
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted regions),
                            # there are bins smaller than the bin size. Counts there
                            # are added to the bin before them, but range() will
                            # still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
def run(self, allArgs=None):
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files; if too long, some processors end up free.
    # The following values are empirical.
    bamFilesHandlers = []
    for x in self.bamFilesList:
        try:
            y = bamHandler.openBam(x)
        except:
            y = pyBigWig.open(x)
        bamFilesHandlers.append(y)

    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromSizes))
    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size, based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # number of samples is better if large
    if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

    max_mapped = []
    for x in bamFilesHandlers:
        try:
            max_mapped.append(x.mapped)
        except:
            # bigWig, use a fixed value
            max_mapped.append(0)
    max_mapped = max(max_mapped)

    # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
    if max_mapped == 0:
        chunkSize = 10000 * self.binLength
        self.stepSize = self.binLength
    else:
        reads_per_bp = float(max_mapped) / genomeSize
        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    # Ensure that chunkSize is always at least self.stepSize
    if chunkSize < self.stepSize:
        chunkSize = self.stepSize

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        ofile = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, ofile)
                _foo.close()
                os.remove(tempFileName)

        ofile.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)

        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds,
                # otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
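# Illustrative sketch (not part of deepTools): the max_dup_gc threshold above
# is just a binomial tail cutoff per GC bin. The F_gc/N_gc tallies below are
# fabricated.
def _sketch_max_dup_gc():
    from scipy.stats import binom
    F_gc = [0, 120, 4500, 900]      # fragments observed per GC bin
    N_gc = [0, 4000, 60000, 30000]  # genomic positions sampled per GC bin
    # read count exceeded by chance with probability < 1e-7; bins without
    # data fall back to a threshold of 1
    return [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
            if F_gc[x] > 0 and N_gc[x] > 0 else 1
            for x in range(len(F_gc))]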
def getFiltered_worker(arglist):
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)

        prev_pos = set()
        lpos = None

        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds,
                # otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
               internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
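# Illustrative sketch (not part of deepTools): the strand-filter bit masks in
# getFiltered_worker (144, 128, 96, 64, 16) decoded. SAM flag bits: 16 = read
# reverse, 32 = mate reverse, 64 = first in pair, 128 = second in pair, so
# 144 = 128|16 and 96 = 64|32.
def _sketch_is_forward_fragment(flag):
    # mirrors the paired-end 'forward' branch: a second mate on the reverse
    # strand, or a first mate whose mate is on the reverse strand
    return flag & 144 == 128 or flag & 96 == 64

# Typical properly-paired flag values:
# _sketch_is_forward_fragment(99)  -> False   _sketch_is_forward_fragment(83)  -> True
# _sketch_is_forward_fragment(147) -> False   _sketch_is_forward_fragment(163) -> True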
def main(args=None): args = parseArguments().parse_args(args) if not args.sampleLabels and args.smartLabels: args.sampleLabels = smartLabels(args.bamfiles) if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles): sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n") sys.exit(1) if args.outFile is None: of = sys.stdout else: of = open(args.outFile, "w") bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles] mapped = [x[1] for x in bhs] unmappedList = [x[2] for x in bhs] bhs = [x[0] for x in bhs] # Get the reads in blacklisted regions if args.blackListFileName: blacklisted = [] for bh in bhs: blacklisted.append(utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)) else: blacklisted = [0] * len(bhs) # Get the total and mapped reads total = [x + y for x, y in list(zip(mapped, unmappedList))] chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths)) for x in bhs: x.close() # Get the remaining metrics res = mapReduce([args], getFiltered_worker, chrom_sizes, genomeChunkLength=args.binSize + args.distanceBetweenBins, blackListFileName=args.blackListFileName, numberOfProcessors=args.numberOfProcessors, verbose=args.verbose) totals = [0] * len(args.bamfiles) nFiltered = [0] * len(args.bamfiles) MAPQs = [0] * len(args.bamfiles) flagIncludes = [0] * len(args.bamfiles) flagExcludes = [0] * len(args.bamfiles) internalDupes = [0] * len(args.bamfiles) externalDupes = [0] * len(args.bamfiles) singletons = [0] * len(args.bamfiles) rnaStrand = [0] * len(args.bamfiles) for x in res: for idx, r in enumerate(x): totals[idx] += r[0] nFiltered[idx] += r[1] MAPQs[idx] += r[2] flagIncludes[idx] += r[3] flagExcludes[idx] += r[4] internalDupes[idx] += r[5] externalDupes[idx] += r[6] singletons[idx] += r[7] rnaStrand[idx] += r[8] # Print some output of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n") for idx, _ in enumerate(args.bamfiles): if args.sampleLabels: of.write(args.sampleLabels[idx]) else: of.write(args.bamfiles[idx]) of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx])) # nFiltered metric = 0.0 if totals[idx] > 0: metric = blacklisted[idx] + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # MAPQ metric = 0.0 if totals[idx] > 0: metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # samFlagInclude metric = 0.0 if totals[idx] > 0: metric = float(flagIncludes[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # samFlagExclude metric = 0.0 if totals[idx] > 0: metric = float(flagExcludes[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # Internally determined duplicates metric = 0.0 if totals[idx] > 0: metric = float(internalDupes[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # Externally marked duplicates metric = 0.0 if totals[idx] > 0: metric = float(externalDupes[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # Singletons metric = 0.0 if totals[idx] > 0: metric = float(singletons[idx]) / float(totals[idx]) * 
mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # filterRNAstrand metric = 0.0 if totals[idx] > 0: metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx] of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) of.write("\n") if args.outFile is not None: of.close() return 0
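# The per-filter numbers printed by main() above are extrapolations: each
# worker only inspects sampled bins, so a raw filter count is rescaled by
# (count / totals) * mapped and capped at the number of mapped reads.
# Made-up numbers for illustration:
sampled_total = 50000      # alignments inspected by the workers
below_mapq = 5000          # of those, how many failed --minMappingQuality
mapped_reads = 2000000     # mapped reads in the whole BAM

estimate = float(below_mapq) / sampled_total * mapped_reads
print(min(round(estimate, 1), mapped_reads))   # 200000.0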
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False):
    r"""
    Given a list of bamfiles, a function and function arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.
    """
    bamHandlers = [bamHandler.openBam(indexedFile) for
                   indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "bigwig files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bigwigs[0], size, bw))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print("output file: %s" % outputFileName)
        os.remove(bedGraphFile)
def main(args=None):
    args = process_args(args)
    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if
    # --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that the library is paired-end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNase function a paired-end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit("*Error*: The right-side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
def getFractionKept_worker(chrom, start, end, bamFile, args): """ Queries the BAM file and counts the number of alignments kept/found in the first 50000 bases. """ bam = bamHandler.openBam(bamFile) end = min(end, start + 50000) tot = 0 filtered = 0 prev_start_pos = None # to store the start positions if chrom in bam.references: for read in bam.fetch(chrom, start, end): tot += 1 if args.minMappingQuality and read.mapq < args.minMappingQuality: filtered += 1 continue # filter reads based on SAM flag if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: filtered += 1 continue if args.samFlagExclude and read.flag & args.samFlagExclude != 0: filtered += 1 continue # fragment length filtering tLen = utilities.getTLen(read) if args.minFragmentLength > 0 and tLen < args.minFragmentLength: filtered += 1 continue if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength: filtered += 1 continue # get rid of duplicate reads that have same position on each of the # pairs if args.ignoreDuplicates and prev_start_pos \ and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse): filtered += 1 continue prev_start_pos = (read.reference_start, read.pnext, read.is_reverse) # If filterRNAstrand is in args, then filter accordingly # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class if hasattr(args, "filterRNAstrand"): if read.is_paired: if args.filterRNAstrand == 'forward': if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)): filtered += 1 continue elif args.filterRNAstrand == 'reverse': if not (read.flag & 144 == 144 or read.flag & 96 == 96): filtered += 1 continue else: if args.filterRNAstrand == 'forward' and read.flag & 16 == 0: filtered += 1 continue elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16: filtered += 1 continue return (filtered, tot)
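# getFractionKept_worker (like getFiltered_worker above) selects
# strand-specific reads with bit masks on the SAM flag: 16 = read reverse,
# 32 = mate reverse, 64 = first in pair, 128 = second in pair. A small
# decoding example showing why "flag & 144 == 128" means
# "second mate, mapped forward" (the flag values below are standard SAM,
# the helper name is illustrative):
READ_REVERSE, MATE_REVERSE, FIRST_IN_PAIR, SECOND_IN_PAIR = 16, 32, 64, 128

def is_forward_fragment(flag):
    """The paired-end 'forward' test used here: either the second mate
    maps forward, or the first mate maps with a forward partner."""
    return flag & (SECOND_IN_PAIR | READ_REVERSE) == SECOND_IN_PAIR or \
        flag & (FIRST_IN_PAIR | MATE_REVERSE) == FIRST_IN_PAIR

print(is_forward_fragment(163))  # paired, proper, mate reverse, second in pair -> True
print(is_forward_fragment(99))   # paired, proper, mate reverse, first in pair -> False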
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are: [1, 4, 10, 10, 16, 18]
    which correspond to a GC of [1, 1, 1, 1, 2, 1]

    The evaluated positions are [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    the corresponding GC is [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for the extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]
    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # If the sample regions are far apart from each
    # other, it is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, it is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions lying close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finished caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    c = 1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if not x.is_reverse and x.pos == i])
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / (endTime - countTime),
                       chromNameBit, start, end, stepSize))
        c += 1

    if verbose:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
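# The N_gc/F_gc tallies above are the raw material for the GC-correction
# ratio. A minimal sketch of that downstream step (the helper name and the
# smoothing-free formula are illustrative assumptions, not the exact
# deepTools implementation): following Benjamini & Speed (2012), the factor
# for a GC bin is the observed read fraction over the expected genomic
# fraction.
import numpy as np

def ratio_observed_to_expected(F_gc, N_gc):
    """Hypothetical helper: per-GC-bin observed/expected read ratio."""
    F_gc = np.asarray(F_gc, dtype=float)
    N_gc = np.asarray(N_gc, dtype=float)
    scaling = F_gc.sum() / N_gc.sum()   # global reads per sampled position
    R_gc = np.ones_like(F_gc)           # bins with no data stay neutral
    ok = (N_gc > 0) & (F_gc > 0)
    R_gc[ok] = (F_gc[ok] / N_gc[ok]) / scaling
    return R_gc

# e.g. with the doctest values above:
# ratio_observed_to_expected([0, 4, 1, 0], [0, 4, 5, 1]) -> [1., 2., 0.4, 1.]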
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and function arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}
    out_file_name : str
        name of the file to save the resulting data.
    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    self.__dict__["smoothLength"] = smoothLength
    bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    for x in self.__dict__.keys():
        sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # concatenate intermediary bedgraph files
    out_file = open(out_file_name + ".bg", 'wb')
    for tempfilename in res:
        if tempfilename:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempfilename, 'rb'), out_file)
            os.remove(tempfilename)

    bedgraph_file = out_file.name
    out_file.close()
    if format == 'bedgraph':
        os.rename(bedgraph_file, out_file_name)
        if self.verbose:
            print("output file: {}".format(out_file_name))
    else:
        bedGraphToBigWig(
            chrom_names_and_size, bedgraph_file, out_file_name, True)
        if self.verbose:
            print("output file: {}".format(out_file_name))
        os.remove(bedgraph_file)
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()
    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate the GC content of the read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            print(detail)
            # this exception happens when the end of a
            # chromosome is reached
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) reads "
                  "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                      i, i / (endTime - startTime),
                                      chrNameBit, start, end))
    except NameError:
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1

        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this
        # must be fixed!
        # It turns out that the "with_value_type" option only started working
        # in pysam-0.8.4, so we can't reliably add tags on earlier versions
        # without potentially creating BAM files that break HTSJDK/IGV/etc.
        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # the check is on the type field of the first tag,
                # i.e. readTag[0][2], not readTag[2]
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
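# writeCorrectedSam_worker relies on numCopiesOfRead() to turn a fractional
# correction weight (1/R_gc) into an integer number of read copies. That
# helper is not shown in this excerpt; a minimal sketch of the stochastic
# rounding it presumably performs (this implementation is an assumption,
# not the verbatim deepTools code):
import numpy as np

def num_copies_of_read_sketch(value):
    """Hypothetical stochastic rounding: a weight of e.g. 1.7 always yields
    1 copy, plus a second copy with probability 0.7, so that the expected
    number of copies equals the weight."""
    copies = int(value)                      # guaranteed copies
    if np.random.rand() < value - copies:    # fractional remainder
        copies += 1
    return copies

# E.g. over many reads with weight 0.5, about half get 0 copies (dropped)
# and half get 1 copy, halving the coverage on average.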
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', blackListFileName=None,
                        numberOfProcessors=1,
                        verbose=False, chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collects and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        name of the chromosomes to be excluded from the scale
        estimation. Usually chrX is included in this list.
    blackListFileName : str
        BED file containing blacklisted regions

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'

    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples, 1)
    >>> _dict['size_factors']
    array([ 1. ,  0.5])
    >>> _dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()

    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in the Diaz paper
    # p refers to ChIP, q to input
    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])

    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]

    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)

    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non-zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # the maxIndex may be too close to the signal regions,
    # so a more conservative estimate is taken at a nearby lower rank
    sizeFactorsSES = cumSum.min() / cumSum

    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90th
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
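# A tiny worked example of the SES rank statistic used above: sort the
# per-bin counts of ChIP (p) and input (q), take normalized cumulative
# sums, and find the rank where they diverge the most. The values are made
# up for illustration.
import numpy as np

chip = np.array([0, 1, 1, 2, 10])   # one enriched bin dominates the tail
inpt = np.array([1, 1, 2, 2, 2])

p = np.sort(chip).cumsum() / float(chip.sum())   # [0., 0.071, 0.143, 0.286, 1.]
q = np.sort(inpt).cumsum() / float(inpt.sum())   # [0.125, 0.25, 0.5, 0.75, 1.]

max_rank = np.argmax(np.abs(p - q))              # rank 3: background/signal split
print(max_rank, p[max_rank], q[max_rank])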
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome into fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)
        print("using region {}".format(args.region))

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "tasks".format(args.numberOfProcessors, len(mp_args)))
            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
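# The max_dup_gc computation above uses the binomial survival function to
# set a per-GC duplicate threshold: given F_gc[x] reads drawn over N_gc[x]
# sampled positions, how high can a pile-up at one position get before it is
# too improbable (p < 1e-7) to be chance? A worked call with made-up
# numbers for one GC bin:
from scipy.stats import binom

F, N = 50000.0, 200000.0          # reads and sampled positions for one GC bin
threshold = binom.isf(1e-7, F, 1.0 / N)
print(threshold)
# Reads stacking higher than this at a single position are treated as PCR
# duplicates by the correction workers (global_vars['max_dup_gc'][gc]).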
def __init__(self, bamFilesList, binLength=50, numberOfSamples=None,
             numberOfProcessors=1,
             verbose=False, region=None,
             bedFile=None, extendReads=False,
             blackListFileName=None,
             minMappingQuality=None,
             ignoreDuplicates=False,
             chrsToSkip=[],
             stepSize=None,
             center_read=False,
             samFlag_include=None,
             samFlag_exclude=None,
             zerosToNans=False,
             skipZeroOverZero=False,
             smoothLength=0,
             minFragmentLength=0,
             maxFragmentLength=0,
             out_file_for_raw_data=None,
             statsList=[],
             mappedList=[]):

    self.bamFilesList = bamFilesList
    self.binLength = binLength
    self.numberOfSamples = numberOfSamples
    self.blackListFileName = blackListFileName
    self.statsList = statsList
    self.mappedList = mappedList
    self.skipZeroOverZero = skipZeroOverZero

    if extendReads and len(bamFilesList):
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
                                                                    return_lengths=False,
                                                                    blackListFileName=blackListFileName,
                                                                    numberOfProcessors=numberOfProcessors,
                                                                    verbose=verbose)
        if extendReads is True:
            # try to guess the fragment length if the bam file contains paired end reads
            if frag_len_dict:
                self.defaultFragmentLength = int(frag_len_dict['median'])
            else:
                exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {}".format(frag_len_dict['median']))
        elif extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            self.defaultFragmentLength = 'read length'
        elif extendReads > 2000:
            exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(extendReads))
        else:
            self.defaultFragmentLength = int(extendReads)
    else:
        self.defaultFragmentLength = 'read length'

    self.numberOfProcessors = numberOfProcessors
    self.verbose = verbose
    self.region = region
    self.bedFile = bedFile
    self.minMappingQuality = minMappingQuality
    self.ignoreDuplicates = ignoreDuplicates
    self.chrsToSkip = chrsToSkip
    self.stepSize = stepSize
    self.center_read = center_read
    self.samFlag_include = samFlag_include
    self.samFlag_exclude = samFlag_exclude
    self.minFragmentLength = minFragmentLength
    self.maxFragmentLength = maxFragmentLength
    self.zerosToNans = zerosToNans
    self.smoothLength = smoothLength

    if out_file_for_raw_data:
        self.save_data = True
        self.out_file_for_raw_data = out_file_for_raw_data
    else:
        self.save_data = False
        self.out_file_for_raw_data = None

    # check that either numberOfSamples, stepSize or bedFile is set
    if numberOfSamples is None and stepSize is None and bedFile is None:
        raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")

    if self.defaultFragmentLength != 'read length':
        self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
    else:
        self.maxPairedFragmentLength = 1000
    if self.maxFragmentLength > 0:
        self.maxPairedFragmentLength = self.maxFragmentLength

    if len(self.mappedList) == 0:
        try:
            for fname in self.bamFilesList:
                bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
                self.mappedList.append(mapped)
                self.statsList.append(stats)
                bam.close()
        except:
            self.mappedList = []
            self.statsList = []
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of list of tuples of the form (start, end)
        corresponding to bed regions to be processed.
        If no bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    numpy array
        The result is a numpy array with one row per bin
        and one column per bam file.

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[0., 0.],
           [0., 1.],
           [1., 1.],
           [1., 2.]])

    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    if self.stepSize is None and bed_regions_list is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handles = []
    for fname in self.bamFilesList:
        try:
            bam_handles.append(bamHandler.openBam(fname))
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except:
            bam_handles.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples
    transcriptsToConsider = []
    if bed_regions_list is not None:
        transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handles:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region(bam, chrom, trans)
            if bed_regions_list is not None:
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted
                            # regions), there are bins smaller than the bin
                            # size. Counts there are added to the bin before
                            # them, but range() will still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
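# count_reads_in_region() collects the per-bin counts of all bam files in
# one flat list (file 1's bins first, then file 2's, ...) and un-interleaves
# them with a Fortran-order reshape. A two-file, three-bin illustration:
import numpy as np

flat = np.array([1., 2., 3.,    # bins of bam file 1
                 4., 5., 6.])   # bins of bam file 2
counts = flat.reshape(-1, 2, order='F')
print(counts)
# [[1. 4.]
#  [2. 5.]
#  [3. 6.]]   -> rows are bins, columns are bam files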
def writeBedGraph_worker(self, chrom, start, end, func_to_call,
                         func_args, bed_regions_list=None):
    r"""Writes a bedgraph based on the read coverage found on bamFiles

    The given func is called to compute the desired bedgraph value
    using the funcArgs

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`.
    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        If no bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    temporary file with the bedgraph results for the region queried.

    Examples
    --------
    >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
    >>> bamFile1 = test_path + "testA.bam"
    >>> bin_length = 50
    >>> number_of_samples = 0 # overruled by step_size
    >>> func_to_call = scaleCoverage
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
    >>> tempFile = c.writeBedGraph_worker('3R', 0, 200, func_to_call, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))
    coverage = []
    bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]
    for bam in bam_handlers:
        coverage.append(self.get_coverage_of_region(bam, chrom, start, end, self.binLength))
        bam.close()

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    previous_value = None

    length_coverage = len(coverage[0])
    for tileIndex in range(length_coverage):
        tileCoverage = []
        for index in range(len(self.bamFilesList)):
            if self.smoothLength > 0:
                vector_start, vector_end = self.getSmoothRange(tileIndex,
                                                               self.binLength,
                                                               self.smoothLength,
                                                               length_coverage)
                tileCoverage.append(np.mean(coverage[index][vector_start:vector_end]))
            else:
                tileCoverage.append(coverage[index][tileIndex])

        value = func_to_call(tileCoverage, func_args)

        # uncomment these lines if a fixed-step bedgraph is wanted:
        # if not np.isnan(value):
        #     writeStart = start + tileIndex * self.binLength
        #     writeEnd = min(writeStart + self.binLength, end)
        #     _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, writeEnd, value))

        if previous_value is None:
            writeStart = start + tileIndex * self.binLength
            writeEnd = min(writeStart + self.binLength, end)
            previous_value = value
        elif previous_value == value:
            writeEnd = min(writeEnd + self.binLength, end)
        elif previous_value != value:
            if not np.isnan(previous_value):
                _file.write("{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart, writeEnd, previous_value))
            previous_value = value
            writeStart = writeEnd
            writeEnd = min(writeStart + self.binLength, end)

    # write the remaining value if it is not a nan
    if previous_value and writeStart != end and not np.isnan(previous_value):
        _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart, end, previous_value))

    tempfilename = _file.name
    _file.close()
    return tempfilename
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc. values
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    distanceBetweenBins *= 2
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
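# A hedged usage sketch for get_read_and_fragment_length(): the file name is
# a placeholder, and the printed fields follow the dictionary keys built
# above ('median', 'sample_size', ...).
if __name__ == "__main__":
    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        "example.bam",              # placeholder path, not a bundled file
        return_lengths=False,
        numberOfProcessors=4)
    if frag_len_dict is None:
        print("single-end library: no fragment-length estimate available")
    else:
        print("median fragment length: {}".format(frag_len_dict['median']))
    print("median read length: {}".format(read_len_dict['median']))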
def compareSignal(bamFilesList, binLength, numberOfSamples,
                  defaultFragmentLength, outFileName, outFileFormat,
                  outFileNameLambda=None, region=None, extendPairedEnds=True,
                  numberOfProcessors=1, Nsigmas=2, maxSignalRatio=10, verbose=False):
    bam1 = bamHandler.openBam(bamFilesList[0])
    genomeSize = sum(bam1.lengths)

    bam2 = bamHandler.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing the number of reads per bin sampled from the
    #    genome. Only regions for which both samples have non-zero counts
    #    are considered.
    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                                             defaultFragmentLength, numberOfProcessors,
                                             skipZeros=True, verbose=verbose)

    if verbose:
        print("number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0]))

    # 2. get the mean and std of treatment (col 1) and control (col 2)
    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0)  # axis=0: by column
    treatmentStd, controlStd = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate the residual in treatment & control data, at regions for
    #    which the treatment signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the signal > mean-signal,
    #    so the residual signal is positive)
    overRows = np.where(num_reads_per_region[:, 0].copy() >= treatmentMean + treatmentStd * Nsigmas)[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]

    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than "
                        "estimated signal in treatment file {}. Perhaps the "
                        "file names are swapped?".format(bamFilesList[0], bamFilesList[1]))
    else:
        controlSignalRatio = 1.0 / treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    print("Treatment mean: {:.2f}, Treatment total: {:.2f}".format(treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, Control total: {}".format(controlMean, controlTotal))
    print("the ratio of treatment vs. control for enriched regions is: {:.2f}".format(treatmentSignalRatio))
    print("the ratio of treatment vs. control ratio: {:.2f} "
          "(if based on mapped reads: {:.2f})".format(treatmentControlRatio,
                                                      treatmentControlRatioMapped))

    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio}

    writeBedGraph.writeBedGraph(bamFilesList, outFileName, defaultFragmentLength,
                                computePvalue, funcArgs, tileSize=binLength,
                                region=region, format=outFileFormat,
                                zerosToNans=False, numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameLambda:
        writeBedGraph.writeBedGraph(bamFilesList, outFileNameLambda, defaultFragmentLength,
                                    computeLambda, funcArgs, tileSize=binLength,
                                    region=region, format=outFileFormat,
                                    zerosToNans=False, numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
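# The Nsigma selection above, in isolation: keep only bins whose treatment
# count exceeds mean + Nsigmas * std. A toy, self-contained sketch with
# invented counts (nine background bins plus one enriched bin):
import numpy as np

counts = np.array([[10, 9]] * 9 + [[60, 12]], dtype=float)
Nsigmas = 2
treatment_mean = counts[:, 0].mean()  # 15.0
treatment_std = counts[:, 0].std()    # 15.0
over_rows = np.where(counts[:, 0] >= treatment_mean + treatment_std * Nsigmas)[0]
print(over_rows)  # [9] -> only the enriched bin passes the mean + 2*std cutoff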
def writeBedGraph_worker(
        chrom, start, end, tileSize, defaultFragmentLength,
        bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
        missingDataAsZero=False, fixed_step=False):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs
    """
    if start > end:
        raise NameError("start position ({0}) bigger than "
                        "end position ({1})".format(start, end))

    coverage = []
    for indexFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bamHandle = bamHandler.openBam(indexFile)
            coverage.append(getCoverageFromBam(
                bamHandle, chrom, start, end, tileSize,
                defaultFragmentLength, extendPairedEnds, True))
            bamHandle.close()
        elif fileFormat == 'bigwig':
            bigwigHandle = pyBigWig.open(indexFile)
            coverage.append(
                getCoverageFromBigwig(
                    bigwigHandle, chrom, start, end,
                    tileSize, missingDataAsZero))
            bigwigHandle.close()

    # is /dev/shm available?
    # working in this directory speeds up the process
    try:
        _file = tempfile.NamedTemporaryFile(mode='w', dir="/dev/shm", delete=False)
    except OSError:
        _file = tempfile.NamedTemporaryFile(mode='w', delete=False)

    previousValue = None

    lengthCoverage = len(coverage[0])
    for tileIndex in range(lengthCoverage):
        tileCoverage = []
        for index in range(len(bamOrBwFileList)):
            if smoothLength > 0:
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage.append(
                    np.mean(coverage[index][vectorStart:vectorEnd]))
            else:
                try:
                    tileCoverage.append(coverage[index][tileIndex])
                except IndexError:
                    print("Chromosome {} probably not in one of the bigwig "
                          "files. Remove this chromosome from the bigwig file "
                          "to continue".format(chrom))
                    exit(1)

        # if zerosToNans and sum(tileCoverage) == 0.0:
        #     continue

        value = func(tileCoverage, funcArgs)

        if fixed_step:
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            try:
                _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, writeEnd, value))
            except TypeError:
                _file.write("{}\t{}\t{}\t{}\n".format(chrom, writeStart, writeEnd, value))
        else:
            if previousValue is None:
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                writeEnd = min(writeEnd + tileSize, end)

            elif previousValue != value:
                if not np.isnan(previousValue):
                    _file.write(
                        "%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

    if not fixed_step:
        # write the remaining value if it is not a nan
        # (compare against None so that a final value of 0.0 is still written)
        if previousValue is not None and writeStart != end and \
                not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart, end,
                                                previousValue))

    tempFileName = _file.name
    _file.close()
    return tempFileName
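# The tempfile fallback above, standalone: prefer the RAM-backed /dev/shm
# when it exists and fall back to the default temp directory otherwise.
# A minimal sketch (the helper name is ours):
import tempfile


def open_fast_tempfile(suffix='.bg'):
    try:
        return tempfile.NamedTemporaryFile(mode='w', suffix=suffix,
                                           dir='/dev/shm', delete=False)
    except OSError:  # e.g. /dev/shm does not exist on this system
        return tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False)


_f = open_fast_tempfile()
_f.write("chr1\t0\t50\t1.00\n")
_f.close()
print(_f.name)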
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False):
    r"""
    Given a list of bamfiles, a function and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.
    """
    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == "bam"]
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == "bigwig"]
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "The length of chromosome {} differs between "
                              "the bigwig files:\n{} for {}\n{} for {}.\n\n"
                              "The smallest length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bigwigs[0], size, bw))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", "wb")
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, "rb"), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == "bedgraph":
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile,
                         outputFileName, True)
        if debug:
            print("output file: %s" % outputFileName)
        os.remove(bedGraphFile)
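# The func/funcArgs contract used by writeBedGraph above: `func` receives
# the per-file coverage of one tile plus the funcArgs dict and returns a
# single value. A hedged sketch of a log2-ratio function under that
# contract (the name and the pseudocount argument are ours, not a
# deepTools API):
import numpy as np


def log2_ratio(tile_coverage, func_args):
    """tile_coverage[0] is treatment, tile_coverage[1] is control."""
    pseudocount = func_args.get('pseudocount', 1.0)
    return np.log2((tile_coverage[0] + pseudocount) /
                   (tile_coverage[1] + pseudocount))


print(log2_ratio([3.0, 1.0], {'pseudocount': 1.0}))  # 1.0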
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    func_to_call : callable
        function to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

    out_file_name : str
        name of the file to save the resulting data.

    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    self.__dict__["smoothLength"] = smoothLength
    bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    if self.verbose:
        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # concatenate intermediary bedgraph files
    out_file = open(out_file_name + ".bg", 'wb')
    for tempfilename in res:
        if tempfilename:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempfilename, 'rb')
            shutil.copyfileobj(_foo, out_file)
            _foo.close()
            os.remove(tempfilename)

    bedgraph_file = out_file.name
    out_file.close()
    if format == 'bedgraph':
        os.rename(bedgraph_file, out_file_name)
        if self.verbose:
            print("output file: {}".format(out_file_name))
    else:
        bedGraphToBigWig(chrom_names_and_size, bedgraph_file, out_file_name, True)
        if self.verbose:
            print("output file: {}".format(out_file_name))
        os.remove(bedgraph_file)
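# The concatenation step above, standalone: per-chunk bedgraph pieces come
# back from mapReduce in genomic order and are appended to one output file.
# A minimal sketch with toy pieces (the file names are ours):
import os
import shutil
import tempfile

pieces = []
for rows in (["chr1\t0\t50\t1.00\n"], ["chr1\t50\t100\t2.00\n"]):
    tf = tempfile.NamedTemporaryFile(mode='w', suffix='.bg', delete=False)
    tf.writelines(rows)
    tf.close()
    pieces.append(tf.name)

with open('merged.bg', 'wb') as out_file:
    for name in pieces:
        with open(name, 'rb') as piece:
            shutil.copyfileobj(piece, out_file)
        os.remove(name)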
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam, args.bamIndex)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)

    if args.normalizeTo1x:
        # try to guess the fragment length if the bam file contains paired-end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam, args.bamIndex,
                                                                    return_lengths=False,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess the fragment length if the bam file contains paired-end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired-end data "
                          "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be greater than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set the read length as the fragment length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # the RPKM is the number of reads per tile /
        #     (total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))

    return scale_factor
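# The 1x-coverage arithmetic above with concrete numbers; all values are
# invented for illustration:
bam_mapped = 10000000        # mapped reads kept after filtering
fragment_length = 200        # bp
normalizeTo1x = 2000000000   # effective genome size in bp

current_coverage = float(bam_mapped * fragment_length) / normalizeTo1x
scale_factor = 1.0 / current_coverage
print(current_coverage, scale_factor)  # 1.0 1.0

# and the RPKM branch: reads per kilobase of tile per million mapped reads
million_reads_mapped = bam_mapped / 1e6  # 10.0
tile_len_in_kb = 50 / 1000.0             # 50 bp bins -> 0.05 kb
print(1.0 / (million_reads_mapped * tile_len_in_kb))  # 2.0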
def main(args=None):
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of "
                 "BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess the fragment length if the bam file contains paired-end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in range(len(args.bamfiles)):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
        of.close()
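# The percentage written to --outRawCounts, computed standalone on toy
# counts (the feature names and numbers are invented):
featureCounts_toy = [{'exon': 600000, 'intron': 300000}]
totalCounts_toy = [2000000]
for k, v in featureCounts_toy[0].items():
    print("{0}\t{1:5.2f}".format(k, (100.0 * v) / totalCounts_toy[0]))
# exon    30.00
# intron  15.00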