def convertBED(oname, tmpFiles, chromDict):
    """
    Stores results in BEDPE format, which is:
    chromosome frag_leftend frag_rightend

    The fragment ends can be shifted

    oname:     output file name
    tmpFiles:  list of temporary BAM files; each is deleted after conversion
    chromDict: maps reference name -> chromosome length, used to clip fragments
    """
    # "with" guarantees the output handle is closed even if a temporary BAM
    # is truncated/corrupt and raises mid-loop (the original leaked it).
    with open(oname, "w") as ofile:
        for tmpFile in tmpFiles:
            fh = pysam.AlignmentFile(tmpFile)
            for b in fh.fetch(until_eof=True):
                tLen = getTLen(b, notAbs=True)
                # Only the left-most mate has a positive template length, so
                # each fragment is emitted exactly once per pair.
                if tLen > 0:
                    start = b.pos
                    end = start + tLen
                    # Clip fragments extending past the chromosome end.
                    if end > chromDict[b.reference_name]:
                        end = chromDict[b.reference_name]
                    if end - start < 1:
                        continue
                    ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end))
            fh.close()
            os.unlink(tmpFile)
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    arglist: (chrom, start, end, args, defaultFragmentLength) tuple.
    Returns (olist, gtf.features, total) where olist holds one
    feature->count dict per BAM file and total holds the per-file number
    of reads that survived filtering.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    # Interval tree of features to count overlaps against.
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        # One counter per feature label, reset for each BAM file.
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        # The BAM may use a different chromosome naming scheme (e.g. "chr1" vs "1").
        chrom = mungeChromosome(chrom, fh.references)
        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                # flag 4 == unmapped
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            # Fragment-length filters based on the (absolute) template length.
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            # Duplicate filter: compares only against the immediately preceding
            # read, so it relies on coordinate-sorted input.
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)

    return olist, gtf.features, total
def shiftRead(b, chromDict, args):
    """
    Return a new AlignedSegment whose endpoints are moved according to the
    four offsets in args.shift, or None when the read is not a proper pair
    or the shifted interval collapses after clipping.

    NOTE(review): the exact meaning of the four args.shift entries (which
    applies to which strand/mate combination) is assumed from the branch
    structure below -- confirm against the CLI documentation for --shift.
    """
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    # Which offset applies depends on strand and mate number; deltaTLen is
    # the resulting change in template length.
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check
    # Enforce a minimum 1 bp span, anchored at the strand-appropriate end,
    # then clip to [0, chromosome length].
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start), )  # Returned cigar is only matches
    # Sign of tLen is preserved; only its magnitude is adjusted.
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        # Keep the recorded mate start consistent with the shift the mate
        # itself will receive.
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]
    return b2
def shiftRead(b, chromDict, args):
    """
    Build a copy of a properly-paired read with its endpoints displaced by
    the offsets in ``args.shift``.

    Returns None when the read is not a proper pair, or when the adjusted
    interval vanishes after clipping to the chromosome bounds.
    """
    if not b.is_proper_pair:
        return None

    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end

    # Pick the endpoint offset and template-length delta by (strand, mate).
    if b.is_reverse:
        if b.is_read2:
            end += args.shift[1]
            deltaTLen = args.shift[1] - args.shift[0]
        else:
            end -= args.shift[2]
            deltaTLen = args.shift[3] - args.shift[2]
    else:
        if b.is_read2:
            start -= args.shift[3]
            deltaTLen = args.shift[3] - args.shift[2]
        else:
            start += args.shift[0]
            deltaTLen = args.shift[1] - args.shift[0]

    # Guarantee at least a 1 bp span, anchored at the strand-appropriate end.
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    start = max(start, 0)
    chromLen = chromDict[b.reference_name]
    if end > chromLen:
        end = chromLen
    if end - start < 1:
        return None

    # Assemble the replacement alignment from scratch.
    shifted = pysam.AlignedSegment()
    shifted.query_name = b.query_name
    shifted.flag = b.flag
    shifted.reference_id = b.reference_id
    shifted.reference_start = start
    shifted.mapping_quality = b.mapping_quality
    shifted.cigar = ((0, end - start),)  # Returned cigar is only matches
    shifted.template_length = tLen - deltaTLen if tLen < 0 else tLen + deltaTLen
    shifted.next_reference_id = b.next_reference_id
    shifted.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        if shifted.is_read2 and shifted.is_reverse:
            shifted.next_reference_start += args.shift[0]
        elif not shifted.is_read2 and shifted.is_reverse:
            shifted.next_reference_start -= args.shift[3]
    return shifted
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    arglist: (chrom, start, end, args, defaultFragmentLength) tuple.
    Returns (olist, gtf.features, total): a per-BAM list of
    feature->count dicts and the per-BAM count of reads kept.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    # Feature tree used for the overlap queries below.
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        # Fresh feature counters for each BAM file.
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        # Reconcile chromosome naming between the request and this BAM.
        chrom = mungeChromosome(chrom, fh.references)
        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                # flag 4 == unmapped
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            # Fragment-length filters on the (absolute) template length.
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            # Duplicate filter: compares only with the previous read, so it
            # assumes coordinate-sorted input.
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(
                chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)

    return olist, gtf.features, total
def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested. Note that reads are
    NOT extended, due to there being a 0 length input list of BAM files!

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([4., 5.])

    In the following case the reads length is 50. Reads are not extended.

    >>> c.extendReads=False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([2., 4., 4.])

    Parameters: bamHandle is either a pysam BAM handle or a pyBigWig handle
    (the bigWig case is detected at runtime, see below); regions is a list
    of (start, end) or (start, end, tileSize) tuples on `chrom`.
    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    # Total number of output bins: one per region, or (length // tileSize)
    # per region when a tile size is given.
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        # Fetch window is widened by the extension so reads starting just
        # outside the region but extending into it are still seen.
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklist are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        try:
            # BAM input
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))
        except Exception:
            # bigWig input, as used by plotFingerprint. A bigWig handle has
            # no .references attribute, so the try block raises and we land
            # here. (Narrowed from a bare "except:" so Ctrl-C still works.)
            if bamHandle.chroms(chrom):
                # np.float was removed in NumPy 1.24; np.float64 is the
                # equivalent concrete dtype.
                _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                _[np.isnan(_)] = 0.0
                _ = _ * tileSize
                coverages += _
                continue
            else:
                raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

        prev_pos = set()
        lpos = None  # of previous processed read pair
        for read in bamHandle.fetch(chrom, regStart, regEnd):
            if read.is_unmapped:
                continue
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                # prev_pos caches the fragments seen at the current start
                # coordinate; it is cleared whenever the start advances.
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being
                # evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue
                # Clip the fragment to the region/vector bounds.
                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if eIdx >= len(coverages):
                    eIdx = len(coverages) - 1
                # Blocks of one read must not double count a bin.
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)
                if sIdx >= eIdx:
                    continue

                # First bin gets only the covered portion; interior bins get
                # the full tileSize; the last bin gets the remainder.
                if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                    _ = fragmentEnd - fragmentStart
                else:
                    _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                if _ > tileSize:
                    _ = tileSize
                coverages[sIdx] += _
                _ = sIdx + 1
                while _ < eIdx:
                    coverages[_] += tileSize
                    _ += 1
                while eIdx - sIdx >= nRegBins:
                    eIdx -= 1
                if eIdx > sIdx:
                    _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                    if _ > tileSize:
                        _ = tileSize
                    elif _ < 0:
                        _ = 0
                    coverages[eIdx] += _
                last_eIdx = eIdx
            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    arglist: (chrom, start, end, args, defaultFragmentLength) tuple.
    Returns (olist, gtf.features, total).
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        # NOTE(review): `gtf` is not defined in this function -- presumably it
        # is a module-level global initialized by the parent process before the
        # workers fork; confirm against the caller.
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        # Reconcile chromosome naming between the request and this BAM.
        chrom = mungeChromosome(chrom, fh.references)
        # Duplicate-detection state: fragments seen at the current start
        # coordinate (lpos); assumes coordinate-sorted input.
        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                # flag 4 == unmapped
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            # Fragment-length filters on the (absolute) template length.
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(
                chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)

    return olist, gtf.features, total
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.

    Returns (filtered, tot): the number of reads rejected by the filters and
    the total number of reads inspected in the window.
    """
    bam = bamHandler.openBam(bamFile)
    # Inspect at most 50 kb so the estimate stays cheap.
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    prev_start_pos = None  # to store the start positions
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            # (compares only the immediately preceding read; assumes
            # coordinate-sorted input)
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                filtered += 1
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        # keep mate2-forward / mate1-with-reverse-mate reads
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
def filterWorker(arglist):
    """
    Worker that applies all read filters over one region and writes the
    surviving reads (and optionally the filtered-out reads) to temporary
    BAM files.

    arglist: (chrom, start, end, args, chromDict) tuple.
    Returns (tid, start, total, nFiltered, oname, onameFiltered) so the
    caller can sort and merge the per-region temporary files.
    """
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)

    # 'wbu' = uncompressed BAM; these are short-lived temporary files.
    mode = 'wbu'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    # Duplicate-detection state: fragments seen at the current start
    # coordinate; assumes coordinate-sorted input.
    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        # Fragment-length filters on the (absolute) template length.
        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            # shiftRead may return None when the shifted interval vanishes.
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.

    arglist: (chrom, start, end, args, defaultFragmentLength) tuple.
    Returns (olist, gtf.features, total).
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        # NOTE(review): `gtf` is not defined locally -- presumably a module
        # global set up by the parent process before the workers run; confirm.
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        # Reconcile chromosome naming between the request and this BAM.
        chrom = mungeChromosome(chrom, fh.references)
        # Duplicate-detection state (assumes coordinate-sorted input).
        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                # flag 4 == unmapped
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            # Fragment-length filters on the (absolute) template length.
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)

    return olist, gtf.features, total
def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested. Note that reads are
    NOT extended, due to there being a 0 length input list of BAM files!

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([ 4.,  5.])

    In the following case the reads length is 50. Reads are not extended.

    >>> c.extendReads=False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([ 2.,  4.,  4.])

    Parameters: bamHandle is either a pysam BAM handle or a pyBigWig handle
    (detected at runtime below); regions is a list of (start, end) or
    (start, end, tileSize) tuples on `chrom`.
    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    # Total number of output bins: one per region, or (length // tileSize)
    # per region when a tile size is given.
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
    coverages = np.zeros(nbins, dtype='float64')

    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        # Fetch window is widened by the extension so reads starting just
        # outside the region but extending into it are still seen.
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklist are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0
        try:
            # BAM input
            if chrom in bamHandle.references:
                reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                         if r.flag & 4 == 0]
            else:
                raise NameError("chromosome {} not found in bam file".format(chrom))
        except Exception:
            # bigWig input, as used by plotFingerprint. A bigWig handle has
            # no .references attribute, so the try block raises and we land
            # here. (Narrowed from a bare "except:" so Ctrl-C still works.)
            if bamHandle.chroms(chrom):
                # np.float was removed in NumPy 1.24; np.float64 is the
                # equivalent concrete dtype.
                _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                _[np.isnan(_)] = 0.0
                _ = _ * tileSize
                coverages += _
                continue
            else:
                raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

        prev_start_pos = None  # to store the start positions
        # of previous processed read pair
        for read in reads:
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs (compares only the immediately preceding read; assumes
            # coordinate-sorted input)
            if self.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being
                # evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue
                # Clip the fragment to the region/vector bounds.
                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if eIdx >= len(coverages):
                    eIdx = len(coverages) - 1
                # Blocks of one read must not double count a bin.
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)
                if sIdx >= eIdx:
                    continue

                # First bin gets only the covered portion; interior bins get
                # the full tileSize; the last bin gets the remainder.
                if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                    _ = fragmentEnd - fragmentStart
                else:
                    _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                if _ > tileSize:
                    _ = tileSize
                coverages[sIdx] += _
                _ = sIdx + 1
                while _ < eIdx:
                    coverages[_] += tileSize
                    _ += 1
                while eIdx - sIdx >= nRegBins:
                    eIdx -= 1
                if eIdx > sIdx:
                    _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                    if _ > tileSize:
                        _ = tileSize
                    elif _ < 0:
                        _ = 0
                    coverages[eIdx] += _
                last_eIdx = eIdx

            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.

    offset selects which 50 kb slice of [start, end) this worker samples.
    Returns (filtered, tot): reads rejected by the filters vs. total reads
    inspected in the window.
    """
    bam = bamHandler.openBam(bamFile)
    # Each worker samples its own non-overlapping 50 kb slice.
    start += offset * 50000
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    if end <= start:
        # Slice lies beyond the region; nothing to sample.
        return (filtered, tot)

    # Duplicate-detection state: fragments seen at the current start
    # coordinate; assumes coordinate-sorted input.
    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if read.is_unmapped:
                # Unmapped reads count toward tot but not toward filtered.
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.

    Returns (filtered, tot): reads rejected by the filters vs. total reads
    inspected in the window.
    """
    bam = bamHandler.openBam(bamFile)
    # Inspect at most 50 kb so the estimate stays cheap.
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    # Duplicate-detection state: fragments seen at the current start
    # coordinate; assumes coordinate-sorted input.
    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if read.is_unmapped:
                # Unmapped reads count toward tot but not toward filtered.
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not (
                                (read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)