Example #1
def convertBED(oname, tmpFiles, chromDict):
    """
    Stores results in BEDPE format (as defined by MACS2, one line per fragment), which is:
    chromosome	frag_leftend	frag_rightend

    The fragment ends can be shifted
    """
    ofile = open(oname, "w")
    for tmpFile in tmpFiles:
        fh = pysam.AlignmentFile(tmpFile)

        for b in fh.fetch(until_eof=True):
            tLen = getTLen(b, notAbs=True)
            if tLen > 0:
                start = b.pos
                end = start + tLen
                if end > chromDict[b.reference_name]:
                    end = chromDict[b.reference_name]
                if end - start < 1:
                    continue
                ofile.write("{}\t{}\t{}\n".format(b.reference_name, start,
                                                  end))
        fh.close()
        os.unlink(tmpFile)
    ofile.close()
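The function above only writes fragments with a positive template length, clips the fragment end to the chromosome length, and drops fragments that become empty after clipping. A minimal, self-contained sketch of that clipping rule (the helper name and the toy chromDict are illustrative, not deepTools API):

def clip_fragment(chrom, start, tlen, chrom_len):
    """Return a (chrom, start, end) record clipped to the chromosome,
    or None if the clipped fragment would be empty (mirrors convertBED)."""
    end = min(start + tlen, chrom_len)
    if end - start < 1:
        return None
    return (chrom, start, end)

chromDict = {"chr1": 1000}  # hypothetical chromosome sizes
print(clip_fragment("chr1", 990, 50, chromDict["chr1"]))   # ('chr1', 990, 1000)
print(clip_fragment("chr1", 1000, 50, chromDict["chr1"]))  # None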
Example #2
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
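Each call returns (olist, features, total): one feature-to-count dict per BAM file plus a per-file count of reads passing the filters, for a single region. A hedged sketch of how the per-region results might be summed afterwards (the merge helper is illustrative, not the plotEnrichment driver code):

def merge_enrichment(results):
    """Sum per-feature and per-file counts over the tuples returned by
    getEnrichment_worker for many regions."""
    merged_counts, merged_totals, features = None, None, None
    for olist, feats, total in results:
        if merged_counts is None:
            merged_counts = [dict(d) for d in olist]
            merged_totals = list(total)
            features = feats
            continue
        for i, d in enumerate(olist):
            for feat, n in d.items():
                merged_counts[i][feat] = merged_counts[i].get(feat, 0) + n
            merged_totals[i] += total[i]
    return merged_counts, features, merged_totals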
Example #3
def shiftRead(b, chromDict, args):
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start), )  # Returned cigar is only matches
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2
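After applying the requested shifts, the function clamps the interval: it enforces a minimum length of 1 anchored at the strand-specific end, clips to [0, chromosome length], and discards the read if nothing remains. A standalone sketch of just that sanity check (the helper name is illustrative):

def sanity_check_interval(start, end, chrom_len, is_reverse):
    """Clamp a shifted (start, end) interval the way shiftRead does:
    enforce a minimum length of 1 anchored at the strand-specific end,
    then clip to [0, chrom_len]; return None if nothing is left."""
    if end - start < 1:
        if is_reverse:
            start = end - 1
        else:
            end = start + 1
    start = max(start, 0)
    end = min(end, chrom_len)
    if end - start < 1:
        return None
    return start, end

print(sanity_check_interval(120, 118, 1000, is_reverse=True))   # (117, 118)
print(sanity_check_interval(-3, 0, 1000, is_reverse=False))     # None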
Example #4
def shiftRead(b, chromDict, args):
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start),)  # Returned cigar is only matches
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2
Example #5
def convertBED(oname, tmpFiles, chromDict):
    """
    Stores results in BEDPE format (as defined by MACS2, one line per fragment), which is:
    chromosome	frag_leftend	frag_rightend

    The fragment ends can be shifted
    """
    ofile = open(oname, "w")
    for tmpFile in tmpFiles:
        fh = pysam.AlignmentFile(tmpFile)

        for b in fh.fetch(until_eof=True):
            tLen = getTLen(b, notAbs=True)
            if tLen > 0:
                start = b.pos
                end = start + tLen
                if end > chromDict[b.reference_name]:
                    end = chromDict[b.reference_name]
                if end - start < 1:
                    continue
                ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end))
        fh.close()
        os.unlink(tmpFile)
    ofile.close()
Example #6
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED,
                     keepExons=args.keepExons,
                     labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext,
                              read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(
                chrom,
                getBAMBlocks(read, defaultFragmentLength, args.centerReads,
                             args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
Example #7
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are 36 bases long. The number of overlapping
        read fragments is 4 and 5 for the two positions tested. Note that reads
        are NOT extended, because the input list of BAM files has length 0!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following case the read length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([2., 4., 4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            try:
                # BAM input
                if chrom not in bamHandle.references:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
            except:
                # bigWig input, as used by plotFingerprint
                if bamHandle.chroms(chrom):
                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=float)
                    _[np.isnan(_)] = 0.0
                    _ = _ * tileSize
                    coverages += _
                    continue
                else:
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

            prev_pos = set()  # duplicate keys seen at the current start position
            lpos = None  # start position of the previously processed read
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
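The bin-update arithmetic above (partial first tile, full middle tiles, partial last tile) amounts to adding, for each tile, the number of fragment bases that fall inside it. A simplified, self-contained restatement that is handy for checking the loop on small inputs (the helper is an illustration, not part of the class):

import numpy as np

def add_fragment_coverage(coverages, reg_start, tile_size, frag_start, frag_end):
    """Add, for one fragment, the number of covered bases in each fixed-size
    tile of a region starting at reg_start."""
    for i in range(len(coverages)):
        bin_start = reg_start + i * tile_size
        bin_end = bin_start + tile_size
        overlap = min(frag_end, bin_end) - max(frag_start, bin_start)
        if overlap > 0:
            coverages[i] += overlap
    return coverages

cov = np.zeros(3)
# a 25 bp fragment over three 10 bp tiles starting at position 100
print(add_fragment_coverage(cov, 100, 10, 103, 128))  # tiles get 7, 10 and 8 of the 25 bases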
Example #8
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(
                chrom,
                getBAMBlocks(read, defaultFragmentLength, args.centerReads,
                             args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
Example #9
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.
    """
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    tot = 0
    filtered = 0
    prev_start_pos = None  # to store the start positions
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                filtered += 1
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
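Each worker returns a (filtered, tot) pair for its window; summing these across windows gives an overall fraction of alignments kept. A minimal sketch of that aggregation (the helper is hypothetical, not part of deepTools):

def fraction_kept(results):
    """Combine (filtered, tot) pairs from several workers into the overall
    fraction of reads that survive filtering."""
    filtered = sum(f for f, _ in results)
    tot = sum(t for _, t in results)
    return 1.0 - float(filtered) / tot if tot > 0 else 1.0

print(fraction_kept([(10, 100), (0, 50), (5, 50)]))  # 0.925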
Example #10
def filterWorker(arglist):
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)

    mode = 'wbu'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
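The --filterRNAstrand branch above is a pair of SAM flag bitmask tests: for paired-end data, 'forward' keeps read2 mapped to the forward strand (flag & 144 == 128) or read1 whose mate maps forward (flag & 96 == 64). A self-contained restatement of that paired-end test, useful for checking individual flag values (the helper name is illustrative):

def paired_forward(flag):
    """Paired-end 'forward' test used by the filterRNAstrand branch:
    read2 on the forward strand, or read1 whose mate is on the forward strand."""
    return flag & 144 == 128 or flag & 96 == 64

print(paired_forward(163))  # True:  paired, proper, read2 forward, mate reverse
print(paired_forward(99))   # False: paired, proper, read1 forward, mate reverse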
Example #11
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.
    """
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    tot = 0
    filtered = 0
    prev_start_pos = None  # to store the start positions
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                filtered += 1
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
Example #12
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
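When --ignoreDuplicates is set, the variants above key each alignment by the inferred fragment bounds (or the mate start for inter-chromosomal pairs), the mate reference id and the read orientation, and clear the set of seen keys whenever the start position changes. A small sketch of just the key construction, with plain integers standing in for pysam read attributes (illustrative helper):

def dup_key(ref_start, pnext, ref_id, next_ref_id, tlen, is_reverse):
    """Build the duplicate-detection key used above: fragment bounds for
    concordant pairs, mate start for inter-chromosomal pairs, plus the mate
    reference id and the read orientation."""
    if tlen >= 0:
        s, e = ref_start, ref_start + tlen
    else:
        s, e = pnext, pnext - tlen
    if ref_id != next_ref_id:
        e = pnext
    return (s, e, next_ref_id, is_reverse)

# the two mates of one concordant pair share fragment bounds but differ in orientation
print(dup_key(100, 300, 0, 0, 250, False))   # (100, 350, 0, False)
print(dup_key(300, 100, 0, 0, -250, True))   # (100, 350, 0, True)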
Example #13
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are 36 bases long. The number of overlapping
        read fragments is 4 and 5 for the two positions tested. Note that reads
        are NOT extended, because the input list of BAM files has length 0!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([ 4.,  5.])

        In the following case the read length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([ 2.,  4.,  4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            try:
                # BAM input
                if chrom in bamHandle.references:
                    reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                             if r.flag & 4 == 0]
                else:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
            except:
                # bigWig input, as used by plotFingerprint
                if bamHandle.chroms(chrom):
                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=float)
                    _[np.isnan(_)] = 0.0
                    _ = _ * tileSize
                    coverages += _
                    continue
                else:
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

            prev_start_pos = None  # start position of the previously processed read pair
            for read in reads:
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                    continue

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
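In the bigWig fallback branch above (used by plotFingerprint), the per-bin mean coverage returned by stats() is turned into summed base coverage by zeroing NaNs and multiplying by the tile size. A tiny sketch with plain numbers standing in for the stats() output:

import numpy as np

# hypothetical per-bin means, as stats() might return them (None/NaN for empty bins)
mean_per_bin = np.array([1.5, np.nan, 0.25], dtype=float)
tile_size = 100
mean_per_bin[np.isnan(mean_per_bin)] = 0.0
summed_per_bin = mean_per_bin * tile_size
print(summed_per_bin)  # 150.0, 0.0, 25.0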
Example #14
def filterWorker(arglist):
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)

    mode = 'wbu'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
Example #15
def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    50 kb window of the region selected by `offset`.
    """
    bam = bamHandler.openBam(bamFile)
    start += offset * 50000
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    if end <= start:
        return (filtered, tot)

    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if read.is_unmapped:
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
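This variant takes an extra offset so several workers can each scan a different 50 kb window of the same region (start is advanced by offset * 50000 and the window is empty once it passes end). A sketch of how a caller might enumerate those windows (the generator is illustrative):

def windows_50kb(start, end):
    """Yield (offset, window_start, window_end) for successive 50 kb windows."""
    offset = 0
    while start + offset * 50000 < end:
        w_start = start + offset * 50000
        w_end = min(end, w_start + 50000)
        yield offset, w_start, w_end
        offset += 1

print(list(windows_50kb(0, 120000)))
# [(0, 0, 50000), (1, 50000, 100000), (2, 100000, 120000)]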
Example #16
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.
    """
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    tot = 0
    filtered = 0
    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if read.is_unmapped:
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not (
                            (read.flag & 128 == 128 and read.flag & 16 == 0) or
                            (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144
                                or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)