예제 #1
0
def writeBedGraph_worker(chrom, start, end, tileSize, defaultFragmentLength,
                         bamFilesList, func, funcArgs, extendPairedEnds=True,
                         smoothLength=0, zerosToNans=True,
                         minMappingQuality=None,
                         ignoreDuplicates=False,
                         fragmentFromRead_func=None,
                         centerRead=False, samFlag=None):

    r"""
    Writes a bedgraph having as base a number of bam files.

    The coverage of every bam file in `bamFilesList` is computed for the
    region ``chrom:start-end``. For each tile (`tileSize` bp in the output
    coordinates) the given `func` is called as
    ``func(tileCoverage, funcArgs)``, where `tileCoverage` has one value per
    bam file, to compute the desired bedgraph value. Consecutive tiles with
    the same value are merged into a single bedgraph line and nan values
    are never written.

    Parameters
    ----------
    chrom, start, end :
        Region to process; forwarded to `getCoverageOfRegion`.
    tileSize :
        Tile length, used both for the coverage computation and to turn a
        tile index into bedgraph coordinates.
    defaultFragmentLength, extendPairedEnds, zerosToNans, minMappingQuality,
    ignoreDuplicates, fragmentFromRead_func, centerRead, samFlag :
        Forwarded unchanged to `getCoverageOfRegion`.
    bamFilesList :
        Bam files to open (through `openBam`); must not be empty.
    func, funcArgs :
        Callable, and its second argument, used to reduce the per-file tile
        coverages to one value.
    smoothLength :
        If greater than 0, each tile coverage is replaced by the mean of
        the coverage over the range returned by `getSmoothRange`.

    Returns
    -------
    Name of the temporary ``.bg`` file that was written.

    Raises
    ------
    NameError
        If `start` is bigger than `end`.

    Notes
    -----
    Lines written while scanning use two decimals while the last line of
    the region uses one decimal; the doctests below depend on that.

    >>> test = Tester()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs, True, 0, False)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test the file being written for single end reads with
    no extension and no smoothing
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test scaling
    >>> funcArgs = {'scaleFactor': 3.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t3.0\n']
    >>> os.remove(tempFile)

    Test ignore duplicates
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, ignoreDuplicates=True)
    >>> open(tempFile, 'r').readlines()
    ['3R\t50\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test smoothing
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 20, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, smoothLength=60)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t120\t1.00\n', '3R\t120\t140\t1.67\n', '3R\t140\t160\t2.00\n', '3R\t160\t180\t2.33\n', '3R\t180\t200\t2.0\n']
    >>> os.remove(tempFile)

    Test ratio (needs two bam files)
    >>> funcArgs = {}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 50, 0,
    ... [test.bamFile1, test.bamFile2], ratio , funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t150\t1.00\n', '3R\t150\t200\t0.5\n']
    >>> os.remove(tempFile)


    Test minMapping quality
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, minMappingQuality=40)
    >>> open(tempFile, 'r').readlines()
    ['3R\t150\t200\t1.0\n']
    >>> os.remove(tempFile)

    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    # one coverage vector per bam file, in the order of bamFilesList
    coverage = []
    for bamFile in bamFilesList:
        bamHandle = openBam(bamFile)
        try:
            coverage.append(
                getCoverageOfRegion(
                    bamHandle, chrom, start, end, tileSize,
                    defaultFragmentLength, extendPairedEnds, zerosToNans,
                    ignoreDuplicates=ignoreDuplicates,
                    minMappingQuality=minMappingQuality,
                    fragmentFromRead_func=fragmentFromRead_func,
                    centerRead=centerRead, samFlag=samFlag))
        finally:
            # release the handle even if the coverage computation fails
            bamHandle.close()

    lengthCoverage = len(coverage[0])
    tempFileName = utilities.getTempFileName(suffix='.bg')
    previousValue = None

    with open(tempFileName, 'w') as _file:
        for tileIndex in range(lengthCoverage):
            if smoothLength > 0:
                # the smoothing window only depends on the tile, so it is
                # computed once and shared by all bam files
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage = [np.mean(fileCoverage[vectorStart:vectorEnd])
                                for fileCoverage in coverage]
            else:
                tileCoverage = [fileCoverage[tileIndex]
                                for fileCoverage in coverage]

            value = func(tileCoverage, funcArgs)

            if previousValue is None:
                # first tile with a value: open a new run
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                # same value as before: extend the current run
                writeEnd = min(writeEnd + tileSize, end)

            else:
                # value changed (nan never compares equal, so a nan always
                # ends up here): flush the finished run and open a new one
                if not np.isnan(previousValue):
                    _file.write(
                        "{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart,
                                                      writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

        # write remaining value if not a nan. The explicit `is not None`
        # test (instead of truthiness) keeps a trailing run whose value is
        # 0, in agreement with zero runs written inside the loop.
        if previousValue is not None and writeStart != end \
                and not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previousValue))

    return tempFileName
예제 #2
0
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Reads of the bam file ``global_vars['bam']`` that are mapped and start
    after `start` within ``chrNameBam:start-end`` are tagged and written to
    a temporary bam file. Every written read gets a ``YG`` tag (GC
    percentage of its fragment, or -1 when no GC content was obtained) and,
    when a GC content is available, a ``YC`` tag (``1 / R_gc[gc]`` rounded
    to two decimals) and a ``YN`` tag (number of copies). The module-level
    `R_gc` and `global_vars` (keys ``'2bit'``, ``'bam'`` and
    ``'max_dup_gc'``) must be set before calling.

    Parameters
    ----------
    chrNameBam :
        Chromosome name used to fetch reads from the bam file.
    chrNameBit :
        Chromosome name passed to `getReadGCcontent` for the 2bit lookup.
    start, end :
        Region to process.
    step :
        Not used by this function.
    tag_but_not_change_number :
        If True every read is written exactly once (only tagged); otherwise
        each read is written ``copies`` times (possibly zero).
    verbose :
        Print progress and duplicate-removal information.

    Returns
    -------
    Name of the temporary ``.bam`` file that was written.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    # correction already decided for the forward mate of a proper pair,
    # keyed by read name, so that the second mate gets the same treatment
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        # same condition as the filter above; kept so that r_index always
        # matches the position of `read` inside `reads`
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del(matePairs[read.qname])
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # inspect the type field of the first tag; indexing the
                # list itself (readTag[2]) looked at a whole tag tuple and
                # failed for reads with fewer than three tags
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the decision for the forward mate of a proper pair
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            # NOTE: this `continue` also skips the progress counter below,
            # as in the original implementation
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    # the input bam is no longer needed once all reads were written
    bam.close()
    if verbose:
        endTime = time.time()
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
예제 #3
0
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Mapped reads of ``global_vars['bam']`` overlapping
    ``chrNameBam:start-end`` each add ``1 / R_gc[gc]`` to the positions
    covered by their fragment, where ``gc`` is the value returned by
    `getReadGCcontent`. The per-base signal is then averaged in bins of
    `step` bp and bins with a mean greater than 0 are written. The
    module-level `R_gc`, `global_vars` (keys ``'2bit'``, ``'bam'`` and
    ``'max_dup_gc'``) and `debug` must be set before calling.

    Parameters
    ----------
    chrNameBam :
        Chromosome name used to fetch reads from the bam file.
    chrNameBit :
        Chromosome name used for the 2bit lookup and for the output lines.
    start, end :
        Region to process.
    step :
        Bin length, in bp, of the bedgraph that is written.

    Returns
    -------
    Name of the temporary ``.bg`` file, or None if no read contributed to
    the corrected coverage.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    # per-base corrected coverage; index 0 corresponds to `start`
    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end) if r.flag & 4 == 0]

    bam.close()
    r_index = -1
    for read in reads:
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1
    if debug:
        endTime = time.time()
        # the two literals are one format string; the message was
        # previously printed unformatted because print() was closed
        # before the .format() call
        print("{}, processing {} ({:.1f} per sec) "
              "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                        i, i / (endTime - startTime),
                                        chrNameBit, start, end))

    if i == 0:
        return None

    tempFileName = utilities.getTempFileName(suffix='.bg')
    # save in bedgraph format
    with open(tempFileName, 'w') as _file:
        for binStart in range(0, len(cvg_corr), step):
            # binStart is relative to `start`; slicing clips at the end of
            # the array, so the last (shorter) bin needs no special case
            value = np.mean(cvg_corr[binStart:binStart + step])
            if value > 0:
                writeStart = start + binStart
                writeEnd = min(start + binStart + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" %
                            (chrNameBit, writeStart, writeEnd, value))

    return tempFileName
    def writeBedGraph_worker(self, chrom, start, end,
                             func_to_call, func_args,
                             bed_regions_list=None):
        r"""Writes a bedgraph based on the read coverage found on bamFiles

        The given func is called to compute the desired bedgraph value
        using the funcArgs. Consecutive bins with the same value are merged
        into a single bedgraph line and nan values are never written.
        Bin length and smoothing are taken from ``self.binLength`` and
        ``self.smoothLength``.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : callable
            function called as ``func_to_call(tileCoverage, func_args)`` to
            convert the list of coverages computed for each bam file at
            each position into a single value. An example is a function
            that takes the ratio between the coverage of two bam files.
        func_args : dict
            dict of arguments to pass to `func_to_call`.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            This argument is not read by this method.

        Returns
        -------
        temporary file with the bedgraph results for the region queried.

        Raises
        ------
        NameError
            If `start` is bigger than `end`.

        Examples
        --------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> open(tempFile, 'r').readlines()
        ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
        >>> os.remove(tempFile)


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        # one coverage vector per bam file; each handle is opened right
        # before use and always closed, even if the computation fails
        coverage = []
        for bam_file in self.bamFilesList:
            bam = bamHandler.openBam(bam_file)
            try:
                coverage.append(
                    self.get_coverage_of_region(bam, chrom, start, end,
                                                self.binLength))
            finally:
                bam.close()

        length_coverage = len(coverage[0])
        tempfilename = utilities.getTempFileName(suffix='.bg')
        previous_value = None

        with open(tempfilename, 'w') as _file:
            for tileIndex in range(length_coverage):
                if self.smoothLength > 0:
                    # the smoothing window only depends on the tile, so it
                    # is computed once and shared by all bam files
                    vector_start, vector_end = self.getSmoothRange(
                        tileIndex, self.binLength, self.smoothLength,
                        length_coverage)
                    tileCoverage = [
                        np.mean(file_coverage[vector_start:vector_end])
                        for file_coverage in coverage]
                else:
                    tileCoverage = [file_coverage[tileIndex]
                                    for file_coverage in coverage]

                value = func_to_call(tileCoverage, func_args)

                if previous_value is None:
                    # first bin with a value: open a new run
                    writeStart = start + tileIndex * self.binLength
                    writeEnd = min(writeStart + self.binLength, end)
                    previous_value = value

                elif previous_value == value:
                    # same value as before: extend the current run
                    writeEnd = min(writeEnd + self.binLength, end)

                else:
                    # value changed (nan never compares equal, so a nan
                    # always ends up here): flush the finished run
                    if not np.isnan(previous_value):
                        _file.write(
                            "{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart,
                                                          writeEnd,
                                                          previous_value))
                    previous_value = value
                    writeStart = writeEnd
                    writeEnd = min(writeStart + self.binLength, end)

            # write remaining value if not a nan. The explicit
            # `is not None` test (instead of truthiness) keeps a trailing
            # run whose value is 0, in agreement with zero runs written
            # inside the loop.
            if previous_value is not None and writeStart != end \
                    and not np.isnan(previous_value):
                _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                    end, previous_value))

        return tempfilename
예제 #5
0
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Mapped reads of ``global_vars['bam']`` overlapping
    ``chrNameBam:start-end`` each add ``1 / R_gc[gc]`` to the positions
    covered by their fragment, where ``gc`` is the value returned by
    `getReadGCcontent`. The per-base signal is then averaged in bins of
    `step` bp and bins with a mean greater than 0 are written. The
    module-level `R_gc` and `global_vars` (keys ``'2bit'``, ``'bam'`` and
    ``'max_dup_gc'``) must be set before calling; a module-level `debug`
    flag, if defined and true, enables a progress message.

    Parameters
    ----------
    chrNameBam :
        Chromosome name used to fetch reads from the bam file.
    chrNameBit :
        Chromosome name used for the 2bit lookup and for the output lines.
    start, end :
        Region to process.
    step :
        Bin length, in bp, of the bedgraph that is written.

    Returns
    -------
    Name of the temporary ``.bg`` file, or None if no read contributed to
    the corrected coverage.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    # per-base corrected coverage; index 0 corresponds to `start`
    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    r_index = -1
    for read in reads:
        # unmapped reads were already dropped while caching; the check is
        # kept so that r_index always matches the position inside `reads`
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            # the two literals are one format string; the message was
            # previously printed unformatted because print() was closed
            # before the .format() call
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(
                      multiprocessing.current_process().name,
                      i, i / (endTime - startTime),
                      chrNameBit, start, end))
    except NameError:
        # `debug` is optional: no message if the flag is not defined
        pass

    if i == 0:
        return None

    tempFileName = utilities.getTempFileName(suffix='.bg')
    # save in bedgraph format
    with open(tempFileName, 'w') as _file:
        for binStart in range(0, len(cvg_corr), step):
            # binStart is relative to `start`; slicing clips at the end of
            # the array, so the last (shorter) bin needs no special case
            value = np.mean(cvg_corr[binStart:binStart + step])
            if value > 0:
                writeStart = start + binStart
                writeEnd = min(start + binStart + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                    writeEnd, value))

    return tempFileName
예제 #6
0
def writeCorrectedSam_worker(chrNameBam,
                             chrNameBit,
                             start,
                             end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Reads of the bam file ``global_vars['bam']`` that are mapped and start
    after `start` within ``chrNameBam:start-end`` are tagged and written to
    a temporary bam file. Every written read gets a ``YG`` tag (GC
    percentage of its fragment, or -1 when no GC content was obtained) and,
    when a GC content is available, a ``YC`` tag (``1 / R_gc[gc]`` rounded
    to two decimals) and a ``YN`` tag (number of copies). The module-level
    `R_gc` and `global_vars` (keys ``'2bit'``, ``'bam'`` and
    ``'max_dup_gc'``) must be set before calling.

    Parameters
    ----------
    chrNameBam :
        Chromosome name used to fetch reads from the bam file.
    chrNameBit :
        Chromosome name passed to `getReadGCcontent` for the 2bit lookup.
    start, end :
        Region to process.
    step :
        Not used by this function.
    tag_but_not_change_number :
        If True every read is written exactly once (only tagged); otherwise
        each read is written ``copies`` times (possibly zero).
    verbose :
        Print progress and duplicate-removal information.

    Returns
    -------
    Name of the temporary ``.bam`` file that was written.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    # correction already decided for the forward mate of a proper pair,
    # keyed by read name, so that the second mate gets the same treatment
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [
        r for r in bam.fetch(chrNameBam, start, end)
        if r.pos > start and r.flag & 4 == 0
    ]

    r_index = -1
    for read in reads:
        # same condition as the filter above; kept so that r_index always
        # matches the position of `read` inside `reads`
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del (matePairs[read.qname])
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # inspect the type field of the first tag; indexing the
                # list itself (readTag[2]) looked at a whole tag tuple and
                # failed for reads with fewer than three tags
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the decision for the forward mate of a proper pair
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}

        if tag_but_not_change_number:
            # NOTE: this `continue` also skips the progress counter below,
            # as in the original implementation
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(
                          multiprocessing.current_process().name, i,
                          i / (endTime - startTime), chrNameBit, start, end))
        i += 1

    outfile.close()
    # the input bam is no longer needed once all reads were written
    bam.close()
    if verbose:
        endTime = time.time()
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name, i,
                                  i / (endTime - startTime), chrNameBit, start,
                                  end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
예제 #7
0
def main(args=None):
    """
    Run the GC-bias correction and write the corrected output file.

    The GC frequency table named by ``args.GCbiasFrequenciesFile`` is loaded
    into the module globals ``F_gc``, ``N_gc`` and ``R_gc`` and the shared
    settings into ``global_vars``. The references of the BAM file are split
    into chunks which are processed by ``writeCorrectedSam_wrapper`` (output
    name ending in ``bam``) or ``writeCorrected_wrapper`` (output name ending
    in ``bg`` or ``bw``); the per-chunk temporary files are then merged into
    ``args.correctedFile`` and removed.

    :param args: forwarded to ``process_args``.
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    # the 2bit genome is a binary format, so open it in binary mode
    bit = twobit.TwoBitFile(open(global_vars['2bit'], 'rb'))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(bit.index.keys()), bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))
    for chrom, size in chromSizes:
        # the check does not depend on the chunk, so do it (and report it)
        # once per chromosome
        if chrom not in chrNameBamToBit:
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))

    use_pool = len(mp_args) > 1 and args.numberOfProcessors > 1

    def run_tasks(worker):
        # Apply `worker` to every task, in a process pool when that pays off,
        # and always release the pool's worker processes afterwards.
        if not use_pool:
            return [worker(task) for task in mp_args]
        pool = multiprocessing.Pool(args.numberOfProcessors)
        try:
            return pool.map_async(worker, mp_args).get(9999999)
        finally:
            pool.terminate()
            pool.join()

    if args.correctedFile.name.endswith('bam'):
        if use_pool:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))
        res = run_tasks(writeCorrectedSam_wrapper)

        if len(res) == 1:
            # plain file copy; no shell involved, so any file name is safe
            shutil.copyfile(res[0], args.correctedFile.name)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for chunkName in res:
                chunk = pysam.Samfile(chunkName)
                for e in chunk.fetch(until_eof=True):
                    of.write(e)
                chunk.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith(('bg', 'bw')):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        res = run_tasks(writeCorrected_wrapper)

        # concatenate all intermediate tempfiles into one bedgraph file;
        # both sides are opened in binary mode so the bytes are copied as is
        with open(_temp_bg_file_name, 'wb') as _temp_bg_file:
            for tempFileName in res:
                if tempFileName:
                    with open(tempFileName, 'rb') as chunk:
                        shutil.copyfileobj(chunk, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)

        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # remove by path: os.remove() does not accept a file object
            os.remove(_temp_bg_file_name)
예제 #8
0
def filterWorker(arglist):
    """
    Filter the reads of one genomic chunk into temporary BAM files.

    ``arglist`` is unpacked as ``(chrom, start, end, args, chromDict)``.
    Reads fetched from ``args.bam`` in ``chrom:start-end`` that start before
    ``start`` are skipped without being counted. Every other read is counted
    and tested, in this order, against: the unmapped flag, the minimum
    mapping quality, the include/exclude SAM flags, the fragment length
    bounds, duplicate removal and the RNA strand filter. Rejected reads are
    written to a second file when ``args.filteredOutReads`` is set. Accepted
    reads are optionally passed through ``shiftRead`` and written out.

    Returns ``(tid, start, total, nFiltered, oname, onameFiltered)`` where
    ``onameFiltered`` is None when rejected reads are not kept.
    """
    chrom, start, end, args, chromDict = arglist
    bamIn = openBam(args.bam)

    bamMode = 'wbu'
    oname = getTempFileName(suffix='.bam')
    onameFiltered = getTempFileName(suffix='.bam') if args.filteredOutReads else None
    keptOut = pysam.AlignmentFile(oname, mode=bamMode, template=bamIn)
    rejectedOut = None
    if onameFiltered:
        rejectedOut = pysam.AlignmentFile(onameFiltered, mode=bamMode, template=bamIn)

    # State of the duplicate detection: the fragments already seen at the
    # most recent read start position.
    seenFragments = set()
    dupState = {'lpos': None}

    def isDuplicate(read, tLen):
        # Assuming more or less concordant reads, use the fragment bounds,
        # otherwise the start positions
        if tLen >= 0:
            fragStart = read.pos
            fragEnd = fragStart + tLen
        else:
            fragStart = read.pnext
            fragEnd = fragStart - tLen
        if read.reference_id != read.next_reference_id:
            fragEnd = read.pnext
        key = (fragStart, fragEnd, read.next_reference_id, read.is_reverse)
        here = read.reference_start
        if dupState['lpos'] is not None and dupState['lpos'] == here \
                and key in seenFragments:
            return True
        # a new start position invalidates everything remembered so far
        if dupState['lpos'] != here:
            seenFragments.clear()
        dupState['lpos'] = here
        seenFragments.add(key)
        return False

    def hasWrongStrand(read):
        wanted = args.filterRNAstrand
        if read.is_paired:
            if wanted == 'forward':
                return not (read.flag & 144 == 128 or read.flag & 96 == 64)
            if wanted == 'reverse':
                return not (read.flag & 144 == 144 or read.flag & 96 == 96)
            return False
        if wanted == 'forward':
            return read.flag & 16 != 16
        if wanted == 'reverse':
            return read.flag & 16 != 0
        return False

    def isRejected(read):
        if read.flag & 4:
            # unmapped reads are rejected here
            return True
        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            return True
        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            return True
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            return True
        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            return True
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            return True
        if args.ignoreDuplicates and isDuplicate(read, tLen):
            return True
        if args.filterRNAstrand and hasWrongStrand(read):
            return True
        return False

    nFiltered = 0
    total = 0
    for read in bamIn.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if isRejected(read):
            nFiltered += 1
            if rejectedOut:
                rejectedOut.write(read)
            continue

        if args.shift:
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        keptOut.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = bamIn.get_tid(chrom)

    keptOut.close()
    if rejectedOut:
        rejectedOut.close()
    bamIn.close()
    return tid, start, total, nFiltered, oname, onameFiltered
예제 #9
0
def main(args=None):
    """
    Run the GC-bias correction and write the corrected output file.

    The GC frequency table named by ``args.GCbiasFrequenciesFile`` is loaded
    into the module globals ``F_gc``, ``N_gc`` and ``R_gc`` and the shared
    settings into ``global_vars``. The references of the BAM file are split
    into chunks which are processed by ``writeCorrectedSam_wrapper`` (output
    name ending in ``bam``) or ``writeCorrected_wrapper`` (output name ending
    in ``bg`` or ``bw``); the per-chunk temporary files are then merged into
    ``args.correctedFile`` and removed.

    :param args: forwarded to ``process_args``.
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 /
                  N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    # the 2bit genome is a binary format, so open it in binary mode
    bit = twobit.TwoBitFile(open(global_vars['2bit'], 'rb'))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(bit.index.keys()), bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))
    for chrom, size in chromSizes:
        # the check does not depend on the chunk, so do it (and report it)
        # once per chromosome
        if chrom not in chrNameBamToBit:
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))

    use_pool = len(mp_args) > 1 and args.numberOfProcessors > 1

    def run_tasks(worker):
        # Apply `worker` to every task, in a process pool when that pays off,
        # and always release the pool's worker processes afterwards.
        if not use_pool:
            return [worker(task) for task in mp_args]
        pool = multiprocessing.Pool(args.numberOfProcessors)
        try:
            return pool.map_async(worker, mp_args).get(9999999)
        finally:
            pool.terminate()
            pool.join()

    if args.correctedFile.name.endswith('bam'):
        if use_pool:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))
        res = run_tasks(writeCorrectedSam_wrapper)

        if len(res) == 1:
            # plain file copy; no shell involved, so any file name is safe
            shutil.copyfile(res[0], args.correctedFile.name)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for chunkName in res:
                chunk = pysam.Samfile(chunkName)
                for e in chunk.fetch(until_eof=True):
                    of.write(e)
                chunk.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith(('bg', 'bw')):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        res = run_tasks(writeCorrected_wrapper)

        # concatenate all intermediate tempfiles into one bedgraph file;
        # both sides are opened in binary mode so the bytes are copied as is
        with open(_temp_bg_file_name, 'wb') as _temp_bg_file:
            for tempFileName in res:
                if tempFileName:
                    with open(tempFileName, 'rb') as chunk:
                        shutil.copyfileobj(chunk, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)

        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # remove by path: os.remove() does not accept a file object
            os.remove(_temp_bg_file_name)
예제 #10
0
def writeBedGraph_worker(chrom,
                         start,
                         end,
                         tileSize,
                         defaultFragmentLength,
                         bamFilesList,
                         func,
                         funcArgs,
                         extendPairedEnds=True,
                         smoothLength=0,
                         zerosToNans=True,
                         minMappingQuality=None,
                         ignoreDuplicates=False,
                         fragmentFromRead_func=None,
                         centerRead=False):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs

    The coverage of every file in ``bamFilesList`` over ``chrom:start-end``
    is obtained from ``getCoverageOfRegion`` in tiles of ``tileSize`` bases.
    For each tile ``func(tileCoverage, funcArgs)`` is called with the list of
    per-file values (averaged over the range given by ``getSmoothRange`` when
    ``smoothLength > 0``). Consecutive tiles with the same value are merged
    into one bedgraph line and NaN values are not written.

    ``defaultFragmentLength``, ``extendPairedEnds``, ``zerosToNans``,
    ``minMappingQuality``, ``ignoreDuplicates``, ``fragmentFromRead_func``
    and ``centerRead`` are passed on to ``getCoverageOfRegion``.

    Returns the name of the temporary bedgraph file that was written.
    Raises NameError if ``start`` is bigger than ``end``.

    tileSize
    >>> test = Tester()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs, True, 0, False)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test the file being writen for single end reads with
    no extension and no smoothing
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test scaling
    >>> funcArgs = {'scaleFactor': 3.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t3.0\n']
    >>> os.remove(tempFile)

    Test ignore duplicates
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, ignoreDuplicates=True)
    >>> open(tempFile, 'r').readlines()
    ['3R\t50\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test smoothing
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 20, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, smoothLength=60)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t120\t1.00\n', '3R\t120\t140\t1.67\n', '3R\t140\t160\t2.00\n', '3R\t160\t180\t2.33\n', '3R\t180\t200\t2.0\n']
    >>> os.remove(tempFile)

    Test ratio (needs two bam files)
    >>> funcArgs = {}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 50, 0,
    ... [test.bamFile1, test.bamFile2], ratio , funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t150\t1.00\n', '3R\t150\t200\t0.5\n']
    >>> os.remove(tempFile)


    Test minMapping quality
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, minMappingQuality=40)
    >>> open(tempFile, 'r').readlines()
    ['3R\t150\t200\t1.0\n']
    >>> os.remove(tempFile)

    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage = []
    for bamFile in bamFilesList:
        bamHandle = openBam(bamFile)
        try:
            coverage.append(
                getCoverageOfRegion(bamHandle,
                                    chrom,
                                    start,
                                    end,
                                    tileSize,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    ignoreDuplicates=ignoreDuplicates,
                                    minMappingQuality=minMappingQuality,
                                    fragmentFromRead_func=fragmentFromRead_func,
                                    centerRead=centerRead))
        finally:
            # release the handle even if the coverage computation fails
            bamHandle.close()

    tempFileName = utilities.getTempFileName(suffix='.bg')
    previousValue = None
    lengthCoverage = len(coverage[0])

    with open(tempFileName, 'w') as _file:
        for tileIndex in range(lengthCoverage):
            if smoothLength > 0:
                # the smoothing window is the same for every bam file
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage = [np.mean(fileCoverage[vectorStart:vectorEnd])
                                for fileCoverage in coverage]
            else:
                tileCoverage = [fileCoverage[tileIndex]
                                for fileCoverage in coverage]

            value = func(tileCoverage, funcArgs)

            if previousValue is None:
                # first tile: open the first run
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                # same value as the current run: extend it
                writeEnd = min(writeEnd + tileSize, end)

            else:
                # value changed (NaN never compares equal, so a NaN always
                # ends up here): flush the finished run and open a new one
                if not np.isnan(previousValue):
                    _file.write("{}\t{}\t{}\t{:.2f}\n".format(
                        chrom, writeStart, writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

        # write remaining value if not a nan. The test is `is not None`
        # rather than truthiness so that a trailing run of 0 is written like
        # any other run. The one-decimal format of this last line differs
        # from the lines above; it is kept because the doctests rely on it.
        if previousValue is not None and writeStart != end \
                and not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" %
                        (chrom, writeStart, end, previousValue))

    return (tempFileName)
예제 #11
0
def writeCorrectedSam_worker(chrNameBam,
                             chrNameBit,
                             start,
                             end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a SAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    The reads of ``global_vars['bam']`` in ``chrNameBam:start-end`` are
    tagged with ``CO``, ``CP`` and ``GC`` and written to a temporary SAM file
    zero, one or several times according to the number of copies computed
    for them; the SAM file is then converted to BAM with samtools.

    :param chrNameBam: chromosome name used to fetch reads from the BAM file
    :param chrNameBit: chromosome name passed to ``getReadGCcontent``
    :param start: start of the region
    :param end: end of the region
    :param step: not used by this function
    :param tag_but_not_change_number: if True every read is written exactly
        once, only the tags are added
    :param verbose: if True progress information is printed
    :return: name of the temporary BAM file

    First, check if samtools can be executed, otherwise the test will fail
    >>> resp = cfg.checkProgram(samtools, 'view', '')
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))

    # the 2bit genome is a binary format, so open it in binary mode
    twobitHandle = open(global_vars['2bit'], 'rb')
    tbit = twobit.TwoBitFile(twobitHandle)

    bam = pysam.Samfile(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.sam')

    outfile = pysam.Samfile(tempFileName, 'wh', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    try:
        # cache data
        # r.flag & 4 == 0 is to filter unmapped reads that
        # have a genomic position
        # NOTE(review): `r.pos > start` also drops reads starting exactly at
        # `start`; this looks like an off-by-one but is kept because the
        # expected doctest output may depend on it -- confirm before changing
        reads = [
            r for r in bam.fetch(chrNameBam, start, end)
            if r.pos > start and r.flag & 4 == 0
        ]

        for r_index, read in enumerate(reads):
            # check if a mate has already been processed
            # to apply the same correction
            try:
                mate = matePairs.pop(read.qname)
            except KeyError:
                # no mate was stored for this read (first mate of a pair,
                # single end read, or mate removed by some filtering)
                gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
                if gc:
                    copies = numCopiesOfRead(float(1) / R_gc[gc])
                else:
                    copies = 1
            else:
                copies = mate['copies']
                gc = mate['gc']

            # is this read in the same orientation and position as the
            # previous?
            if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                    and read.is_reverse == reads[r_index - 1].is_reverse \
                    and read.pnext == reads[r_index - 1].pnext:
                read_repetitions += 1
                if read_repetitions >= global_vars['max_dup_gc'][gc]:
                    # in other words do not take into account this read
                    copies = 0
                    removed_duplicated_reads += 1
            else:
                read_repetitions = 0

            readName = read.qname
            readTag = read.tags
            if gc:
                GC = int(100 * np.round(float(gc) / fragmentLength,
                                        decimals=2))
                readTag.append(('CO', float(round(float(1) / R_gc[gc], 2))))
                readTag.append(('CP', copies))
            else:
                GC = -1

            readTag.append(('GC', GC))
            read.tags = readTag

            # remember the correction of a forward, properly paired read so
            # that its mate gets the same one
            if read.is_paired and read.is_proper_pair \
                    and not read.mate_is_unmapped \
                    and not read.is_reverse:
                matePairs[readName] = {'copies': copies, 'gc': gc}

            if tag_but_not_change_number:
                outfile.write(read)
            else:
                for numCop in range(1, copies + 1):
                    # the read has to be renamed such that newly
                    # formed pairs will match
                    if numCop > 1:
                        read.qname = readName + "_%d" % (numCop)
                    outfile.write(read)

            if verbose and r_index > 0 and r_index % 500000 == 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(
                          multiprocessing.current_process().name, r_index,
                          r_index / (endTime - startTime), chrNameBit,
                          start, end))
    finally:
        # release the file handles even when a read cannot be processed
        outfile.close()
        bam.close()
        twobitHandle.close()

    if verbose:
        endTime = time.time()
        processed = len(reads)
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  processed,
                                  processed / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    # convert sam to bam.
    command = '{0} view -bS {1} 2> /dev/null > {1}.bam'.format(
        samtools, tempFileName)
    if verbose:
        sys.stderr.write("running {}\n".format(command))

    run_shell_command(command)

    os.remove(tempFileName)
    return tempFileName + ".bam"
예제 #12
0
def _failsRNAStrandFilter(read, strand):
    """Tell whether ``read`` must be rejected by the RNA strand filter.

    The decision is taken purely from SAM FLAG bits
    (0x10 read reverse, 0x20 mate reverse, 0x40 first in pair,
    0x80 second in pair).

    Parameters
    ----------
    read : alignment record
        Object exposing ``flag`` and ``is_paired``.
    strand : str
        'forward' or 'reverse'. Any other value rejects nothing.

    Returns
    -------
    bool
        True if the read does not belong to the requested strand.
    """
    flag = read.flag
    if read.is_paired:
        if strand == 'forward':
            # keep: second in pair on the + strand (144 = 128 + 16), or
            #       first in pair whose mate is on the + strand (96 = 64 + 32)
            return not ((flag & 144) == 128 or (flag & 96) == 64)
        if strand == 'reverse':
            # keep: second in pair on the - strand, or
            #       first in pair whose mate is on the - strand
            return not ((flag & 144) == 144 or (flag & 96) == 96)
    else:
        if strand == 'forward':
            # single-end: keep reads flagged as reverse
            return (flag & 16) != 16
        if strand == 'reverse':
            # single-end: keep reads not flagged as reverse
            return (flag & 16) != 0
    return False


def filterWorker(arglist):
    """Filter the reads of one genomic region into temporary BAM files.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args, chromDict)``. ``args`` is the options
        object; the attributes read here are ``bam``, ``filteredOutReads``,
        ``minMappingQuality``, ``samFlagInclude``, ``samFlagExclude``,
        ``minFragmentLength``, ``maxFragmentLength``, ``ignoreDuplicates``,
        ``filterRNAstrand`` and ``shift``. ``chromDict`` is only forwarded
        to ``shiftRead``.

    Returns
    -------
    tuple
        ``(tid, start, total, nFiltered, oname, onameFiltered)`` where
        ``tid`` is the reference id of ``chrom``, ``total`` the number of
        reads examined, ``nFiltered`` the number rejected, ``oname`` the
        temporary file holding the surviving reads and ``onameFiltered``
        the temporary file holding the rejected reads (``None`` unless
        ``args.filteredOutReads`` is set).

    Notes
    -----
    Filters are applied in a fixed order: unmapped, mapping quality,
    include flag, exclude flag, fragment length, duplicates, RNA strand.
    The first filter that matches rejects the read. A read for which
    ``shiftRead`` returns a false value is dropped silently: it is neither
    written to either output nor counted in ``nFiltered``.
    """
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)

    mode = 'wbu'
    ofh = None
    ofiltered = None

    # State used by the duplicate filter: the keys of all reads seen at the
    # current start position (``lpos``).
    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0

    # All handles are closed in the ``finally`` clause so that an error while
    # filtering does not leak open files.
    try:
        oname = getTempFileName(suffix='.bam')
        if args.filteredOutReads:
            onameFiltered = getTempFileName(suffix='.bam')
        else:
            onameFiltered = None

        ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
        if onameFiltered:
            ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode,
                                            template=fh)

        for read in fh.fetch(chrom, start, end):
            if read.pos < start:
                # ensure that we never double count
                # (in case distanceBetweenBins == 0)
                continue

            total += 1

            if read.flag & 4:
                # Unmapped reads are always rejected
                rejected = True
            elif args.minMappingQuality \
                    and read.mapq < args.minMappingQuality:
                rejected = True
            elif args.samFlagInclude \
                    and (read.flag & args.samFlagInclude) != args.samFlagInclude:
                rejected = True
            elif args.samFlagExclude \
                    and (read.flag & args.samFlagExclude) != 0:
                rejected = True
            else:
                tLen = getTLen(read)
                rejected = bool(
                    (args.minFragmentLength > 0
                     and tLen < args.minFragmentLength) or
                    (args.maxFragmentLength > 0
                     and tLen > args.maxFragmentLength))

            if not rejected and args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment
                # bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                key = (s, e, read.next_reference_id, read.is_reverse)

                if lpos is not None and lpos == read.reference_start \
                        and key in prev_pos:
                    rejected = True
                else:
                    # A new start position invalidates the remembered keys
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add(key)

            if not rejected and args.filterRNAstrand:
                rejected = _failsRNAStrandFilter(read, args.filterRNAstrand)

            if rejected:
                nFiltered += 1
                if ofiltered is not None:
                    ofiltered.write(read)
                continue

            if args.shift:
                read = shiftRead(read, chromDict, args)
                if not read:
                    continue

            # Read survived filtering
            ofh.write(read)

        # The results from the workers will get sorted, so get the TID
        tid = fh.get_tid(chrom)
    finally:
        for handle in (ofh, ofiltered, fh):
            if handle is not None:
                handle.close()

    return tid, start, total, nFiltered, oname, onameFiltered
예제 #13
0
    def writeBedGraph_worker(self,
                             chrom,
                             start,
                             end,
                             func_to_call,
                             func_args,
                             smooth_length=0,
                             bed_regions_list=None):
        r"""Write a bedgraph based on the read coverage found on bamFiles.

        The coverage of every bam file in ``self.bamFilesList`` is computed
        per tile of ``self.binLength`` bp. For each tile, ``func_to_call``
        turns the per-file coverages into a single value. Consecutive tiles
        with the same value are merged into one bedgraph interval, and
        intervals whose value is NaN are not written.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : callable
            Called as ``func_to_call(tile_coverage, func_args)`` to convert
            the list of coverages computed for each bam file at each tile
            into a single value. An example is a function that takes the
            ratio between the coverage of two bam files.
        func_args : dict
            dict of arguments to pass to `func_to_call`.
        smooth_length : int
            Distance in bp for smoothing the coverage per tile. Values
            <= 0 disable smoothing.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            Not used by this method.

        Returns
        -------
        str
            Name of the temporary file with the bedgraph results for the
            region queried.

        Raises
        ------
        NameError
            If ``start`` is bigger than ``end``.

        Example
        -------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> open(tempFile, 'r').readlines()
        ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
        >>> os.remove(tempFile)


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        # One coverage vector per bam file; each handle is closed as soon as
        # its coverage has been read, even if the computation fails.
        coverage = []
        for bam_path in self.bamFilesList:
            bam = bamHandler.openBam(bam_path)
            try:
                coverage.append(
                    self.get_coverage_of_region(bam, chrom, start, end,
                                                self.binLength))
            finally:
                bam.close()

        length_coverage = len(coverage[0])
        previous_value = None

        with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
            tempfilename = _file.name

            for tileIndex in range(length_coverage):
                if smooth_length > 0:
                    # the smoothing window is the same for every bam file
                    vector_start, vector_end = self.getSmoothRange(
                        tileIndex, self.binLength, smooth_length,
                        length_coverage)
                    tileCoverage = [
                        np.mean(bam_coverage[vector_start:vector_end])
                        for bam_coverage in coverage]
                else:
                    tileCoverage = [bam_coverage[tileIndex]
                                    for bam_coverage in coverage]

                value = func_to_call(tileCoverage, func_args)

                # Variable-step output: extend the pending interval while the
                # value stays the same, flush it when the value changes.
                if previous_value is None:
                    writeStart = start + tileIndex * self.binLength
                    writeEnd = min(writeStart + self.binLength, end)
                    previous_value = value

                elif previous_value == value:
                    writeEnd = min(writeEnd + self.binLength, end)

                elif previous_value != value:
                    if not np.isnan(previous_value):
                        _file.write("{}\t{}\t{}\t{:.2f}\n".format(
                            chrom, writeStart, writeEnd, previous_value))
                    previous_value = value
                    writeStart = writeEnd
                    writeEnd = min(writeStart + self.binLength, end)

            # Write the remaining interval if it is not a nan. The test is
            # ``is not None`` (not truthiness) so that a trailing run of
            # zeros is written like any other run.
            # NOTE: this line uses one decimal while the lines above use two;
            # the formats are kept because the doctest pins this output.
            if previous_value is not None and writeStart != end \
                    and not np.isnan(previous_value):
                _file.write("%s\t%d\t%d\t%.1f\n" %
                            (chrom, writeStart, end, previous_value))

        return tempfilename
예제 #14
0
    def writeBedGraph_worker(self, chrom, start, end,
                             func_to_call, func_args,
                             bed_regions_list=None):
        r"""Write a bedgraph based on the read coverage found on bamFiles.

        The per-tile coverage returned by ``self.count_reads_in_region`` is
        converted to one value per tile with ``func_to_call``. Consecutive
        tiles with the same value are merged into one bedgraph interval, and
        intervals whose value is NaN are not written.

        If ``self.smoothLength`` is set and positive, each tile uses the mean
        coverage over the window given by ``self.getSmoothRange``. If
        ``self.skipZeroOverZero`` is set, tiles whose summed coverage is zero
        are left out of the output; such a tile ends the pending interval so
        the following intervals keep their true coordinates.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : callable
            Called as ``func_to_call(tile_coverage, func_args)`` to convert
            the coverages computed for each bam file at each tile into a
            single value. An example is a function that takes the ratio
            between the coverage of two bam files.
        func_args : dict
            dict of arguments to pass to `func_to_call`.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            Not used by this method.

        Returns
        -------
        tuple
            ``(chrom, start, end, tempfilename)`` where the temporary file
            contains the bedgraph results for the region queried.

        Raises
        ------
        NameError
            If ``start`` is bigger than ``end``.

        Examples
        --------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> f = open(tempFile[3], 'r')
        >>> f.readlines()
        ['3R\t0\t100\t0\n', '3R\t100\t200\t1\n']
        >>> f.close()
        >>> os.remove(tempFile[3])


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        coverage, _ = self.count_reads_in_region(chrom, start, end)

        num_tiles = coverage.shape[0]
        use_smoothing = self.smoothLength is not None and self.smoothLength > 0
        previous_value = None
        line_string = "{}\t{}\t{}\t{:g}\n"

        with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
            tempfilename = _file.name

            for tileIndex in range(num_tiles):

                if use_smoothing:
                    vector_start, vector_end = self.getSmoothRange(
                        tileIndex, self.binLength, self.smoothLength,
                        num_tiles)
                    tileCoverage = np.mean(
                        coverage[vector_start:vector_end, :], axis=0)
                else:
                    tileCoverage = coverage[tileIndex, :]

                if self.skipZeroOverZero and np.sum(tileCoverage) == 0:
                    # The skipped tile is a gap in the output: flush the
                    # pending interval and forget it, so that the next tile
                    # starts a new interval at its own coordinates instead
                    # of extending a stale ``writeEnd``.
                    if previous_value is not None \
                            and not np.isnan(previous_value):
                        _file.write(line_string.format(
                            chrom, writeStart, writeEnd, previous_value))
                    previous_value = None
                    continue

                value = func_to_call(tileCoverage, func_args)

                # Variable-step output: extend the pending interval while the
                # value stays the same, flush it when the value changes.
                if previous_value is None:
                    writeStart = start + tileIndex * self.binLength
                    writeEnd = min(writeStart + self.binLength, end)
                    previous_value = value

                elif previous_value == value:
                    writeEnd = min(writeEnd + self.binLength, end)

                elif previous_value != value:
                    if not np.isnan(previous_value):
                        _file.write(line_string.format(
                            chrom, writeStart, writeEnd, previous_value))
                    previous_value = value
                    writeStart = writeEnd
                    writeEnd = min(writeStart + self.binLength, end)

            # write remaining value if not a nan
            if previous_value is not None and writeStart != end \
                    and not np.isnan(previous_value):
                _file.write(line_string.format(chrom, writeStart,
                                               end, previous_value))

        return chrom, start, end, tempfilename