Example #1
def get_num_kept_reads(args, stats):
    """
    Subtracts from the total number of mapped reads in a bamfile
    the proportion of reads that fall into blacklisted regions
    or that are filtered.

    :return: tuple (num_kept_reads, bam_mapped_total)
    """
    if stats is None:
        bam_handle, mapped, unmapped, stats = bamHandler.openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    else:
        bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads = bam_mapped_total - blacklisted
    else:
        num_kept_reads = bam_mapped_total
    ftk = fraction_kept(args, stats)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
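
A minimal sketch of how the helper above might be invoked, assuming deepTools is installed and using an argparse-style namespace; the BAM path and attribute values below are placeholders, and fraction_kept will read further filtering attributes (e.g. minMappingQuality) from the same namespace.

import argparse

# Hypothetical arguments mirroring the attributes accessed above.
args = argparse.Namespace(
    bam="sample.bam",            # placeholder path to an indexed BAM
    numberOfProcessors=4,
    ignoreForNormalization=[],
    blackListFileName=None,
)
# stats=None forces get_num_kept_reads to open the BAM itself.
num_kept, total_mapped = get_num_kept_reads(args, None)
print("kept {} of {} mapped alignments".format(num_kept, total_mapped))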
Example #2
def get_num_kept_reads(args, stats):
    """
    Subtracts from the total number of mapped reads in a bamfile
    the proportion of reads that fall into blacklisted regions
    or that are filtered.

    :return: tuple (num_kept_reads, bam_mapped_total)
    """
    if stats is None:
        bam_handle, mapped, unmapped, stats = bamHandler.openBam(
            args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    else:
        bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle,
                                                 args.ignoreForNormalization,
                                                 stats)
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(
            bam_handle, args.ignoreForNormalization, args.blackListFileName,
            args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total,
                                                  blacklisted))
        num_kept_reads = bam_mapped_total - blacklisted
    else:
        num_kept_reads = bam_mapped_total
    ftk = fraction_kept(args, stats)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
Example #3
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization, args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print "Estimated current coverage {}".format(current_coverage)
            print "Scaling factor {}".format(args.scaleFactor)

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile / \
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print "scale factor using RPKM is {0}".format(args.scaleFactor)

    return scale_factor
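
The normalizeTo1x branch above reduces to scale = normalizeTo1x / (mapped reads * fragment length), where --normalizeTo1x is the effective genome size. A quick standalone check of the arithmetic, with made-up numbers:

bam_mapped = 10_000_000              # hypothetical mapped read count
fragment_length = 200                # bp
effective_genome_size = 142_573_017  # the value passed as --normalizeTo1x

current_coverage = float(bam_mapped * fragment_length) / effective_genome_size
scale_factor = 1.0 / current_coverage
print(round(current_coverage, 2))  # 14.03 -> raw coverage is ~14x
print(round(scale_factor, 4))      # 0.0713 -> multiplier that brings it to 1x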
Example #4
 def __init__(self):
     import os
     self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.mappability = self.root + "mappability.bw"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     bam = bamHandler.openBam(self.bamFile)
     tbit = py2bit.open(self.tbitFile)
     global debug
     debug = 0
     global global_vars
     global_vars = {'2bit': self.tbitFile,
                    'bam': self.bamFile,
                    'filter_out': None,
                    'mappability': self.mappability,
                    'extra_sampling_file': None,
                    'max_reads': 5,
                    'min_reads': 0,
                    'reads_per_bp': 0.3,
                    'total_reads': bam.mapped,
                    'genome_size': sum(tbit.chroms().values())
                    }
Example #6
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if region extends over the chromosome end
        if tbit[chromNameBit].size < i + regionSize:
            break

        try:
            gc = getGC_content(tbit[chromNameBit].get(i, i + regionSize))
        except Exception as detail:
            if verbose:
                print "{}:{}-{}".format(chromNameBit, i, i + regionSize)
                print detail
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
Example #7
 def __init__(self):
     import os
     self.root = os.path.dirname(
         os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.mappability = self.root + "mappability.bw"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     self.samtools = cfg.config.get('external_tools', 'samtools')
     bam = bamHandler.openBam(self.bamFile)
     bit = twobit.TwoBitFile(open(self.tbitFile))
     global debug
     debug = 0
     global global_vars
     global_vars = {
         '2bit': self.tbitFile,
         'bam': self.bamFile,
         'filter_out': None,
         'mappability': self.mappability,
         'extra_sampling_file': None,
         'max_reads': 5,
         'min_reads': 0,
         'reads_per_bp': 0.3,
         'total_reads': bam.mapped,
         'genome_size': sum([bit[x].size for x in bit.index])
     }
Example #8
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by
    (start, end), the GC content is quantified for
    regions of size regionSize that are contiguous
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if region extends over the chromosome end
        if tbit.chroms(chromNameBit) < i + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
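
The worker returns a list of (read count, GC content) pairs; a short sketch of how such output might be summarized with NumPy. The pairs below are fabricated, not real worker output.

import numpy as np

sub_reads_per_gc = [(12, 0.41), (7, 0.38), (20, 0.55), (15, 0.52)]
arr = np.asarray(sub_reads_per_gc, dtype=float)
reads, gc = arr[:, 0], arr[:, 1]
# Compare mean read counts in high-GC vs low-GC regions.
print(reads[gc >= 0.5].mean(), reads[gc < 0.5].mean())  # 17.5 9.5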
Example #9
 def testTabulateGCcontent(self):
     fragmentLength = {'median': 10}
     chrNameBitToBam = {'chr2L': '2L'}
     stepSize = 1
     bam = bamHandler.openBam(global_vars['bam'])
     chromSizes = [(bam.references[i], bam.lengths[i])
                   for i in range(len(bam.references))]
     return (fragmentLength, chrNameBitToBam, stepSize, chromSizes, 1)
Example #10
 def testCountReadsPerGC(self):
     regionSize = 300
     chrNameBitToBam = {'chr2L': '2L'}
     stepSize = 1
     bam = bamHandler.openBam(global_vars['bam'])
     chromSizes = [(bam.references[i], bam.lengths[i])
                   for i in range(len(bam.references))]
     return (regionSize, chrNameBitToBam, stepSize, chromSizes, 1)
Example #11
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Blacklisted regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% or 100,000 alignments,
    whichever is smaller, unless there are fewer than 100,000 alignments, in
    which case everything is sampled).

    The sampling works by dividing the genome into bins and only looking at the
    first 50000 bases. If this doesn't yield sufficient alignments then the bin
    size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle,
                                           args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0,
                               min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx])
                       for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins /= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
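
The sampling target above, max(bam_mapped if bam_mapped <= 100000 else 0, min(100000, 0.01 * bam_mapped)), is easy to misread; a standalone check of how it behaves:

def needed(bam_mapped):
    # Mirrors the expression used in fraction_kept above.
    return max(bam_mapped if bam_mapped <= 100000 else 0,
               min(100000, 0.01 * bam_mapped))

print(needed(50_000))      # 50000 -> fewer than 100k alignments: sample all
print(needed(5_000_000))   # 50000.0 -> 1% of the mapped reads
print(needed(20_000_000))  # 100000 -> capped at 100k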
Example #12
def bam_blacklisted_worker(args):
    bam, chrom, start, end = args
    fh = openBam(bam)
    blacklisted = 0
    for r in fh.fetch(reference=chrom, start=start, end=end):
        if r.reference_start >= start and r.reference_start + r.infer_query_length(always=False) - 1 <= end:
            blacklisted += 1
    fh.close()
    return blacklisted
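
A sketch of driving the worker directly, assuming an indexed, coordinate-sorted BAM; the path and the blacklist interval below are placeholders.

# (bam, chrom, start, end) describes one blacklisted region.
n = bam_blacklisted_worker(("sample.bam", "chr2L", 100000, 105000))
print("{} alignments fall completely inside the interval".format(n))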
Example #14
 def testCountReadsPerGC(self):
     regionSize = 300
     chrNameBitToBam = {'chr2L': '2L'}
     stepSize = 1
     bam = bamHandler.openBam(global_vars['bam'])
     chromSizes = [(bam.references[i], bam.lengths[i])
                   for i in range(len(bam.references))]
     return (regionSize,
             chrNameBitToBam, stepSize, chromSizes, 1)
Example #15
 def testTabulateGCcontent(self):
     fragmentLength = {'median': 10}
     chrNameBitToBam = {'chr2L': '2L'}
     stepSize = 1
     bam = bamHandler.openBam(global_vars['bam'])
     chromSizes = [(bam.references[i], bam.lengths[i])
                   for i in range(len(bam.references))]
     return (fragmentLength,
             chrNameBitToBam, stepSize, chromSizes, 1)
Example #16
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
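
The --samFlagInclude test above keeps a read only when every bit of the mask is set, while --samFlagExclude drops a read when any bit is set. A tiny standalone illustration of those bitwise checks:

def keep(flag, include=0, exclude=0):
    # include: all mask bits must be present; exclude: none may be present.
    if include and flag & include != include:
        return False
    if exclude and flag & exclude != 0:
        return False
    return True

print(keep(99, include=2))   # True: flag 99 has the 'proper pair' bit (0x2)
print(keep(99, exclude=4))   # True: the 'unmapped' bit (0x4) is not set
print(keep(4, exclude=4))    # False: the read is unmapped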
Example #17
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Blacklisted regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% or 100,000 alignments,
    whichever is smaller, unless there are fewer than 100,000 alignments, in
    which case everything is sampled).

    The sampling works by dividing the genome into bins and only looking at the
    first 50000 bases. If this doesn't yield sufficient alignments then the bin
    size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0, min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins /= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
Example #18
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)
    olist = []
    for f in args.bamfiles:
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            if args.minFragmentLength > 0 and abs(read.template_length) < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and abs(read.template_length) > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features
Example #19
 def getRead(self, readType):
     """ prepare arguments for test
     """
     bam = bamHandler.openBam(self.bamFile_PE)
     if readType == 'paired-reverse':
         read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0]
     elif readType == 'single-forward':
         read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0]
     elif readType == 'single-reverse':
         read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0]
     else:  # by default a forward paired read is returned
         read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0]
     return read
Example #21
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where first column is fragment length, the
        second is for read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([
            (abs(r.template_length), r.infer_query_length(always=False))
            for r in bam.fetch(chrom, start, end)
            if r.is_proper_pair and r.is_read1 and not r.is_unmapped
        ])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired; then
            # we try without filtering
            reads = np.array([(abs(r.template_length),
                               r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end)
                              if not r.is_unmapped])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
Example #22
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where first column is fragment length, the
        second is for read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired; then
            # we try without filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end)])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
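
Callers typically stack the per-region arrays returned by this worker and take column medians; a hedged sketch with fabricated numbers:

import numpy as np

# Two hypothetical per-region results: columns are (fragment length, read length).
chunk_a = np.array([[310, 100], [295, 100]])
chunk_b = np.array([[305, 100]])
all_reads = np.concatenate([chunk_a, chunk_b], axis=0)
print(np.median(all_reads, axis=0))  # [305. 100.] -> median fragment/read length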
Example #23
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("%s regions of size %s were used " %
                      (scalefactors_dict['sites_sampled'],
                       args.sampleLength))

                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(
                    float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value given: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(int(read_len_dict['median']))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print "Estimated current coverage {}".format(current_coverage)
                    print "Scale factor to convert " \
                          "current coverage to 1: {}".format(coverage_scale_factor)
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print "scale factor for   "
                    "RPKM is {0}".format(coverage_scale_factor)

    return scale_factors
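
The RPKM branch above reduces to 1 / (reads in millions * bin length in kb); a quick numeric check with made-up values:

bam_mapped = 25_000_000   # hypothetical mapped reads
bin_size = 50             # bp

million_reads_mapped = bam_mapped / 1e6      # 25.0
tile_len_in_kb = bin_size / 1000.0           # 0.05
scale = 1.0 / (million_reads_mapped * tile_len_in_kb)
print(scale)  # 0.8 -> a bin holding 10 reads reports 10 * 0.8 = 8.0 RPKM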
Example #24
    def run(self, allArgs=None):
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(
            bamFilesHandles, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy  in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        if self.bedFile is None:
            chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize,
                                              chromsizes, chrLengths)
        else:
            chunkSize = None

        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
            allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce(
            [],
            countReadsInRegions_wrapper,
            chromsizes,
            self_=self,
            genomeChunkLength=chunkSize,
            bedFile=self.bedFile,
            blackListFileName=self.blackListFileName,
            region=self.region,
            numberOfProcessors=self.numberOfProcessors,
            transcriptID=transcriptID,
            exonID=exonID,
            keepExons=keepExons,
            transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write(
                    "*Warning*\nThe resulting bed file does not contain information for "
                    "the chromosomes that were not common between the bigwig files\n"
                )

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res],
                                               axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit(
                    '\nNo coverage values could be computed.\n\n'
                    'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                    'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit(
                    '\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                    'contain mapped reads.')
Example #25
def openBam(bamFile, bamIndex=None):
    return bamHandler.openBam(bamFile, bamIndex)
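
This thin wrapper just forwards to deepTools' bamHandler, so the returned handle behaves like a pysam.AlignmentFile; the path below is a placeholder and assumes an index file sits next to the BAM.

fh = openBam("sample.bam")
print(fh.mapped, list(fh.references)[:3])
fh.close()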
Example #26
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use a poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high,
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
Пример #27
0
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
Example #28
def writeBedGraph_worker(
    chrom,
    start,
    end,
    tileSize,
    defaultFragmentLength,
    bamOrBwFileList,
    func,
    funcArgs,
    extendPairedEnds=True,
    smoothLength=0,
    missingDataAsZero=False,
    fixed_step=False,
):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs

    tileSize
    """
    if start > end:
        raise NameError("start position ({0}) bigger than " "end position ({1})".format(start, end))

    coverage = []

    for indexFile, fileFormat in bamOrBwFileList:
        if fileFormat == "bam":
            bamHandle = bamHandler.openBam(indexFile)
            coverage.append(
                getCoverageFromBam(
                    bamHandle, chrom, start, end, tileSize, defaultFragmentLength, extendPairedEnds, True
                )
            )
            bamHandle.close()
        elif fileFormat == "bigwig":
            bigwigHandle = pyBigWig.open(indexFile)
            coverage.append(getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize, missingDataAsZero))
            bigwigHandle.close()

    # is /dev/shm available?
    # working in this directory speeds the process
    try:
        _file = tempfile.NamedTemporaryFile(dir="/dev/shm", delete=False)
    except OSError:
        _file = tempfile.NamedTemporaryFile(delete=False)

    previousValue = None
    lengthCoverage = len(coverage[0])
    for tileIndex in range(lengthCoverage):

        tileCoverage = []
        for index in range(len(bamOrBwFileList)):
            if smoothLength > 0:
                vectorStart, vectorEnd = getSmoothRange(tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage.append(np.mean(coverage[index][vectorStart:vectorEnd]))
            else:
                try:
                    tileCoverage.append(coverage[index][tileIndex])
                except IndexError:
                    print "Chromosome {} probably not in one of the bigwig " "files. Remove this chromosome from the bigwig file " "to continue".format(
                        chrom
                    )
                    exit(0)

        #        if  zerosToNans == True and sum(tileCoverage) == 0.0:
        #            continue

        value = func(tileCoverage, funcArgs)

        if fixed_step:
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            try:
                _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, writeEnd, value))
            except TypeError:
                _file.write("{}\t{}\t{}\t{}\n".format(chrom, writeStart, writeEnd, value))
        else:
            if previousValue is None:
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                writeEnd = min(writeEnd + tileSize, end)

            elif previousValue != value:
                if not np.isnan(previousValue):
                    _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

    if not fixed_step:
        # write remaining value if not a nan
        if previousValue and writeStart != end and not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart, end, previousValue))

    #        """
    tempFileName = _file.name
    _file.close()
    return tempFileName
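
In the variable-step branch above, consecutive tiles with equal values are merged into a single interval before being written. A compact standalone version of that run-length logic (no temp files), with hypothetical inputs:

def merge_runs(start, tile_size, values):
    # Return (start, end, value) tuples with adjacent equal values merged.
    out = []
    for i, v in enumerate(values):
        s = start + i * tile_size
        if out and out[-1][2] == v:
            out[-1] = (out[-1][0], s + tile_size, v)  # extend the open run
        else:
            out.append((s, s + tile_size, v))
    return out

print(merge_runs(0, 10, [1.0, 1.0, 2.0, 2.0, 2.0, 0.5]))
# [(0, 20, 1.0), (20, 50, 2.0), (50, 60, 0.5)]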
Example #29
def estimateScaleFactor(bamFilesList,
                        binLength,
                        numberOfSamples,
                        defaultFragmentLength,
                        normalizationLength,
                        avg_method='median',
                        numberOfProcessors=1,
                        verbose=False,
                        chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.


    The arguments are:
         'bamFilesList', list of bam files to normalize
         'binLength', the window size in bp, where reads are going to be
                         counted.
         'numberOfSamples', Number of sites to sample.

         'defaultFragmentLength', if the reads are not paired, this value
                      is used to extend the reads.
         'normalizationLength', length, in bp, to normalize the data.
                        For a value of 1, scale factors are given such that on
                        average 1 fragment per base pair is found
         'avg_method', defines how the different values are to be summarized.
                       The options are 'mean' and 'median'

         'chrsToSkip', name of the chromosomes to be excluded from the
                       scale estimation. Usually chrX is included here.

    For example, to test about 1 million regions of length 500 bp,
    the binLength will be 500 and the numberOfSamples is going
    to be the size of the genome divided by 1 million. This number
    is not exact because regions in which all counts
    are 0 are not taken into account

    The test data contains reads for 200 bp
    >>> test = Tester()

    >>> dict = estimateScaleFactor([test.bamFile1, test.bamFile2], 50, 4, 0, 1)
    >>> dict['size_factors']
    array([ 1. ,  0.5])
    >>> dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    if len(bamFilesList) > 2:
        raise NameError("SES scale factors are only defined for 2 files")

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    num_reads_per_bin = getNumReadsPerBin(
        bamFilesList,
        binLength,
        numberOfSamples,
        defaultFragmentLength,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose,
        chrsToSkip=chrsToSkip)

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    #    np.savetxt("/home/ramirez/tmp/test.num_reads", num_reads_per_bin)
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the  arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while (maxIndex < len(p)):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [
        np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
        np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])
    ]

    # the maxIndex may be too close to the signal regions,
    # so a more conservative approach is taken by choosing a lower rank

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90th
    # percentile to estimate the
    # mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = (np.percentile(values, 90))
        if maxNumReads == 0:
            maxNumReads = (np.percentile(values, 99))
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    sizeFactor = sizeFactorsSES

    return {
        'size_factors': sizeFactor,
        'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
        'size_factors_SES': sizeFactorsSES,
        'size_factors_based_on_mean': mean.min() / mean,
        'size_factors_based_on_median': median.min() / median,
        'mean': mean,
        'meanSES': meanSES,
        'median': median,
        'reads_per_bin': readsPerBin,
        'std': std,
        'sites_sampled': sitesSampled
    }
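
The SES factor above is derived from the normalized cumulative sums of the sorted per-bin counts; a toy reproduction of that core computation with fabricated counts (the zero-value recovery loop from the full function is omitted):

import numpy as np

# Fabricated per-bin counts: row 0 = ChIP, row 1 = input.
counts = np.array([[0, 1, 2, 8, 40],
                   [1, 2, 2, 3, 4]], dtype=float)

p = np.sort(counts[0]).cumsum()
q = np.sort(counts[1]).cumsum()
diff = np.abs(p / p[-1] - q / q[-1])
maxIndex = int(np.flatnonzero(diff == diff.max())[0] * 0.8)
cumSum = np.array([p[maxIndex], q[maxIndex]])
print(cumSum.min() / cumSum)  # [1.  0.6] -> size factors; smaller library -> 1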
Example #30
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If no BED file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array with one row per bin
            and one column per BAM file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[ 0.,  0.],
               [ 0.,  1.],
               [ 1.,  1.],
               [ 1.,  2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handlers = []
        for fname in self.bamFilesList:
            try:
                bam_handlers.append(bamHandler.openBam(fname))
            except:
                bam_handlers.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, startPos + exon[2]))
                            _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
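
The tiling rule described in the docstring above (adjacent bins when stepSize equals binLength, overlapping bins when it is smaller) can be isolated into a few lines. A standalone sketch, not part of the class:

def tile_region(start, end, step_size, bin_length):
    """Yield (bin_start, bin_end) tuples, mirroring the loop above:
    bins overlap when step_size < bin_length, sit next to each other
    when equal, and leave gaps when step_size is larger."""
    for i in range(start, end, step_size):
        if i + bin_length > end:
            break
        yield (i, i + bin_length)

print(list(tile_region(0, 100, 50, 25)))  # gapped: [(0, 25), (50, 75)]
print(list(tile_region(0, 100, 25, 25)))  # adjacent bins
print(list(tile_region(0, 100, 10, 25)))  # overlapping bins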
Example #31
    def run(self, allArgs=None):
        # Try to determine an optimal fraction of the genome (chunkSize) that is
        # sent to workers for analysis. If the chunks are too short, too much time
        # is spent loading the files; if too long, some processors end up idle.
        # The following values are empirical.
        bamFilesHandlers = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except Exception:
                # not a BAM file; fall back to opening it as a bigWig
                y = pyBigWig.open(x)
            bamFilesHandlers.append(y)
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # Skip the chromosomes in the given list. This is usually done for the
        # X chromosome, which may be present in a single copy in a male sample
        # or in a male/female mixture, and is therefore unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA.
        if len(self.chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromSizes))

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # a larger number of samples gives a better estimate
        if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = []
        for x in bamFilesHandlers:
            try:
                max_mapped.append(x.mapped)
            except AttributeError:
                # bigWig input has no mapped-read count; use a fixed value
                max_mapped.append(0)
        max_mapped = max(max_mapped)

        # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
        if max_mapped == 0:
            chunkSize = 10000 * self.binLength
            self.stepSize = self.binLength
        else:
            reads_per_bp = float(max_mapped) / genomeSize
            chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
        for bam_h in bamFilesHandlers:
            bam_h.close()

        # Ensure that chunkSize is always at least self.stepSize
        if chunkSize < self.stepSize:
            chunkSize = self.stepSize

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromSizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found in the BAM files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
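
The empirical chunkSize rule in run() is easier to see in isolation: chunks shrink as coverage or the number of input files grows, with a fixed fallback when no mapped-read count is available (bigWig input). A simplified sketch with a hypothetical helper name; the original uses binLength rather than stepSize in the fallback branch:

def genome_chunk_size(step_size, max_mapped, genome_size, n_files):
    if max_mapped == 0:
        # bigWig input provides no read counts; fall back to a fixed multiple
        return 10000 * step_size
    reads_per_bp = float(max_mapped) / genome_size
    # never hand out chunks smaller than the step size
    return max(int(step_size * 1e3 / (reads_per_bp * n_files)), step_size)

# e.g. 1e8 mapped reads on a 3e9 bp genome, two files, step size 1000:
print(genome_chunk_size(1000, int(1e8), int(3e9), 2))  # 15000000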
Example #32
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0
        fh = openBam(f)

        chrom = mungeChromosome(chrom, fh.references)

        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(
                chrom,
                getBAMBlocks(read, defaultFragmentLength, args.centerReads,
                             args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
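
The --samFlagInclude/--samFlagExclude tests above follow the standard SAM bitmask convention: include requires all requested bits to be set, exclude rejects on any set bit. A self-contained sketch of just that check (the helper name is illustrative):

def passes_flag_filters(flag, sam_flag_include=0, sam_flag_exclude=0):
    # keep the read only if every bit in sam_flag_include is set
    if sam_flag_include and flag & sam_flag_include != sam_flag_include:
        return False
    # drop the read if any bit in sam_flag_exclude is set
    if sam_flag_exclude and flag & sam_flag_exclude != 0:
        return False
    return True

# keep properly paired reads (0x2), drop unmapped ones (0x4):
print(passes_flag_filters(0x3, sam_flag_include=0x2, sam_flag_exclude=0x4))  # True
print(passes_flag_filters(0x6, sam_flag_include=0x2, sam_flag_exclude=0x4))  # False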
Example #33
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability of finding more than one read (a redundant
    # read) at a given position, based on the GC content of the read
    # fragment; the binomial distribution is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 /
                  N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile,
                                           returnStats=True,
                                           nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome into fragments containing about 4e5 reads each.
    # This number of reads takes about 20 seconds per core to process
    # (measured on a machine with 48 cores and 256 GB of memory).
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for ")
                "chromosome {} in 2bit file".format(chrom)
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
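
The max_dup_gc computation near the top of main() sets, per GC value, the largest read count at a single position that is still plausible by chance. A minimal sketch of the same use of the binomial survival function, with toy numbers and a hypothetical helper name:

from scipy.stats import binom

def max_dup_threshold(f_gc, n_gc, p_value=1e-7):
    # f_gc reads distributed over n_gc candidate positions: how many reads
    # at one position would occur by chance with probability < p_value?
    if f_gc > 0 and n_gc > 0:
        return binom.isf(p_value, f_gc, 1.0 / n_gc)
    return 1

print(max_dup_threshold(100000, 50000))  # duplicate threshold for this GC class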
Example #34
def getFiltered_worker(arglist):
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        prev_pos = set()
        lpos = None

        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
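
The paired-end branch of the filterRNAstrand test above hinges on two flag masks: 144 (second in pair + reverse) and 96 (first in pair + mate reverse). A standalone sketch of that decision (matches_strand is illustrative, not the deeptools API):

def matches_strand(flag, strand):
    # 144 = read2 (128) + reverse (16); 96 = read1 (64) + mate reverse (32)
    if strand == 'forward':
        return flag & 144 == 128 or flag & 96 == 64
    if strand == 'reverse':
        return flag & 144 == 144 or flag & 96 == 96
    return True

print(matches_strand(128, 'forward'))  # read2 on the forward strand -> True
print(matches_strand(144, 'forward'))  # read2 on the reverse strand -> False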
Example #35
def main(args=None):
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = []
        for bh in bhs:
            blacklisted.append(utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors))
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in list(zip(mapped, unmappedList))]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    totals = [0] * len(args.bamfiles)
    nFiltered = [0] * len(args.bamfiles)
    MAPQs = [0] * len(args.bamfiles)
    flagIncludes = [0] * len(args.bamfiles)
    flagExcludes = [0] * len(args.bamfiles)
    internalDupes = [0] * len(args.bamfiles)
    externalDupes = [0] * len(args.bamfiles)
    singletons = [0] * len(args.bamfiles)
    rnaStrand = [0] * len(args.bamfiles)
    for x in res:
        for idx, r in enumerate(x):
            totals[idx] += r[0]
            nFiltered[idx] += r[1]
            MAPQs[idx] += r[2]
            flagIncludes[idx] += r[3]
            flagExcludes[idx] += r[4]
            internalDupes[idx] += r[5]
            externalDupes[idx] += r[6]
            singletons[idx] += r[7]
            rnaStrand[idx] += r[8]

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))
        # nFiltered
        metric = 0.0
        if totals[idx] > 0:
            metric = blacklisted[idx] + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # MAPQ
        metric = 0.0
        if totals[idx] > 0:
            metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagInclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagIncludes[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagExclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagExcludes[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Internally determined duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(internalDupes[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Externally marked duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(externalDupes[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Singletons
        metric = 0.0
        if totals[idx] > 0:
            metric = float(singletons[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # filterRNAstrand
        metric = 0.0
        if totals[idx] > 0:
            metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
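
The repeated metric arithmetic in the output loop above extrapolates the filtering rate observed in the sampled bins to all mapped reads and caps the estimate at the mapped total. A compact sketch with hypothetical counts (the helper name is illustrative):

def projected_filtered(n_filtered, n_sampled, mapped):
    # scale the sampled filtering rate up to the full mapped-read count,
    # never exceeding the number of mapped reads
    if n_sampled == 0:
        return 0.0
    return min(round(float(n_filtered) / n_sampled * mapped, 1), mapped)

print(projected_filtered(250, 10000, 2000000))  # 50000.0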
Example #36
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False):
    r"""
    Given a list of bamfiles, a function, and the function's arguments,
    this function writes a bedgraph (or bigwig) file for a partition of
    the genome into tiles of the given size, with a value for each tile
    that corresponds to the given function applied to the coverage
    underlying the tile.

    """

    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName
                   for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" \
                            "Chromosome {} length reported in the " \
                            "bigwig files differ.\n{} for {}\n" \
                            "{} for {}.\n\nThe smallest " \
                            "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bw)
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
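
The bigWig branch above reconciles chromosome sets across files: only chromosomes present in every file are kept, and the smallest reported length wins on conflict. A simplified standalone version of that reconciliation (the helper name is illustrative):

def common_chrom_sizes(per_file_sizes):
    """per_file_sizes: one dict per file, mapping chromosome -> length."""
    common = set(per_file_sizes[0])
    for sizes in per_file_sizes[1:]:
        common &= set(sizes)
    # on disagreement, keep the smallest length, as the code above does
    return [(c, min(sizes[c] for sizes in per_file_sizes))
            for c in sorted(common)]

print(common_chrom_sizes([{'chr1': 1000, 'chr2': 500},
                          {'chr1': 990, 'chr3': 700}]))  # [('chr1', 990)]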
Example #37
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(
            args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(
            args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit(
                "*Error*: For the --MNAse function a paired end library is required. "
            )

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            verbose=args.verbose,
        )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit(
                    "*Error*: The right-side bound is less than the left-side bound. This is inappropriate."
                )
        else:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            verbose=args.verbose,
        )

    wr.run(writeBedGraph.scaleCoverage,
           func_args,
           args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat,
           smoothLength=args.smoothLength)
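
The --Offset validation above treats offsets as 1-based positions within each alignment, with negative values counted from the 3' end (so [1, -1] spans the whole read). A rough illustration of such semantics; offset_slice is a hypothetical helper, not the deeptools implementation:

def offset_slice(read_positions, offset):
    # 1-based left bound; negative values count from the end of the read
    left = offset[0] - 1 if offset[0] > 0 else len(read_positions) + offset[0]
    if len(offset) == 1:
        return read_positions[left:left + 1]
    right = offset[1] if offset[1] > 0 else len(read_positions) + offset[1] + 1
    return read_positions[left:right]

positions = list(range(100, 110))        # ten aligned positions
print(offset_slice(positions, [1, -1]))  # the whole read
print(offset_slice(positions, [5, 5]))   # only the fifth base: [104]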
Example #38
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
Example #39
def getFractionKept_worker(chrom, start, end, bamFile, args):
    """
    Queries the BAM file and counts the number of alignments kept/found in the
    first 50000 bases.
    """
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    tot = 0
    filtered = 0
    prev_start_pos = None  # to store the start positions
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            tot += 1
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                filtered += 1
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
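
The fragment-length gate above is worth isolating: a bound of 0 disables that side of the filter. A small sketch in which abs(tlen) stands in for utilities.getTLen (assumed here to return a non-negative template length), with an illustrative helper name:

def passes_fragment_length(tlen, min_len=0, max_len=0):
    tlen = abs(tlen)  # stand-in for utilities.getTLen
    if min_len > 0 and tlen < min_len:
        return False
    if max_len > 0 and tlen > max_len:
        return False
    return True

print(passes_fragment_length(-180, min_len=130, max_len=200))  # True
print(passes_fragment_length(250, max_len=200))                # False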
Example #40
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise ValueError("start %d is larger than end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO:
    # if the sample regions are far apart from each other, it is faster
    # to go to each location and fetch the reads found there.
    # Otherwise, if the regions to sample are close to each other, it is
    # faster to load all the reads of a large region into memory and
    # consider only those falling into the positions to sample.
    # The following code handles the case where the sampling positions
    # lie close together.
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    c = 1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / (endTime - countTime),
                       chromNameBit, start, end, stepSize))
        c += 1

    if verbose:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
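
The subN_gc/subF_gc bookkeeping above indexes both arrays by GC count: N_gc records how often each GC value occurs among the sampled fragments and F_gc how many reads start at those positions. A toy rerun of that tallying, reusing the GC values from the docstring with made-up read counts:

import numpy as np

fragment_gc = [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]   # GC count per sampled position
reads_at_pos = [1, 0, 0, 1, 0, 1, 0, 0, 2, 1]  # hypothetical reads starting there

N_gc = np.zeros(4, dtype=int)
F_gc = np.zeros(4, dtype=int)
for gc, n_reads in zip(fragment_gc, reads_at_pos):
    N_gc[gc] += 1        # one more sampled fragment with this GC count
    F_gc[gc] += n_reads  # reads observed at a position with this GC count

print(N_gc)  # [0 4 5 1]
print(F_gc)  # [0 2 4 0]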
Example #41
    def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
        r"""
        Given a list of bamfiles, a function, and the function's arguments,
        this method writes a bedgraph (or bigwig) file for a partition of
        the genome into tiles of the given size, with a value for each tile
        that corresponds to the given function applied to the coverage
        underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in self.__dict__.keys():
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                shutil.copyfileobj(open(tempfilename, 'rb'), out_file)
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            os.rename(bedgraph_file, out_file_name)
            if self.verbose:
                print "output file: {}".format(out_file_name)
        else:
            bedGraphToBigWig(
                chrom_names_and_size, bedgraph_file, out_file_name, True)
            if self.verbose:
                print "output file: {}".format(out_file_name)
            os.remove(bedgraph_file)
Example #42
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster;
    # r.flag & 4 == 0 skips unmapped reads that are
    # nevertheless assigned to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            print(detail)
            """ this exception happens when the end of a
            chromosome is reached """
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # getFragmentFromRead() returns None in some cases;
            # those cases are skipped, hence the continue.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) ")
            "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                      i, i / (endTime - startTime),
                                      chrNameBit, start, end)
    except NameError:
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
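
The heart of the correction above is the increment by 1 / R_gc[gc]: fragments from over-represented GC classes (R_gc > 1) contribute less than one unit of coverage, under-represented ones more. A toy illustration with made-up ratios:

import numpy as np

R_gc = np.array([0.5, 1.0, 2.0])  # toy observed/expected ratios per GC class
coverage = np.zeros(10)

# (fragment start, fragment end, GC class index)
fragments = [(0, 4, 2), (2, 6, 1), (5, 9, 0)]
for frag_start, frag_end, gc in fragments:
    coverage[frag_start:frag_end] += 1.0 / R_gc[gc]

print(coverage)  # down-weighted where R_gc > 1, up-weighted where R_gc < 1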
Example #43
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed,
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # the mate is not present, most likely because it
            # was removed by some filtering step
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        """
        outfile.write(read)
        """
        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
Example #44
def estimateScaleFactor(bamFilesList,
                        binLength,
                        numberOfSamples,
                        normalizationLength,
                        avg_method='median',
                        blackListFileName=None,
                        numberOfProcessors=1,
                        verbose=False,
                        chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collects and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        names of the chromosomes to be excluded from the
        scale estimation. chrX is commonly included in this list.
    blackListFileName : str
        BED file containing blacklisted regions

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples,  1)
    >>> _dict['size_factors']
    array([ 1. ,  0.5])
    >>> _dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """

    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [
        np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
        np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])
    ]

    # the maxIndex may be too close to the signal regions, which is
    # why a more conservative (lower) rank was chosen above

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90th
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit(
            "\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
            "Try selecting a larger sample size or a region with coverage\n".
            format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {
        'size_factors': sizeFactor,
        'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
        'size_factors_SES': sizeFactorsSES,
        'size_factors_based_on_mean': mean.min() / mean,
        'size_factors_based_on_median': median.min() / median,
        'mean': mean,
        'meanSES': meanSES,
        'median': median,
        'reads_per_bin': readsPerBin,
        'std': std,
        'sites_sampled': sitesSampled
    }
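The SES computation above reduces to a few array operations; here is a toy run with invented bin counts (variable names mirror the function, the numbers do not come from any real sample):

import numpy as np

chip = np.array([0, 1, 1, 2, 8, 20], dtype=float)  # invented ChIP counts per bin
inp = np.array([1, 1, 2, 2, 3, 4], dtype=float)    # invented input counts per bin

p = np.sort(chip).cumsum()
q = np.sort(inp).cumsum()
diff = np.abs(p / p[-1] - q / q[-1])
maxIndex = int(np.flatnonzero(diff == diff.max())[0] * 0.8)

cumSum = np.array([p[maxIndex], q[maxIndex]])
print(cumSum.min() / cumSum)  # -> [1.  0.5], the SES size factors

The factor pair scales the sample with the larger background cumulative sum down to the one with the smaller sum.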
Example #45
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome into fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 GB memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for ")
                "chromosome {} in 2bit file".format(chrom)
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
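The max_dup_gc table built near the top of main caps how many identical reads are tolerated per position: for each GC class it takes the read count whose upper-tail probability drops below 1e-7 under a binomial model of reads landing on positions of that GC content. A self-contained illustration with invented counts:

from scipy.stats import binom

F_gc = 50000    # invented: number of reads with this fragment GC content
N_gc = 2.0e6    # invented: number of genomic positions with this GC content

# reads per position beyond this threshold are treated as PCR duplicates
max_dup = binom.isf(1e-7, F_gc, 1.0 / N_gc)
print(max_dup)

binom.isf(q, n, p) is the inverse survival function: the count at which the upper-tail probability first falls below q.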
Example #46
    def __init__(self,
                 bamFilesList,
                 binLength=50,
                 numberOfSamples=None,
                 numberOfProcessors=1,
                 verbose=False,
                 region=None,
                 bedFile=None,
                 extendReads=False,
                 blackListFileName=None,
                 minMappingQuality=None,
                 ignoreDuplicates=False,
                 chrsToSkip=[],
                 stepSize=None,
                 center_read=False,
                 samFlag_include=None,
                 samFlag_exclude=None,
                 zerosToNans=False,
                 skipZeroOverZero=False,
                 smoothLength=0,
                 minFragmentLength=0,
                 maxFragmentLength=0,
                 out_file_for_raw_data=None,
                 statsList=[],
                 mappedList=[]):

        self.bamFilesList = bamFilesList
        self.binLength = binLength
        self.numberOfSamples = numberOfSamples
        self.blackListFileName = blackListFileName
        self.statsList = statsList
        self.mappedList = mappedList
        self.skipZeroOverZero = skipZeroOverZero

        if extendReads and len(bamFilesList):
            from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
            frag_len_dict, read_len_dict = get_read_and_fragment_length(
                bamFilesList[0],
                return_lengths=False,
                blackListFileName=blackListFileName,
                numberOfProcessors=numberOfProcessors,
                verbose=verbose)
            if extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    self.defaultFragmentLength = int(frag_len_dict['median'])
                else:
                    exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if verbose:
                    print(
                        ("Fragment length based on paired en data "
                         "estimated to be {}".format(frag_len_dict['median'])))

            elif extendReads < read_len_dict['median']:
                sys.stderr.write(
                    "*WARNING*: read extension is smaller than read length (read length = {}). "
                    "Reads will not be extended.\n".format(
                        int(read_len_dict['median'])))
                self.defaultFragmentLength = 'read length'

            elif extendReads > 2000:
                exit(
                    "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                    .format(extendReads))
            else:
                self.defaultFragmentLength = int(extendReads)

        else:
            self.defaultFragmentLength = 'read length'

        self.numberOfProcessors = numberOfProcessors
        self.verbose = verbose
        self.region = region
        self.bedFile = bedFile
        self.minMappingQuality = minMappingQuality
        self.ignoreDuplicates = ignoreDuplicates
        self.chrsToSkip = chrsToSkip
        self.stepSize = stepSize
        self.center_read = center_read
        self.samFlag_include = samFlag_include
        self.samFlag_exclude = samFlag_exclude
        self.minFragmentLength = minFragmentLength
        self.maxFragmentLength = maxFragmentLength
        self.zerosToNans = zerosToNans
        self.smoothLength = smoothLength

        if out_file_for_raw_data:
            self.save_data = True
            self.out_file_for_raw_data = out_file_for_raw_data
        else:
            self.save_data = False
            self.out_file_for_raw_data = None

        # check that either numberOfSamples, stepSize or bedFile is set
        if numberOfSamples is None and stepSize is None and bedFile is None:
            raise ValueError(
                "either stepSize, numberOfSamples or bedFile have to be set")

        if self.defaultFragmentLength != 'read length':
            self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
        else:
            self.maxPairedFragmentLength = 1000
        if self.maxFragmentLength > 0:
            self.maxPairedFragmentLength = self.maxFragmentLength

        if len(self.mappedList) == 0:
            try:
                for fname in self.bamFilesList:
                    bam, mapped, unmapped, stats = bamHandler.openBam(
                        fname,
                        returnStats=True,
                        nThreads=self.numberOfProcessors)
                    self.mappedList.append(mapped)
                    self.statsList.append(stats)
                    bam.close()
            except Exception:
                self.mappedList = []
                self.statsList = []
Example #47
def getFiltered_worker(arglist):
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        prev_pos = set()
        lpos = None

        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
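                # SAM flag bits used below: 16 = read on reverse strand,
                # 32 = mate on reverse strand, 64 = first in pair,
                # 128 = second in pair. "flag & 144 == 128" therefore
                # selects a forward second-in-pair read, and
                # "flag & 96 == 64" a first-in-pair read whose mate is
                # on the forward strand (the dUTP library layout).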
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
               internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
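The ignoreDuplicates branch above keys each read on its fragment bounds, mate chromosome and strand, clearing the cache whenever the leftmost position advances. A stripped-down sketch of that bookkeeping with invented read tuples (no pysam involved; the negative-tlen case is simplified relative to the worker):

reads = [(100, 250, 0, False),
         (100, 250, 0, False),   # exact repeat -> duplicate
         (100, 250, 0, True),    # opposite strand -> kept
         (180, 250, 0, False)]   # new position -> cache cleared

prev_pos = set()
lpos = None
duplicates = 0
for pos, tlen, mate_refid, is_reverse in reads:
    s = pos if tlen >= 0 else pos + tlen
    key = (s, s + abs(tlen), mate_refid, is_reverse)
    if lpos == pos and key in prev_pos:
        duplicates += 1
    if lpos != pos:
        prev_pos.clear()
    lpos = pos
    prev_pos.add(key)
print(duplicates)  # -> 1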
Example #48
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of lists of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If no bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array with one row per bin
            and one column per bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except Exception:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(
                            chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'),
                         'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(
            -1, len(self.bamFilesList), order='F')

        if self.save_data:
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(
                        ["{}".format(x)
                         for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(
                                chrom, startPos, startPos + exon[2]))
                            _file.write("\t".join([
                                "{}".format(x)
                                for x in subnum_reads_per_bin[idx, :]
                            ]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name, rows, rows /
                   (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
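The stepSize/binLength relationship described in the docstring is easiest to see by generating the bin coordinates directly; a small sketch with illustrative numbers:

start, end, binLength, stepSize = 0, 100, 20, 10

# stepSize < binLength yields overlapping bins, exactly as the loop
# over transcriptsToConsider builds them above
bins = [(i, i + binLength) for i in range(start, end, stepSize)
        if i + binLength <= end]
print(bins)  # [(0, 20), (10, 30), ..., (80, 100)]: each bin overlaps the next by 10 bp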
Example #49
def main(args=None):
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write(
            "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n"
        )
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [
        bamHandler.openBam(x,
                           returnStats=True,
                           nThreads=args.numberOfProcessors)
        for x in args.bamfiles
    ]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = []
        for bh in bhs:
            blacklisted.append(
                utilities.bam_blacklisted_reads(bh, None,
                                                args.blackListFileName,
                                                args.numberOfProcessors))
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in list(zip(mapped, unmappedList))]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    totals = [0] * len(args.bamfiles)
    nFiltered = [0] * len(args.bamfiles)
    MAPQs = [0] * len(args.bamfiles)
    flagIncludes = [0] * len(args.bamfiles)
    flagExcludes = [0] * len(args.bamfiles)
    internalDupes = [0] * len(args.bamfiles)
    externalDupes = [0] * len(args.bamfiles)
    singletons = [0] * len(args.bamfiles)
    rnaStrand = [0] * len(args.bamfiles)
    for x in res:
        for idx, r in enumerate(x):
            totals[idx] += r[0]
            nFiltered[idx] += r[1]
            MAPQs[idx] += r[2]
            flagIncludes[idx] += r[3]
            flagExcludes[idx] += r[4]
            internalDupes[idx] += r[5]
            externalDupes[idx] += r[6]
            singletons[idx] += r[7]
            rnaStrand[idx] += r[8]

    # Print some output
    of.write(
        "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n"
    )
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx],
                                       blacklisted[idx]))
        # nFiltered
        metric = 0.0
        if totals[idx] > 0:
            metric = blacklisted[idx] + float(nFiltered[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # MAPQ
        metric = 0.0
        if totals[idx] > 0:
            metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagInclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagIncludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagExclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagExcludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Internally determined duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(internalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Externally marked duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(externalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Singletons
        metric = 0.0
        if totals[idx] > 0:
            metric = float(singletons[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # filterRNAstrand
        metric = 0.0
        if totals[idx] > 0:
            metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
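Each per-filter column written above extrapolates from the sampled bins to the whole file: the fraction of sampled alignments that a filter would remove is applied to the total mapped count, with blacklisted alignments added to the overall estimate. A worked toy calculation with invented tallies:

mapped = 1000000         # mapped reads in the BAM file
blacklisted = 20000      # alignments fully inside blacklisted regions
sampled_total = 50000    # alignments inspected by getFiltered_worker
sampled_filtered = 5000  # of those, how many any filter flagged

estimate = blacklisted + float(sampled_filtered) / sampled_total * mapped
print(min(round(estimate, 1), mapped))  # -> 120000.0, capped at the mapped count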
Example #50
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', numberOfProcessors=1,
                        verbose=False, chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collects and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        names of the chromosomes to be excluded from the
        scale estimation. chrX is commonly included in this list.

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples,  1)
    >>> _dict['size_factors']
    array([ 1. ,  0.5])
    >>> _dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """

    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # the maxIndex may be too close to the signal regions, which is
    # why a more conservative (lower) rank was chosen above

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90th
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
Example #51
    def writeBedGraph_worker(self,
                             chrom,
                             start,
                             end,
                             func_to_call,
                             func_args,
                             bed_regions_list=None):
        r"""Writes a bedgraph based on the read coverage found on bamFiles

        The given func is called to compute the desired bedgraph value
        using the funcArgs

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`.
        smoothLength : int
            Distance in bp for smoothing the coverage per tile.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            If no bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        temporary file with the bedgraph results for the region queried.

        Examples
        --------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> open(tempFile, 'r').readlines()
        ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
        >>> os.remove(tempFile)


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        coverage = []
        bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]
        for bam in bam_handlers:
            coverage.append(
                self.get_coverage_of_region(bam, chrom, start, end,
                                            self.binLength))
            bam.close()

        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        previous_value = None

        length_coverage = len(coverage[0])
        for tileIndex in range(length_coverage):

            tileCoverage = []
            for index in range(len(self.bamFilesList)):
                if self.smoothLength > 0:
                    vector_start, vector_end = self.getSmoothRange(
                        tileIndex, self.binLength, self.smoothLength,
                        length_coverage)
                    tileCoverage.append(
                        np.mean(coverage[index][vector_start:vector_end]))
                else:
                    tileCoverage.append(coverage[index][tileIndex])

            value = func_to_call(tileCoverage, func_args)
            """
            # uncomment this lines if fixed step bedgraph is wanted
            if not  np.isnan(value):
                writeStart = start + tileIndex*self.binLength
                writeEnd  =  min(writeStart+self.binLength, end)
                _file.write( "%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                     writeEnd, value) )
            """

            if previous_value is None:
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value

            elif previous_value == value:
                writeEnd = min(writeEnd + self.binLength, end)

            elif previous_value != value:
                if not np.isnan(previous_value):
                    _file.write("{}\t{}\t{}\t{:.2f}\n".format(
                        chrom, writeStart, writeEnd, previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan
        if previous_value and writeStart != end and not np.isnan(
                previous_value):
            _file.write("%s\t%d\t%d\t%.1f\n" %
                        (chrom, writeStart, end, previous_value))

        tempfilename = _file.name
        _file.close()
        return tempfilename
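The previous_value bookkeeping above is a run-length encoding: consecutive bins with the same value collapse into one bedGraph interval, and NaN runs are dropped. A standalone sketch with invented per-bin values:

import numpy as np

start, binLength = 0, 50
values = [1.0, 1.0, np.nan, 2.0, 2.0, 2.0]  # invented bin values

intervals = []
previous = writeStart = writeEnd = None
for i, value in enumerate(values):
    if previous is None or value != previous:
        if previous is not None and not np.isnan(previous):
            intervals.append((writeStart, writeEnd, previous))
        previous = value
        writeStart = start + i * binLength
        writeEnd = writeStart + binLength
    else:
        writeEnd += binLength
if previous is not None and not np.isnan(previous):
    intervals.append((writeStart, writeEnd, previous))
print(intervals)  # [(0, 100, 1.0), (150, 300, 2.0)]: the NaN run is skipped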
Example #52
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc. values
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    distanceBetweenBins *= 2
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
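The while loop above is an adaptive sampling schedule: distanceBetweenBins is halved on each pass until at least 1000 read pairs have been collected or the spacing bottoms out. A sketch of just that schedule, with the mapReduce pass replaced by an invented yield:

collected = 0
passes = 0
distanceBetweenBins = 1000000 * 2
while collected < 1000 and distanceBetweenBins > 1:
    distanceBetweenBins /= 2
    passes += 1
    # stand-in for the mapReduce sampling pass: denser spacing samples
    # more regions, so pretend each halving doubles the yield (invented)
    collected = 250 * 2 ** (passes - 1)
print(passes, distanceBetweenBins, collected)  # -> 3 250000.0 1000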
Example #53
def compareSignal(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                  outFileName, outFileFormat, outFileNameLambda=None, region=None,
                  extendPairedEnds=True,
                  numberOfProcessors=1, Nsigmas=2, maxSignalRatio=10, verbose=False):

    bam1 = bamHandler.openBam(bamFilesList[0])
    genomeSize = sum(bam1.lengths)

    bam2 = bamHandler.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have non zero counts are considered

    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples, defaultFragmentLength, numberOfProcessors, skipZeros=True, verbose=verbose)

    if verbose:
        print("number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0]))
    
    # 2. get Mean and std of treatment (col1) and control (col2)

    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0)  # axis=0 means column-wise
    treatmentStd, controlStd = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate residual in treatment & control data, at regions for which treatment
    #    signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the signal > mean-signal, 
    #    so the residual signal is positive)

    overRows = np.where(num_reads_per_region[:, 0].copy() >= treatmentMean + treatmentStd * Nsigmas)[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]
    
    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than estimated signal in treatmant file {}. Perhaps the file names are swapped?".format(bamFilesList[0], bamFilesList[1]))

    else:
        controlSignalRatio = 1.0/treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    print "Treatment mean: {:.2f}, Treatment total:{:.2f}".format(treatmentMean, treatmentTotal)
    print "Control mean: {:.2f}, Control total:{}".format(controlMean, controlTotal)
    print "the ratio of treatment vs. control for enriched regions is: {:.2f}".format(treatmentSignalRatio)
    print "the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})".format(treatmentControlRatio, treatmentControlRatioMapped)

    

    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio
                }


    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength, computePvalue,
                                funcArgs, tileSize=binLength, region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameLambda:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameLambda,
                                    defaultFragmentLength, computeLambda,
                                    funcArgs, tileSize=binLength, region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
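Step 3 of compareSignal keeps only bins where the treatment signal exceeds mean + Nsigmas * std before comparing residual signal between treatment and control. A toy version of that selection on synthetic counts (the Poisson background and spikes are invented):

import numpy as np

Nsigmas = 2
rng = np.random.default_rng(0)
treatment = rng.poisson(5, 1000).astype(float)  # background bins
treatment[::100] += 50                          # 10 invented "enriched" bins

threshold = treatment.mean() + Nsigmas * treatment.std()
overRows = np.flatnonzero(treatment >= threshold)
print(len(overRows))  # ~10: essentially just the spiked bins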
def writeBedGraph_worker(
        chrom, start, end, tileSize, defaultFragmentLength,
        bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
        missingDataAsZero=False, fixed_step=False):
    r"""
    Writes a bedgraph based on a number of bam files.

    The given func is called to compute the desired bedgraph value
    using the funcArgs

    tileSize
    """
    if start > end:
        raise NameError("start position ({0}) bigger than "
                        "end position ({1})".format(start, end))

    coverage = []

    for indexFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bamHandle = bamHandler.openBam(indexFile)
            coverage.append(getCoverageFromBam(
                bamHandle, chrom, start, end, tileSize,
                defaultFragmentLength, extendPairedEnds,
                True))
            bamHandle.close()
        elif fileFormat == 'bigwig':
            bigwigHandle = pyBigWig.open(indexFile)
            coverage.append(
                getCoverageFromBigwig(
                    bigwigHandle, chrom, start, end,
                    tileSize, missingDataAsZero))
            bigwigHandle.close()

    # is /dev/shm available?
    # working in this directory speeds the process
    try:
        _file = tempfile.NamedTemporaryFile(dir="/dev/shm", delete=False)
    except OSError:
        _file = tempfile.NamedTemporaryFile(delete=False)

    previousValue = None
    lengthCoverage = len(coverage[0])
    for tileIndex in range(lengthCoverage):

        tileCoverage = []
        for index in range(len(bamOrBwFileList)):
            if smoothLength > 0:
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage.append(
                    np.mean(coverage[index][vectorStart:vectorEnd]))
            else:
                try:
                    tileCoverage.append(coverage[index][tileIndex])
                except IndexError:
                    print("Chromosome {} probably not in one of the bigwig "
                          "files. Remove this chromosome from the bigwig file "
                          "to continue".format(chrom))
                    exit(0)


        value = func(tileCoverage, funcArgs)

        if fixed_step:
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            try:
                _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                    writeEnd, value))
            except TypeError:
                _file.write("{}\t{}\t{}\t{}\n".format(chrom, writeStart,
                                                      writeEnd, value))
        else:
            if previousValue is None:
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value

            elif previousValue == value:
                writeEnd = min(writeEnd + tileSize, end)

            elif previousValue != value:
                if not np.isnan(previousValue):
                    _file.write(
                        "%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

    if not fixed_step:
        # write remaining value if not a nan
        if previousValue is not None and writeStart != end and \
                not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previousValue))

    tempFileName = _file.name
    _file.close()
    return tempFileName
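# The loop above performs a run-length merge: consecutive tiles with equal
# values are collapsed into a single bedGraph interval. The standalone
# sketch below (a hypothetical helper, not part of deepTools) isolates
# just that merging logic so it can be tested in isolation:
def _mergeTilesSketch(chrom, start, end, tileSize, values):
    lines = []
    writeStart = writeEnd = previous = None
    for tileIndex, value in enumerate(values):
        if previous is None:
            # first tile of a run
            writeStart = start + tileIndex * tileSize
            writeEnd = min(writeStart + tileSize, end)
            previous = value
        elif value == previous:
            # same value: extend the current interval
            writeEnd = min(writeEnd + tileSize, end)
        else:
            # value changed: flush the finished interval, start a new one
            if not np.isnan(previous):
                lines.append("%s\t%d\t%d\t%.2f" % (chrom, writeStart,
                                                   writeEnd, previous))
            previous = value
            writeStart = writeEnd
            writeEnd = min(writeStart + tileSize, end)
    if previous is not None and writeStart != end and not np.isnan(previous):
        lines.append("%s\t%d\t%d\t%.2f" % (chrom, writeStart, end, previous))
    return lines
# _mergeTilesSketch('chr1', 0, 125, 25, [1.0, 1.0, 2.0, 2.0, 2.0]) returns
# ['chr1\t0\t50\t1.00', 'chr1\t50\t125\t2.00']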
def writeBedGraph(
    bamOrBwFileList,
    outputFileName,
    fragmentLength,
    func,
    funcArgs,
    tileSize=25,
    region=None,
    numberOfProcessors=None,
    format="bedgraph",
    extendPairedEnds=True,
    missingDataAsZero=False,
    smoothLength=0,
    fixed_step=False,
):
    r"""
    Given a list of bamfiles, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file for a partition of
    the genome into tiles of the given size. Each tile is assigned the
    value computed by the given function from the coverage underlying
    the tile.

    """

    bamHandlers = [
        bamHandler.openBam(indexedFile) for indexedFile, fileFormat in bamOrBwFileList if fileFormat == "bam"
    ]
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check that the bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList if fileFormat == "bigwig"]
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" "Chromosome {} length reported in the " "bigwig files differ.\n{} for {}\n" "{} for {}.\n\nThe smallest " "length will be used".format(
                            chromName, chromNamesAndSize[chromName], bigwigs[0], size, bw
                        )
                        chromNamesAndSize[chromName] = min(chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items() if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (
            tileSize,
            fragmentLength,
            bamOrBwFileList,
            func,
            funcArgs,
            extendPairedEnds,
            smoothLength,
            missingDataAsZero,
            fixed_step,
        ),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        numberOfProcessors=numberOfProcessors,
    )

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", "wb")
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, "rb"), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == "bedgraph":
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
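A minimal usage sketch for the writeBedGraph function above. The input path, processor count, and the scaling function are hypothetical; scaleCoverage mirrors the function of that name used in the doctest of the last example, returning the first sample's coverage multiplied by a scale factor.

def scaleCoverage(tileCoverage, args):
    # tileCoverage holds one value per input file; scale the first one
    return args['scaleFactor'] * tileCoverage[0]

writeBedGraph([("sample.bam", "bam")],   # hypothetical indexed BAM file
              "sample.bw",               # output file
              200,                       # fragmentLength
              scaleCoverage,
              {'scaleFactor': 1.0},
              tileSize=25,
              numberOfProcessors=4,
              format="bigwig")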
Example #56
    def run(self,
            func_to_call,
            func_args,
            out_file_name,
            blackListFileName=None,
            format="bedgraph",
            smoothLength=0):
        r"""
        Given a list of bamfiles, a function, and the function's arguments,
        this method writes a bedgraph (or bigwig) file for a partition of
        the genome into tiles of the given size. Each tile is assigned the
        value computed by the given function from the coverage underlying
        the tile.

        Parameters
        ----------
        func_to_call : callable
            function called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = getGenomeChunkLength(bam_handlers,
                                                   self.binLength)
        # check that the bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers,
                                                             verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            os.rename(bedgraph_file, out_file_name)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            bedGraphToBigWig(chrom_names_and_size, bedgraph_file,
                             out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)
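A hedged usage sketch for run(): assuming the method belongs to the WriteBedGraph class constructed in the doctest of the last example, and reusing the scaleCoverage sketch from above, a bigwig could be produced roughly as follows (file names are hypothetical):

wb = WriteBedGraph(["sample.bam"], 50, 0, stepSize=50)  # binLength=50
wb.run(scaleCoverage, {'scaleFactor': 1.0}, "sample.bw", format="bigwig")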
Example #57
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam, args.bamIndex)
    bam_mapped = parserCommon.bam_total_reads(bam_handle,
                                              args.ignoreForNormalization)

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            args.bamIndex,
            return_lengths=False,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    sys.exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if args.verbose:
                    print(
                        "Fragment length based on paired en data "
                        "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                sys.exit(
                    "*ERROR*: read extension must be at least 1. Value given: {}"
                    .format(args.extendReads))
            elif args.extendReads > 2000:
                sys.exit(
                    "*ERROR*: read extension must be smaller than 2000. Value given: {}"
                    .format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(
                    int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print "Estimated current coverage {}".format(current_coverage)
            print "Scaling factor {}".format(args.scaleFactor)

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile / \
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print "scale factor using RPKM is {0}".format(args.scaleFactor)

    return scale_factor
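The two normalizations in get_scale_factor reduce to simple arithmetic; a worked example with made-up numbers:

bam_mapped = 10000000       # mapped reads kept after filtering
fragment_length = 200       # bp
genome_size = 2000000000    # value passed as --normalizeTo1x
bin_size = 50               # bp, args.binSize

# 1x normalization: reads * fragment length should cover the genome once
current_coverage = float(bam_mapped * fragment_length) / genome_size
scale_factor_1x = 1.0 / current_coverage
print(scale_factor_1x)      # 1.0 -- this library already gives 1x coverage

# RPKM: 1 / (mapped reads in millions * bin length in kb)
scale_factor_rpkm = 1.0 / ((bam_mapped / 1e6) * (bin_size / 1000.0))
print(scale_factor_rpkm)    # 2.0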
Example #58
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(
                    int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit(
                "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                .format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in range(len(args.bamfiles)):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res has one element per genome chunk; in each element, x[0] is a
    # per-file list of feature-count dicts and x[2] a per-file list of totals
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) /
                                                       totalCounts[i]))
        of.close()
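# Toy illustration of the aggregation loop above, using a fabricated
# mapReduce result for two BAM files over two genome chunks
# (x[0]: per-file feature-count dicts, x[1]: feature names, x[2]: totals):
#
#   res = [([{'exon': 5}, {'exon': 2}], ['exon'], [10, 8]),
#          ([{'exon': 3}, {'exon': 4}], ['exon'], [6, 12])]
#
# The loop yields featureCounts == [{'exon': 8}, {'exon': 6}] and
# totalCounts == [16, 20], so --outRawCounts would report 50.00% and
# 30.00% of alignments falling in 'exon' for the two files.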
    def writeBedGraph_worker(self, chrom, start, end,
                             func_to_call, func_args,
                             bed_regions_list=None):
        r"""Writes a bedgraph based on the read coverage found on bamFiles

        The given func is called to compute the desired bedgraph value
        using the funcArgs

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        func_to_call : callable
            function called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`.
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            If no BED file was passed to the object constructor,
            this list is empty.

        Returns
        -------
        temporary file with the bedgraph results for the region queried.

        Examples
        --------
        >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bamFile1 = test_path +  "testA.bam"
        >>> bin_length = 50
        >>> number_of_samples = 0 # overruled by step_size
        >>> func_to_call = scaleCoverage
        >>> funcArgs = {'scaleFactor': 1.0}

        >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
        >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
        >>> open(tempFile, 'r').readlines()
        ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
        >>> os.remove(tempFile)


        """
        if start > end:
            raise NameError("start position ({0}) bigger "
                            "than end position ({1})".format(start, end))

        coverage = []
        bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]
        for bam in bam_handlers:
            coverage.append(
                self.get_coverage_of_region(bam, chrom, start, end, self.binLength))
            bam.close()

        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        previous_value = None

        length_coverage = len(coverage[0])
        for tileIndex in range(length_coverage):

            tileCoverage = []
            for index in range(len(self.bamFilesList)):
                if self.smoothLength > 0:
                    vector_start, vector_end = self.getSmoothRange(tileIndex,
                                                                   self.binLength,
                                                                   self.smoothLength,
                                                                   length_coverage)
                    tileCoverage.append(
                        np.mean(coverage[index][vector_start:vector_end]))
                else:
                    tileCoverage.append(coverage[index][tileIndex])

            value = func_to_call(tileCoverage, func_args)
            """
            # uncomment these lines if a fixed-step bedgraph is wanted
            if not np.isnan(value):
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                _file.write("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart,
                                                    writeEnd, value))
            """

            if previous_value is None:
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value

            elif previous_value == value:
                writeEnd = min(writeEnd + self.binLength, end)

            elif previous_value != value:
                if not np.isnan(previous_value):
                    _file.write(
                        "{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart,
                                                      writeEnd, previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan
        if previous_value is not None and writeStart != end and \
                not np.isnan(previous_value):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previous_value))

        tempfilename = _file.name
        _file.close()
        return tempfilename
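The smoothing branch above depends on a getSmoothRange helper that turns the smoothing distance in bp into a slice of tile indices. Below is a plausible sketch of such a helper, assuming a window of smoothLength/tileSize tiles centred on the current tile and clipped at the ends; the actual deepTools implementation may handle edges differently:

import math

def getSmoothRangeSketch(tileIndex, tileSize, smoothLength, maxPosition):
    # number of tiles covered by the smoothing window
    smoothTiles = int(smoothLength / tileSize)
    if smoothTiles <= 1:
        return tileIndex, tileIndex + 1
    # split the window around the current tile, clipping at both ends
    side = (smoothTiles - 1) / 2.0
    indexStart = max(tileIndex - int(math.ceil(side)), 0)
    indexEnd = min(maxPosition, tileIndex + int(math.floor(side)) + 1)
    return indexStart, indexEnd

print(getSmoothRangeSketch(4, 25, 100, 10))  # (2, 6): tiles 2-5, i.e. 100 bp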