Example #1
def get_scale_factors(args, statsList, mappedList):

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))

            print(
                "ignoring filtering/blacklists, size factors if the number of mapped "
                "reads would have been used:")
            print(tuple(float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped,
                                  bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors
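
The readCount branch above reduces to dividing the smaller library size by each sample's mapped-read count, so the less deeply sequenced sample keeps a factor of 1.0 and the other is scaled down to match it. A minimal sketch of that arithmetic, using made-up read counts:

import numpy as np

# Hypothetical mapped-read counts for bamfile1 and bamfile2.
mapped_reads = [20000000, 50000000]

# min(mapped) / mapped: the smaller library stays at 1.0,
# the larger one is scaled down to match it.
scale_factors = float(min(mapped_reads)) / np.array(mapped_reads)
print(scale_factors)  # [1.  0.4]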
Example #2
def get_scale_factors(args, statsList, mappedList):

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors
Example #3
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = map(float, args.scaleFactors.split(":"))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(int(read_len_dict['median']))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print "Estimated current coverage {}".format(current_coverage)
                    print "Scale factor to convert " \
                          "current coverage to 1: {}".format(coverage_scale_factor)
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print "scale factor for   "
                    "RPKM is {0}".format(coverage_scale_factor)

    return scale_factors
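
The RPKM comment in the code above corresponds to coverage_scale_factor = 1 / (mapped reads in millions * bin length in kb). A worked check of that formula with illustrative numbers:

# Illustrative values: 30 million mapped reads, 50 bp bins.
mappedReads = 30000000
binSize = 50

millionReadsMapped = float(mappedReads) / 1e6       # 30.0
tileLengthInKb = float(binSize) / 1000              # 0.05
coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
print(coverage_scale_factor)                        # ~0.667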
Example #4
def correctReadCounts(bamFilesList,
                      binLength,
                      numberOfSamples,
                      defaultFragmentLength,
                      outFileName,
                      outFileFormat,
                      outFileNameCorr=None,
                      region=None,
                      extendPairedEnds=True,
                      numberOfProcessors=1,
                      Nsigmas=2,
                      maxSignalRatio=10,
                      blackListFileName=None,
                      verbose=False):

    bam1 = writeBedGraph.openBam(bamFilesList[0])
    bam2 = writeBedGraph.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have more than zero counts are considered

    scaleFactorsDict = estimateScaleFactor(
        bamFilesList,
        binLength,
        numberOfSamples,
        defaultFragmentLength,
        1,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)
    """
    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples, defaultFragmentLength, numberOfProcessors, skipZeros=True, verbose=verbose)
    if verbose:
        print "number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0])

    # 2. get Mean and std of treatment (col1) and control (col2)

    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0) # axis=0: that means by column
    treatmentStd, controlStd   = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal   = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate residual in treatment & control data, at regions for which treatment
    #    signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the signal > mean-signal,
    #    so the residual signal is positive)

    overRows = np.where(num_reads_per_region[:,0].copy() >= treatmentMean + treatmentStd*Nsigmas )[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]

    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal   = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than estimated signal in treatmant file {}. Perhaps the file names are swapped?".format(bamFilesList[0], bamFilesList[1]))

    else:
        controlSignalRatio = 1.0/treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    """

    #    scaleFactors = scaleFactorsDict['size_factors']

    treatmentMean, controlMean = scaleFactorsDict['meanSES']
    treatmentControlRatio = scaleFactorsDict['size_factors'][
        1] / scaleFactorsDict['size_factors'][0]
    treatmentSignalRatio = treatmentControlRatio
    controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio
    treatmentTotal = treatmentMapped
    controlTotal = controlMapped

    print("Treatment mean: {:.2f}, Treatment total:{:.2f}".format(
        treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, Control total:{}".format(
        controlMean, controlTotal))
    print("the ratio of treatment vs. control for enriched regions is: {:.2f}".
          format(treatmentSignalRatio))
    print(
        "the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})"
        .format(treatmentControlRatio, treatmentControlRatioMapped))

    funcArgs = {
        'controlMean': controlMean,
        'treatmentMean': treatmentMean,
        'controlSignalRatio': controlSignalRatio,
        'controlRatio': controlRatio,
        'treatmentControlRatio': treatmentControlRatio
    }

    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength,
                                computePvalue,
                                funcArgs,
                                tileSize=binLength,
                                region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                blackListFileName=blackListFileName,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameCorr:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameCorr,
                                    defaultFragmentLength,
                                    computeCorrectedReadcounts,
                                    funcArgs,
                                    tileSize=binLength,
                                    region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    blackListFileName=blackListFileName,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
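
After the commented-out estimation block, the live code derives every ratio from the two SES size factors alone. A toy trace of that arithmetic, with made-up size factors in [treatment, control] order:

# Hypothetical SES size factors; the order here is assumed to match bamFilesList.
size_factors = [0.4, 1.0]

treatmentControlRatio = size_factors[1] / size_factors[0]        # 2.5
controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio  # 0.4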
Example #5
def correctReadCounts(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                      outFileName, outFileFormat, outFileNameCorr=None, region=None,
                      extendPairedEnds=True,
                      numberOfProcessors=1, Nsigmas=2, maxSignalRatio=10, verbose=False):
    
    bam1 = writeBedGraph.openBam(bamFilesList[0])
    genomeSize = sum(bam1.lengths)

    bam2 = writeBedGraph.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have more than zero counts are considered

    scaleFactorsDict = estimateScaleFactor(bamFilesList,
                                           binLength,
                                           numberOfSamples, 
                                           defaultFragmentLength, 
                                           1, 
                                           numberOfProcessors=numberOfProcessors,
                                           verbose=verbose)

    """
    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples, defaultFragmentLength, numberOfProcessors, skipZeros=True, verbose=verbose)
    if verbose:
        print "number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0])
    
    # 2. get Mean and std of treatment (col1) and control (col2)

    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0) # axis=0: that means by column
    treatmentStd, controlStd   = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal   = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate residual in treatment & control data, at regions for which treatment
    #    signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the signal > mean-signal, 
    #    so the residual signal is positive)

    overRows = np.where(num_reads_per_region[:,0].copy() >= treatmentMean + treatmentStd*Nsigmas )[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]
    
    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal   = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than estimated signal in treatmant file {}. Perhaps the file names are swapped?".format(bamFilesList[0], bamFilesList[1]))

    else:
        controlSignalRatio = 1.0/treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    """

#    scaleFactors = scaleFactorsDict['size_factors']

    treatmentMean, controlMean = scaleFactorsDict['meanSES']
    treatmentControlRatio = scaleFactorsDict['size_factors'][1]/scaleFactorsDict['size_factors'][0]
    treatmentSignalRatio = treatmentControlRatio
    controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio
    treatmentTotal = treatmentMapped
    controlTotal = controlMapped

    print "Treatment mean: {:.2f}, Treatment total:{:.2f}".format(treatmentMean, treatmentTotal)
    print "Control mean: {:.2f}, Control total:{}".format(controlMean, controlTotal)
    print "the ratio of treatment vs. control for enriched regions is: {:.2f}".format(treatmentSignalRatio)
    print "the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})".format(treatmentControlRatio, treatmentControlRatioMapped)

    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio
                }


    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength, computePvalue,
                                funcArgs, tileSize=binLength, region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameCorr:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameCorr,
                                    defaultFragmentLength, computeCorrectedReadcounts,
                                    funcArgs, tileSize=binLength, region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
Example #6
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    bam1_mapped = parserCommon.bam_total_reads(bam1,
                                               args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2,
                                               args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = map(float, args.scaleFactors.split(":"))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength,
                args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
                [bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    bamindex,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit(
                                "*ERROR*: library is not paired-end. Please provide an extension length."
                            )
                        if args.verbose:
                            print(
                                "Fragment length based on paired en data "
                                "estimated to be {}".format(
                                    frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit(
                            "*ERROR*: read extension must be bigger than one. Value give: {} "
                            .format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit(
                            "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                            .format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(
                            int(read_len_dict['median']))

                current_coverage = float(
                    mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print "Estimated current coverage {}".format(
                        current_coverage)
                    print "Scale factor to convert " \
                          "current coverage to 1: {}".format(coverage_scale_factor)
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped *
                                               tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print "scale factor for   "
                    "RPKM is {0}".format(coverage_scale_factor)

    return scale_factors
Example #7
def get_scale_factors(args):
    if args.ratio == 'subtract':
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
            bam1.close()
            bam2.close()

    elif args.scaleFactorsMethod == 'readCount':
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
        mapped_reads = [bam1_mapped, bam2_mapped]
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.

        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                            return_lengths=False,
                                                                            blackListFileName=args.blackListFileName,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print(("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median'])))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
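
In the normalizeTo1x branch, args.normalizeTo1x holds the effective genome size, so current_coverage estimates the mean per-base coverage and its reciprocal rescales the track to 1x. A small numeric illustration, with all inputs hypothetical:

# Hypothetical inputs: 40 million kept reads, 200 bp fragments,
# effective genome size 2.45e9 (value chosen for illustration only).
mapped_reads = 40000000
fragment_length = 200
effective_genome_size = 2.45e9

current_coverage = float(mapped_reads * fragment_length) / effective_genome_size
coverage_scale_factor = 1.0 / current_coverage
print(round(current_coverage, 3))       # 3.265
print(round(coverage_scale_factor, 3))  # 0.306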
Example #8
def get_scale_factors(args):
    if args.ratio == 'subtract':
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))

            print(
                "ignoring filtering/blacklists, size factors if the number of mapped "
                "reads would have been used:")
            print(
                tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped])))
            bam1.close()
            bam2.close()

    elif args.scaleFactorsMethod == 'readCount':
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
            [bam1_mapped, bam2_mapped])
        mapped_reads = [bam1_mapped, bam2_mapped]
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.

        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    args.bam,
                    return_lengths=False,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit(
                                "*ERROR*: library is not paired-end. Please provide an extension length."
                            )
                        if args.verbose:
                            print(("Fragment length based on paired en data "
                                   "estimated to be {}".format(
                                       frag_len_dict['median'])))

                    elif args.extendReads < 1:
                        exit(
                            "*ERROR*: read extension must be bigger than one. Value give: {} "
                            .format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit(
                            "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                            .format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(
                            int(read_len_dict['median'])))

                current_coverage = float(
                    mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(
                        current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(
                              coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped *
                                               tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(
                        coverage_scale_factor))

    return scale_factors
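
These later variants pull everything from the parsed argument namespace and also mutate it (args.bam, args.scaleFactor), so calling them directly requires assembling a compatible namespace. A hedged sketch of such a call; the file names and option values are placeholders, and get_num_kept_reads additionally expects real, indexed BAM files plus deepTools' usual filtering options:

from argparse import Namespace

# Placeholder namespace; a real bamCompare run builds this via argparse
# and includes many more filtering options than shown here.
args = Namespace(
    bamfile1="treatment.bam",         # hypothetical path
    bamfile2="control.bam",           # hypothetical path
    scaleFactors=None,
    scaleFactorsMethod="readCount",
    ratio="log2",                     # any value other than 'subtract'
    blackListFileName=None,
    ignoreForNormalization=None,
    numberOfProcessors=4,
    verbose=False,
)
# scale_factors = get_scale_factors(args)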