Example #1
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization, args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if args.verbose:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsingRPKM:
        # RPKM = reads per tile / (total reads (in millions) * tile length in kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if args.verbose:
            print("scale factor using RPKM is {0}".format(scale_factor))

    return scale_factor
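
A minimal worked sketch of the two normalizations above, using invented numbers rather than a real BAM file; it reproduces only the arithmetic, not the deeptools calls.

# Hypothetical inputs; none of these values come from a real BAM file.
bam_mapped = 10000000        # mapped reads after blacklist subtraction
fragment_length = 200        # bp, estimated or set via --extendReads
effective_genome_size = 2e9  # the value passed as args.normalizeTo1x
bin_size = 50                # bp, args.binSize

# 1x normalization: coverage = reads * fragment length / genome size;
# the scale factor is its reciprocal.
current_coverage = bam_mapped * fragment_length / effective_genome_size
print(1.0 / current_coverage)  # 1.0 -> this library is already at 1x coverage

# RPKM: 1 / (mapped reads in millions * tile length in kb).
print(1.0 / ((bam_mapped / 1e6) * (bin_size / 1000.0)))  # 2.0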
Example #2
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which sample is not scaled down.
        # Normalization to RPKM or to 1x coverage uses that sample as the
        # reference: since the other sample is scaled to match the unscaled
        # one, the normalization factor for both samples must be based on
        # the unscaled one. For example, if sample A is unscaled and sample
        # B is scaled by 0.5, the factor that converts A to RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(int(read_len_dict['median']))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
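
A short sketch of the 'readCount' branch above, with invented read counts: both samples are scaled toward the smaller library, so the deeper sample gets a factor below 1 and the shallower one keeps 1.

import numpy as np

bam1_mapped, bam2_mapped = 20000000, 10000000  # invented mapped-read counts
scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
print(scale_factors)  # [0.5 1. ]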
Example #3
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam, args.bamIndex)
    bam_mapped = parserCommon.bam_total_reads(bam_handle,
                                              args.ignoreForNormalization)

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            args.bamIndex,
            return_lengths=False,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if args.verbose:
                    print(
                        "Fragment length based on paired en data "
                        "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit(
                    "*ERROR*: read extension must be bigger than one. Value give: {} "
                    .format(args.extendReads))
            elif args.extendReads > 2000:
                exit(
                    "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                    .format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(
                    int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if args.verbose:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsingRPKM:
        # RPKM = reads per tile / (total reads (in millions) * tile length in kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if args.verbose:
            print("scale factor using RPKM is {0}".format(scale_factor))

    return scale_factor
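
These functions take an argparse-style namespace. The stand-in below is hypothetical: the attribute names are the ones read in the example above, but every value is invented, and nothing here calls deeptools.

from types import SimpleNamespace

args = SimpleNamespace(
    bam="sample.bam",            # input BAM (placeholder path)
    bamIndex="sample.bam.bai",   # matching index, used by this variant
    scaleFactor=1.0,             # user-supplied base factor
    ignoreForNormalization=[],   # chromosomes to skip when counting reads
    normalizeTo1x=2e9,           # effective genome size; falsy disables 1x
    normalizeUsingRPKM=False,    # alternative to normalizeTo1x
    extendReads=False,           # True to estimate, or an int fragment length
    binSize=50,                  # tile length in bp for RPKM
    numberOfProcessors=1,
    verbose=False)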
Example #4
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    bam1_mapped = parserCommon.bam_total_reads(bam1,
                                               args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2,
                                               args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength,
                args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
                [bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which sample is not scaled down.
        # Normalization to RPKM or to 1x coverage uses that sample as the
        # reference: since the other sample is scaled to match the unscaled
        # one, the normalization factor for both samples must be based on
        # the unscaled one. For example, if sample A is unscaled and sample
        # B is scaled by 0.5, the factor that converts A to RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    bamindex,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit(
                                "*ERROR*: library is not paired-end. Please provide an extension length."
                            )
                        if args.verbose:
                            print(
                                "Fragment length based on paired en data "
                                "estimated to be {}".format(
                                    frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit(
                            "*ERROR*: read extension must be bigger than one. Value give: {} "
                            .format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit(
                            "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                            .format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(
                            int(read_len_dict['median']))

                current_coverage = float(
                    mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped *
                                               tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
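
A final worked sketch of the 'subtract' branch above, with invented numbers: the sample whose size factor is 1 acts as the reference, and the 1x coverage factor derived from it multiplies both entries of scale_factors.

import numpy as np

scale_factors = np.array([1.0, 0.5])  # sample 1 unscaled, sample 2 halved
mapped_reads = 20000000               # invented count for the unscaled sample
fragment_length = 200                 # bp
effective_genome_size = 2e9           # args.normalizeTo1x

current_coverage = mapped_reads * fragment_length / effective_genome_size  # 2.0
coverage_scale_factor = 1.0 / current_coverage                             # 0.5
print(scale_factors * coverage_scale_factor)  # [0.5  0.25]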