Example #1
def get_num_kept_reads(args):
    """
    Substracts from the total number of mapped reads in a bamfile
    the proportion of reads that fall into blacklisted regions
    or that are filtered

    :return: integer
    """
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads = bam_mapped_total - blacklisted
    else:
        num_kept_reads = bam_mapped_total
    ftk = fraction_kept(args)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
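A minimal usage sketch for the example above, assuming get_num_kept_reads and its deepTools-style helpers (bamHandler, utilities, fraction_kept) are importable. The Namespace attributes mirror the ones the function reads directly; fraction_kept may consult additional filtering attributes depending on the deepTools version, and the BAM path is a placeholder:

from argparse import Namespace

args = Namespace(bam="sample.bam",              # hypothetical BAM path
                 ignoreForNormalization=None,   # or a list of chromosomes to skip
                 blackListFileName=None,        # or a BED file of blacklisted regions
                 numberOfProcessors=4)

num_kept, total_mapped = get_num_kept_reads(args)
print("Keeping {} of {} mapped reads".format(num_kept, total_mapped))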
Example #2
def main(args=None):
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write(
            "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n"
        )
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [
        bamHandler.openBam(x,
                           returnStats=True,
                           nThreads=args.numberOfProcessors)
        for x in args.bamfiles
    ]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = []
        for bh in bhs:
            blacklisted.append(
                utilities.bam_blacklisted_reads(bh, None,
                                                args.blackListFileName,
                                                args.numberOfProcessors))
    else:
        blacklisted = [0] * len(bhs)

    # Total reads per file = mapped + unmapped
    total = [x + y for x, y in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    totals = [0] * len(args.bamfiles)
    nFiltered = [0] * len(args.bamfiles)
    MAPQs = [0] * len(args.bamfiles)
    flagIncludes = [0] * len(args.bamfiles)
    flagExcludes = [0] * len(args.bamfiles)
    internalDupes = [0] * len(args.bamfiles)
    externalDupes = [0] * len(args.bamfiles)
    singletons = [0] * len(args.bamfiles)
    rnaStrand = [0] * len(args.bamfiles)
    for x in res:
        for idx, r in enumerate(x):
            totals[idx] += r[0]
            nFiltered[idx] += r[1]
            MAPQs[idx] += r[2]
            flagIncludes[idx] += r[3]
            flagExcludes[idx] += r[4]
            internalDupes[idx] += r[5]
            externalDupes[idx] += r[6]
            singletons[idx] += r[7]
            rnaStrand[idx] += r[8]

    # Print some output
    of.write(
        "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n"
    )
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx],
                                       blacklisted[idx]))
        # nFiltered
        metric = 0.0
        if totals[idx] > 0:
            metric = blacklisted[idx] + float(nFiltered[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # MAPQ
        metric = 0.0
        if totals[idx] > 0:
            metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagInclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagIncludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagExclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagExcludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Internally determined duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(internalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Externally marked duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(externalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Singletons
        metric = 0.0
        if totals[idx] > 0:
            metric = float(singletons[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # filterRNAstrand
        metric = 0.0
        if totals[idx] > 0:
            metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
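A hedged invocation sketch: parseArguments() is not shown above, so the flag names below are assumptions modeled on the attributes main() reads (bamfiles, sampleLabels, outFile, numberOfProcessors), and the module-level imports used in the code (sys, bamHandler, utilities, mapReduce) are assumed to be in place:

if __name__ == "__main__":
    sys.exit(main([
        "--bamfiles", "treated.bam", "control.bam",   # hypothetical inputs
        "--sampleLabels", "treated", "control",
        "--outFile", "read_filtering_stats.tsv",
        "--numberOfProcessors", "4",
    ]))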
Example #3
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Get the total counts, excluding blacklisted regions and filtered reads
    totalCounts = []
    fhs = [openBam(x) for x in args.bamfiles]
    for i, bam_handle in enumerate(fhs):
        bam_mapped = utilities.bam_total_reads(bam_handle, None)
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, None, args.blackListFileName, args.numberOfProcessors)
        if args.verbose:
            print(("There are {0} alignments in {1}, of which {2} are completely within a blacklist region.".format(bam_mapped, args.bamfiles[i], blacklisted)))
        bam_mapped -= blacklisted
        args.bam = args.bamfiles[i]
        args.ignoreForNormalization = None
        ftk = fraction_kept(args)
        bam_mapped *= ftk
        totalCounts.append(bam_mapped)

    # Get fragment size and chromosome dict
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in range(len(args.bamfiles)):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    for x in res:
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
        of.close()
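The nested aggregation loop above (featureCounts[i][k] += v) is the crux of the map-reduce step. Below is a self-contained toy run with made-up chunk results shaped like the (per-BAM count dicts, features) tuples the code expects; the feature names and counts are illustrative only:

# Toy mapReduce output: two genome chunks, two BAM files, two feature types.
toy_res = [
    ([{"exon": 10, "intron": 5}, {"exon": 7, "intron": 1}], ["exon", "intron"]),
    ([{"exon": 3, "intron": 2}, {"exon": 4, "intron": 6}], ["exon", "intron"]),
]
features = toy_res[0][1]
featureCounts = [dict.fromkeys(features, 0) for _ in range(2)]
for chunk_counts, _ in toy_res:
    for i, per_bam in enumerate(chunk_counts):
        for k, v in per_bam.items():
            featureCounts[i][k] += v
print(featureCounts)  # [{'exon': 13, 'intron': 7}, {'exon': 11, 'intron': 7}]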