def get_num_kept_reads(args):
    """
    Subtracts from the total number of mapped reads in a bamfile the
    reads that fall into blacklisted regions, then scales the remainder by
    the fraction returned by ``fraction_kept``.

    :param args: parsed arguments. ``bam``, ``ignoreForNormalization``,
                 ``blackListFileName`` and ``numberOfProcessors`` are read
                 here; the whole object is also passed to ``fraction_kept``.
    :return: tuple ``(num_kept_reads, bam_mapped_total)``. ``num_kept_reads``
             is multiplied by the kept fraction when that fraction is below 1
             (so it may no longer be an integer); ``bam_mapped_total`` is the
             value returned by ``utilities.bam_total_reads``.
    """
    bam_handle = bamHandler.openBam(args.bam)
    try:
        bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
        if args.blackListFileName:
            blacklisted = utilities.bam_blacklisted_reads(bam_handle,
                                                          args.ignoreForNormalization,
                                                          args.blackListFileName,
                                                          args.numberOfProcessors)
            print("There are {0} alignments, of which {1} are completely "
                  "within a blacklist region.".format(bam_mapped_total, blacklisted))
            num_kept_reads = bam_mapped_total - blacklisted
        else:
            num_kept_reads = bam_mapped_total
    finally:
        # The handle is only needed for the two counting calls above;
        # release it even if one of them raises.
        bam_handle.close()

    ftk = fraction_kept(args)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
def main(args=None):
    """
    Write a tab-separated table with, for every BAM file, the total and
    mapped read counts, the alignments in blacklisted regions and an estimate
    of how many mapped reads each filtering criterion would remove.

    The per-criterion counts come from ``getFiltered_worker`` (run through
    ``mapReduce``). Each count is divided by the worker-reported total for
    that file, multiplied by the file's number of mapped reads, rounded to
    one decimal and capped at the number of mapped reads.

    :param args: argument list forwarded to ``parse_args`` (may be ``None``).
    :return: 0 on completion. Exits with status 1 when the number of sample
             labels does not match the number of BAM files.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write(
            "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n"
        )
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")
    try:
        opened = [
            bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors)
            for x in args.bamfiles
        ]
        mapped = [x[1] for x in opened]
        unmappedList = [x[2] for x in opened]
        bhs = [x[0] for x in opened]

        try:
            # Get the reads in blacklisted regions
            if args.blackListFileName:
                blacklisted = [
                    utilities.bam_blacklisted_reads(bh, None,
                                                    args.blackListFileName,
                                                    args.numberOfProcessors)
                    for bh in bhs
                ]
            else:
                blacklisted = [0] * len(bhs)

            # Get the total and mapped reads
            total = [m + u for m, u in zip(mapped, unmappedList)]

            chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
        finally:
            # The handles are not needed past this point; close them even
            # when one of the calls above fails.
            for bh in bhs:
                bh.close()

        # Get the remaining metrics
        res = mapReduce([args],
                        getFiltered_worker,
                        chrom_sizes,
                        genomeChunkLength=args.binSize + args.distanceBetweenBins,
                        blackListFileName=args.blackListFileName,
                        numberOfProcessors=args.numberOfProcessors,
                        verbose=args.verbose)

        # One row of running sums per BAM file. Columns follow the order of
        # the worker's per-file result:
        # 0 total, 1 filtered, 2 below MAPQ, 3 missing flags, 4 excluded
        # flags, 5 internally-determined duplicates, 6 marked duplicates,
        # 7 singletons, 8 wrong strand
        n_fields = 9
        sums = [[0] * n_fields for _ in args.bamfiles]
        for chunk in res:
            for idx, r in enumerate(chunk):
                row = sums[idx]
                for col in range(n_fields):
                    row[col] += r[col]

        # Print some output
        of.write(
            "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n"
        )
        for idx, bam_file in enumerate(args.bamfiles):
            of.write(args.sampleLabels[idx] if args.sampleLabels else bam_file)
            of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

            sampled = sums[idx][0]
            for col in range(1, n_fields):
                metric = 0.0
                if sampled > 0:
                    metric = float(sums[idx][col]) / float(sampled) * mapped[idx]
                    if col == 1:
                        # Only the overall "filtered" column also includes
                        # the blacklisted alignments.
                        metric = blacklisted[idx] + metric
                of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
            of.write("\n")
    finally:
        # Never close sys.stdout, only a file opened here.
        if args.outFile is not None:
            of.close()

    return 0
def main(args=None):
    """
    Compute per-BAM-file feature enrichment, plot it and optionally write
    the percentages to a tab-separated file.

    Steps performed:

    1. For every BAM file, take the number of reads reported by
       ``utilities.bam_total_reads``, subtract the blacklisted alignments and
       multiply by the fraction returned by ``fraction_kept``.
    2. Determine the fragment length used for read extension from
       ``args.extendReads`` and the lengths measured on the first BAM file.
    3. Run ``getEnrichment_worker`` through ``mapReduce`` and sum the
       per-feature counts for each file.
    4. Call ``plotEnrichment`` and, if ``args.outRawCounts`` is set, write
       one ``file / featureType / percent`` line per label and feature.

    Note that ``args.bam`` and ``args.ignoreForNormalization`` are
    overwritten on the parsed arguments while step 1 runs.

    :param args: argument list forwarded to ``parse_args`` (may be ``None``).
    :return: None. Exits via ``sys.exit`` on a label/file count mismatch, on
             a request to guess the extension without fragment-length
             information, or on an extension length above 2000.
    """
    args = parse_arguments().parse_args(args)

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Get the total counts, excluding blacklisted regions and filtered reads
    totalCounts = []
    fhs = [openBam(x) for x in args.bamfiles]
    try:
        for bam_file, bam_handle in zip(args.bamfiles, fhs):
            bam_mapped = utilities.bam_total_reads(bam_handle, None)
            blacklisted = utilities.bam_blacklisted_reads(bam_handle, None, args.blackListFileName, args.numberOfProcessors)
            if args.verbose:
                print(("There are {0} alignments in {1}, of which {2} are completely within a blacklist region.".format(bam_mapped, bam_file, blacklisted)))
            bam_mapped -= blacklisted
            # fraction_kept reads the file to inspect from args, so point
            # it at the current BAM file before calling.
            args.bam = bam_file
            args.ignoreForNormalization = None
            ftk = fraction_kept(args)
            bam_mapped *= ftk
            totalCounts.append(bam_mapped)

        # Get fragment size and chromosome dict
        chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    finally:
        # Close the handles even if one of the calls above raises.
        for fh in fhs:
            fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    # One zero-initialised counter dict per BAM file
    featureCounts = [dict.fromkeys(features, 0) for _ in args.bamfiles]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    for x in res:
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
def main(args=None):
    """
    Write a tab-separated table with, for every BAM file, the total and
    mapped read counts, the alignments in blacklisted regions and an estimate
    of how many mapped reads each filtering criterion would remove.

    The per-criterion counts come from ``getFiltered_worker`` (run through
    ``mapReduce``). Each count is divided by the worker-reported total for
    that file, multiplied by the file's number of mapped reads, rounded to
    one decimal and capped at the number of mapped reads.

    :param args: argument list forwarded to ``parse_args`` (may be ``None``).
    :return: 0 on completion. Exits with status 1 when the number of sample
             labels does not match the number of BAM files.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")
    try:
        opened = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
        mapped = [x[1] for x in opened]
        unmappedList = [x[2] for x in opened]
        bhs = [x[0] for x in opened]

        try:
            # Get the reads in blacklisted regions
            if args.blackListFileName:
                blacklisted = [utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)
                               for bh in bhs]
            else:
                blacklisted = [0] * len(bhs)

            # Get the total and mapped reads
            total = [m + u for m, u in zip(mapped, unmappedList)]

            chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
        finally:
            # The handles are not needed past this point; close them even
            # when one of the calls above fails.
            for bh in bhs:
                bh.close()

        # Get the remaining metrics
        res = mapReduce([args],
                        getFiltered_worker,
                        chrom_sizes,
                        genomeChunkLength=args.binSize + args.distanceBetweenBins,
                        blackListFileName=args.blackListFileName,
                        numberOfProcessors=args.numberOfProcessors,
                        verbose=args.verbose)

        # One row of running sums per BAM file. Columns follow the order of
        # the worker's per-file result:
        # 0 total, 1 filtered, 2 below MAPQ, 3 missing flags, 4 excluded
        # flags, 5 internally-determined duplicates, 6 marked duplicates,
        # 7 singletons, 8 wrong strand
        n_fields = 9
        sums = [[0] * n_fields for _ in args.bamfiles]
        for chunk in res:
            for idx, r in enumerate(chunk):
                row = sums[idx]
                for col in range(n_fields):
                    row[col] += r[col]

        # Print some output
        of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
        for idx, bam_file in enumerate(args.bamfiles):
            of.write(args.sampleLabels[idx] if args.sampleLabels else bam_file)
            of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

            sampled = sums[idx][0]
            for col in range(1, n_fields):
                metric = 0.0
                if sampled > 0:
                    metric = float(sums[idx][col]) / float(sampled) * mapped[idx]
                    if col == 1:
                        # Only the overall "filtered" column also includes
                        # the blacklisted alignments.
                        metric = blacklisted[idx] + metric
                of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
            of.write("\n")
    finally:
        # Never close sys.stdout, only a file opened here.
        if args.outFile is not None:
            of.close()

    return 0