def process_args(args=None):
    """Parse command-line arguments and fill in missing sample labels.

    When --labels is absent, labels are derived either via smartLabels()
    or from the basename of each BAM file.  Exits with an error if the
    label and BAM-file counts disagree.
    """
    args = parse_arguments().parse_args(args)

    if not args.labels:
        args.labels = (smartLabels(args.bamfiles)
                       if args.smartLabels
                       else [os.path.basename(bam) for bam in args.bamfiles])

    if args.labels and len(args.bamfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of BAM files.")

    return args
def process_args(args=None):
    """Parse arguments; default labels to the BAM file basenames.

    smartLabels takes precedence when no explicit labels were given.
    Exits if the final label count does not match the BAM-file count.
    """
    args = parse_arguments().parse_args(args)

    if not args.labels:
        if args.smartLabels:
            derived = smartLabels(args.bamfiles)
        else:
            derived = []
            for bam in args.bamfiles:
                derived.append(os.path.basename(bam))
        args.labels = derived

    mismatch = args.labels and len(args.bamfiles) != len(args.labels)
    if mismatch:
        sys.exit("The number of labels does not match the number of BAM files.")

    return args
def process_args(args=None):
    """Parse command-line arguments and fill in missing bigWig labels.

    Remote files (http/https/ftp) are labelled by the last path segment
    of the URL; local files by their basename.  Exits on a label/file
    count mismatch.
    """
    args = parse_arguments().parse_args(args)

    if not args.labels and args.smartLabels:
        args.labels = smartLabels(args.bwfiles)
    elif not args.labels:
        labels = []
        for fname in args.bwfiles:
            if fname.startswith(("http://", "https://", "ftp://")):
                labels.append(fname.split("/")[-1])
            else:
                labels.append(os.path.basename(fname))
        args.labels = labels

    if len(args.bwfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of bigWig files.")

    return args
def process_args(args=None):
    """Parse arguments; derive labels for bigWig inputs when absent.

    URLs keep their final path component as the label, local paths use
    os.path.basename.  A count mismatch is fatal.
    """
    args = parse_arguments().parse_args(args)

    if not args.labels and args.smartLabels:
        args.labels = smartLabels(args.bwfiles)
    elif not args.labels:
        args.labels = []
        for bw in args.bwfiles:
            is_remote = (bw.startswith("http://")
                         or bw.startswith("https://")
                         or bw.startswith("ftp://"))
            label = bw.split("/")[-1] if is_remote else os.path.basename(bw)
            args.labels.append(label)

    if len(args.bwfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of bigWig files.")

    return args
def process_args(args=None):
    """Parse command-line arguments, folding the --JSDsample into the
    BAM-file/label lists when it is not already present.

    Exits if the final label count does not match the BAM-file count.
    """
    args = parse_arguments().parse_args(args)

    if args.JSDsample is not None and args.JSDsample not in args.bamfiles:
        args.bamfiles.append(args.JSDsample)
        # BUG FIX: after the append above, bamfiles is one LONGER than the
        # user-supplied labels, so the original test
        # `len(args.bamfiles) == len(args.labels) - 1` could never be true
        # and the JSD sample never received a label.  Compare against
        # len(args.labels) + 1 instead.
        if args.labels and len(args.bamfiles) == len(args.labels) + 1:
            args.labels.append(args.JSDsample)

    if not args.labels:
        if args.smartLabels:
            args.labels = smartLabels(args.bamfiles)
        else:
            # Copy rather than alias, so a later mutation of args.labels
            # cannot silently alter args.bamfiles.
            args.labels = list(args.bamfiles)

    if len(args.bamfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of BAM files.")

    return args
def process_args(args=None):
    """Parse arguments; append --JSDsample to the BAM list and labels.

    Exits when the label count cannot be reconciled with the BAM count.
    """
    args = parse_arguments().parse_args(args)

    if args.JSDsample is not None and args.JSDsample not in args.bamfiles:
        args.bamfiles.append(args.JSDsample)
        # BUG FIX: bamfiles just grew by one, so it is one longer than any
        # user-supplied labels; the original `== len(args.labels) - 1`
        # comparison was inverted and never fired.
        if args.labels and len(args.bamfiles) == len(args.labels) + 1:
            args.labels.append(args.JSDsample)

    if not args.labels:
        # Copy (not alias) the file list so labels can be edited safely.
        args.labels = smartLabels(args.bamfiles) if args.smartLabels else list(args.bamfiles)

    if len(args.bamfiles) != len(args.labels):
        sys.exit("The number of labels does not match the number of BAM files.")

    return args
def _scale_metric(count, sampled_total, mapped_reads, offset=0):
    """Extrapolate a sampled filter count to the full set of mapped reads.

    Returns 0.0 when nothing was sampled; otherwise
    offset + count / sampled_total * mapped_reads, rounded to one decimal
    and capped at mapped_reads.
    """
    metric = 0.0
    if sampled_total > 0:
        metric = offset + float(count) / float(sampled_total) * mapped_reads
    return min(round(metric, 1), mapped_reads)


def main(args=None):
    """Estimate how many reads each BAM file would lose to the various
    filters and write a tab-separated summary to --outFile (or stdout).

    Returns 0 on success; exits early on a label/file count mismatch.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write(
            "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n"
        )
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [
        bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors)
        for x in args.bamfiles
    ]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = [
            utilities.bam_blacklisted_reads(bh, None, args.blackListFileName,
                                            args.numberOfProcessors)
            for bh in bhs
        ]
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics by sampling bins across the genome
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Nine running sums per BAM file, in the tuple order produced by
    # getFiltered_worker.
    nBAMs = len(args.bamfiles)
    sums = [[0] * nBAMs for _ in range(9)]
    for chunk in res:
        for idx, r in enumerate(chunk):
            for m in range(9):
                sums[m][idx] += r[m]
    (totals, nFiltered, MAPQs, flagIncludes, flagExcludes,
     internalDupes, externalDupes, singletons, rnaStrand) = sums

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\t"
             "Estimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\t"
             "Internally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

        # "Estimated mapped reads filtered" additionally counts the
        # blacklisted alignments, hence the offset.
        of.write("\t{}".format(_scale_metric(nFiltered[idx], totals[idx],
                                             mapped[idx], offset=blacklisted[idx])))
        # MAPQ, samFlagInclude, samFlagExclude, internal/external duplicates,
        # singletons, filterRNAstrand — all scaled identically.
        for counts in (MAPQs, flagIncludes, flagExcludes, internalDupes,
                       externalDupes, singletons, rnaStrand):
            of.write("\t{}".format(_scale_metric(counts[idx], totals[idx], mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
def main(args=None):
    """Count reads overlapping BED/GTF features per BAM file, then plot
    the enrichment (--plotFile) and/or dump raw counts (--outRawCounts).
    """
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty.
    # BUG FIX: the original tested `args.attributeKey and args.attributeKey == ""`,
    # which is always false (an empty string is falsy), so the
    # normalisation never ran.
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED,
                     keepExons=args.keepExons,
                     labels=args.regionLabels,
                     attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                # Typo fixed: "paired en data" -> "paired end data".
                print("Fragment length based on paired end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            # Typos fixed: "that" -> "than", "give" -> "given".
            sys.exit("*ERROR*: read extension must be smaller than 2000. "
                     "Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = [dict.fromkeys(features, 0) for _ in range(len(args.bamfiles))]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts (with guarantees the file is closed on error)
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    # NOTE(review): totalCounts[i] == 0 would raise
                    # ZeroDivisionError here — confirm upstream guarantees
                    # at least one counted read per file.
                    of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(
                        x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
def main(args=None):
    """Filter a BAM file, writing surviving alignments to --outFile and,
    optionally, the discarded alignments to --filteredOutReads.

    Returns 0 on success.
    """
    args = parseArguments().parse_args(args)

    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # Mirror the two forward-strand offsets for the reverse strand.
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam,
                                           returnStats=True,
                                           nThreads=args.numberOfProcessors)
    total = mapped + unmapped

    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # BUG FIX: the discarded reads must go to --filteredOutReads;
            # the original passed args.outFile here, clobbering the main
            # output.  (NOTE(review): this call passes a 4th `args`
            # argument while the call above passes 3 — confirm convertBED
            # accepts an optional args parameter.)
            convertBED(args.filteredOutReads, tmpFiles, chromDict, args)

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]
        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))
        of.close()

    return 0
def _extrapolate_count(count, sampled_total, mapped_reads, offset=0):
    """Scale a per-sample filtered-read count, observed over sampled bins,
    up to the full set of mapped reads.

    Returns 0.0 when sampled_total is 0; the result is rounded to one
    decimal and capped at mapped_reads.
    """
    metric = 0.0
    if sampled_total > 0:
        metric = offset + float(count) / float(sampled_total) * mapped_reads
    return min(round(metric, 1), mapped_reads)


def main(args=None):
    """Estimate per-filter read losses for each BAM file and write a
    tab-separated summary to --outFile (or stdout).  Returns 0.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")

    bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors)
           for x in args.bamfiles]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = [utilities.bam_blacklisted_reads(bh, None,
                                                       args.blackListFileName,
                                                       args.numberOfProcessors)
                       for bh in bhs]
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics by sampling bins genome-wide
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Accumulate the nine per-file counters in worker tuple order.
    nBAMs = len(args.bamfiles)
    accum = [[0] * nBAMs for _ in range(9)]
    for chunk in res:
        for idx, r in enumerate(chunk):
            for m in range(9):
                accum[m][idx] += r[m]
    (totals, nFiltered, MAPQs, flagIncludes, flagExcludes,
     internalDupes, externalDupes, singletons, rnaStrand) = accum

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\t"
             "Estimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\t"
             "Internally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, _ in enumerate(args.bamfiles):
        of.write(args.sampleLabels[idx] if args.sampleLabels else args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))

        # The overall filtered estimate also includes blacklisted alignments.
        of.write("\t{}".format(_extrapolate_count(nFiltered[idx], totals[idx],
                                                  mapped[idx], offset=blacklisted[idx])))
        # MAPQ / flag include / flag exclude / duplicates / singletons / strand.
        for counter in (MAPQs, flagIncludes, flagExcludes, internalDupes,
                        externalDupes, singletons, rnaStrand):
            of.write("\t{}".format(_extrapolate_count(counter[idx], totals[idx], mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
def main(args=None):
    """Per-feature read-enrichment counting over BED/GTF regions for a set
    of BAM files; emits a plot and/or a raw-counts table.
    """
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty.
    # BUG FIX: `args.attributeKey and args.attributeKey == ""` was always
    # false ("" is falsy), so the intended normalisation never happened.
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                # Typo fixed: "paired en data" -> "paired end data".
                print("Fragment length based on paired end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            # Typos fixed: "that" -> "than", "give" -> "given".
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = [dict.fromkeys(features, 0) for _ in range(len(args.bamfiles))]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts; `with` closes the file even on error
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    # NOTE(review): a zero totalCounts[i] would raise
                    # ZeroDivisionError — confirm callers guarantee counts.
                    of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
def main(args=None):
    """Sieve alignments from a BAM file: kept reads go to --outFile,
    optionally the rejected ones to --filteredOutReads.  Returns 0.
    """
    args = parseArguments().parse_args(args)

    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # Two values given: derive the reverse-strand pair by negation.
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped

    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # BUG FIX: write the rejected reads to --filteredOutReads; the
            # original used args.outFile and overwrote the kept-reads BED.
            # (NOTE(review): the extra `args` argument differs from the
            # 3-arg call above — confirm convertBED's signature.)
            convertBED(args.filteredOutReads, tmpFiles, chromDict, args)

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]
        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))
        of.close()

    return 0