def main(args=None):
    """Entry point: compute per-feature read enrichment for one or more BAM files.

    Parses command-line arguments, determines a default fragment length
    (guessed from paired-end data or taken from --extendReads), counts reads
    per feature via mapReduce, and then plots the enrichment and/or writes
    raw percentages to a file.

    Parameters
    ----------
    args : list of str, optional
        Argument list for the parser; ``None`` means ``sys.argv[1:]``.

    Exits with an error message when neither --plotFile nor --outRawCounts is
    given, when labels and BAM files mismatch, or when the extension length
    is invalid.
    """
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(
            len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict; the handles are only needed to
    # derive the common chromosome names, so close them right away.
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)

    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                # typo fixed: "paired en data" -> "paired end data"
                print("Fragment length based on paired end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            # typos fixed: "smaller that" -> "smaller than", "Value give" -> "Value given"
            sys.exit("*ERROR*: read extension must be smaller than 2000. "
                     "Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    # one zero-initialized {feature: count} dict per BAM file
    featureCounts = [{x: 0 for x in features} for _ in args.bamfiles]

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts; use a context manager so the file is closed even on error
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

    out_file_name : str
        name of the file to save the resulting data.

    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    # store smoothLength on the instance so the mapReduce workers can see it
    self.__dict__["smoothLength"] = smoothLength
    bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # NOTE(review): this dumps every instance attribute to stderr
    # unconditionally — looks like leftover debug output; confirm it is
    # intentional before relying on stderr contents.
    for x in list(self.__dict__.keys()):
        sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    # each worker writes its chunk to a temp bedgraph file; res is the list
    # of those temp file names (possibly containing empty entries)
    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # concatenate intermediary bedgraph files
    out_file = open(out_file_name + ".bg", 'wb')
    for tempfilename in res:
        if tempfilename:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempfilename, 'rb')
            shutil.copyfileobj(_foo, out_file)
            _foo.close()
            os.remove(tempfilename)

    bedgraph_file = out_file.name
    out_file.close()
    if format == 'bedgraph':
        # bedgraph output: the concatenated temp file *is* the result
        os.rename(bedgraph_file, out_file_name)
        if self.verbose:
            print("output file: {}".format(out_file_name))
    else:
        # convert to bigwig, then drop the intermediate bedgraph
        # (last positional arg True — presumably "sort/validate input";
        # TODO confirm against bedGraphToBigWig's signature)
        bedGraphToBigWig(chrom_names_and_size, bedgraph_file, out_file_name, True)
        if self.verbose:
            print("output file: {}".format(out_file_name))
        os.remove(bedgraph_file)
def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None, centerRead=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamFilesList, func, funcArgs,
         extendPairedEnds, smoothLength, zerosToNans, minMappingQuality,
         ignoreDuplicates, fragmentFromRead_func, centerRead),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file;
            # close the handle explicitly before removing the temp file
            # (the original leaked it into copyfileobj)
            _foo = open(tempFileName, 'rb')
            shutil.copyfileobj(_foo, outFile)
            _foo.close()
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        # fixed: Python 2 `print "..."` statements were syntax errors under
        # Python 3 (the rest of the file uses py3 idioms)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  blackListFileName=None, numberOfProcessors=1,
                  format="bedgraph", extendPairedEnds=True,
                  missingDataAsZero=False, smoothLength=0,
                  fixed_step=False, verbose=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    bamOrBwFileList : list of (fileName, fileFormat) tuples
        fileFormat is 'bam' or 'bigwig'.
    outputFileName : str
        name of the file to write.
    format : str
        'bedgraph' or anything else, which is treated as bigwig output.
    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, unmapped, stats = bamHandler.openBam(
                indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if len(bamHandles):
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        # no BAMs: derive the common chromosomes from the bigwig headers
        genomeChunkLength = int(10e6)
        cCommon = []
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat == 'bigwig':
                fh = pyBigWig.open(fileName)
            else:
                continue

            for chromName, size in list(fh.chroms().items()):
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "input files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bamOrBwFileList[0][0], size, fileName))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)

    # Determine the sorted order of the temp files
    chrom_order = dict()
    for i, _ in enumerate(chromNamesAndSize):
        chrom_order[_[0]] = i
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        with open(outputFileName, 'wb') as of:
            for r in res:
                # fixed: check the temp file name (r[3]) itself — the list
                # elements are never None, so the original `if r is not None`
                # was always true and a None temp file crashed open();
                # this also matches the sibling implementation in this file
                if r[3]:
                    _ = open(r[3], 'rb')
                    shutil.copyfileobj(_, of)
                    _.close()
                    os.remove(r[3])
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  blackListFileName=None, numberOfProcessors=None,
                  format="bedgraph", extendPairedEnds=True,
                  missingDataAsZero=False, smoothLength=0,
                  fixed_step=False):
    r"""
    Tile the genome into fixed-size bins, evaluate `func` (with `funcArgs`)
    on the coverage of the given bam/bigwig files per bin, and write the
    result as a bedgraph (or bigwig) file named `outputFileName`.

    `bamOrBwFileList` is a list of (fileName, fileFormat) pairs where
    fileFormat is 'bam' or 'bigwig'. When no BAM files are present, the
    common chromosomes and their sizes are taken from the bigwig headers.
    """
    # open every BAM input; bigwigs are handled separately below
    bam_handles = []
    for file_name, file_format in bamOrBwFileList:
        if file_format == 'bam':
            bam_handles.append(bamHandler.openBam(file_name))

    if bam_handles:
        genome_chunk_length = getGenomeChunkLength(bam_handles, tileSize)
        # sanity check: the inputs must belong to the same assembly,
        # judged by their shared chromosome names
        chrom_names_and_size, __ = getCommonChrNames(bam_handles, verbose=False)
    else:
        genome_chunk_length = int(10e6)
        bigwig_names = []
        for file_name, file_format in bamOrBwFileList:
            if file_format == 'bigwig':
                bigwig_names.append(file_name)

        seen_twice = []
        size_by_chrom = {}
        for bw_name in bigwig_names:
            bw_handle = pyBigWig.open(bw_name)
            for chrom, chrom_len in list(bw_handle.chroms().items()):
                if chrom not in size_by_chrom:
                    size_by_chrom[chrom] = chrom_len
                else:
                    seen_twice.append(chrom)
                    if size_by_chrom[chrom] != chrom_len:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "bigwig files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chrom, size_by_chrom[chrom],
                                  bigwig_names[0], chrom_len, bw_name))
                        size_by_chrom[chrom] = min(size_by_chrom[chrom], chrom_len)
            bw_handle.close()

        # keep only chromosomes present in more than one bigwig
        chrom_names_and_size = [(chrom, chrom_len)
                                for chrom, chrom_len in size_by_chrom.items()
                                if chrom in seen_twice]

    if region:
        # a region restriction also carries the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chrom_names_and_size,
        genomeChunkLength=genome_chunk_length,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors)

    # stitch the per-chunk temp bedgraph files into one output
    merged = open(outputFileName + ".bg", 'wb')
    for chunk_file in res:
        if not chunk_file:
            continue
        src = open(chunk_file, 'rb')
        shutil.copyfileobj(src, merged)
        src.close()
        os.remove(chunk_file)

    merged_name = merged.name
    merged.close()
    if format == 'bedgraph':
        os.rename(merged_name, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chrom_names_and_size, merged_name, outputFileName, True)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(merged_name)
def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    func_to_call : str
        function name to be called to convert the list of coverages computed
        for each bam file at each position into a single value. An example
        is a function that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

    out_file_name : str
        name of the file to save the resulting data.

    smoothLength : int
        Distance in bp for smoothing the coverage per tile.
    """
    # store smoothLength on the instance so the mapReduce workers can see it
    self.__dict__["smoothLength"] = smoothLength

    # only fetch BAM stats when self.mappedList has not yet been populated
    # for every BAM file (i.e. on the first run)
    getStats = len(self.mappedList) < len(self.bamFilesList)
    bam_handles = []
    for x in self.bamFilesList:
        if getStats:
            bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=self.numberOfProcessors)
            self.mappedList.append(mapped)
            self.statsList.append(stats)
        else:
            bam = bamHandler.openBam(x)
        bam_handles.append(bam)

    genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # NOTE(review): dumps instance attributes to stderr unconditionally
    # (mappedList/statsList excluded as they can be large) — looks like
    # diagnostic output; confirm it is intentional.
    for x in list(self.__dict__.keys()):
        if x in ["mappedList", "statsList"]:
            continue
        sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

    # each worker returns [chromName, start, end, tempFileName]
    res = mapReduce.mapReduce([func_to_call, func_args],
                              writeBedGraph_wrapper,
                              chrom_names_and_size,
                              self_=self,
                              genomeChunkLength=genome_chunk_length,
                              region=self.region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=self.numberOfProcessors)

    # Determine the sorted order of the temp files
    # (map chromosome name -> its rank so results sort genome-wide)
    chrom_order = dict()
    for i, _ in enumerate(chrom_names_and_size):
        chrom_order[_[0]] = i
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        # concatenate the sorted temp files directly into the output
        out_file = open(out_file_name, 'wb')
        for r in res:
            if r[3]:
                _foo = open(r[3], 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(r[3])
        out_file.close()
    else:
        # hand the ordered temp-file list to the bigwig converter
        bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name)