def main(argv=None):
    """script main.

    Convert a BAM file into a coverage track.

    Parses command line options in sys.argv, unless *argv* is given.
    The first positional argument is the BAM file; an optional second
    positional argument is stored as the output file name, which is
    required for bigwig output.

    Two workflows are implemented:

    1. With ``--shift-size``, ``--extend`` or ``--merge-pairs``, bed
       intervals are written to a temporary file, coverage is computed
       with ``bedtools genomecov`` and the result is converted with
       ``bedGraphToBigWig``.
    2. Otherwise the pysam pileup is written as a wiggle or bedgraph
       track to a temporary file. For bigwig output the track is
       converted with ``wigToBigWig``, otherwise the temporary file is
       copied to stdout.

    The temporary directory is removed when the function exits,
    whether it succeeds, returns early or raises.

    Returns
    -------
    None on success. If ``wigToBigWig`` exits with a non-zero return
    code, the negated return code is returned; if it can not be
    executed, 1 is returned.

    Raises
    ------
    ValueError
        If no BAM file is given, if shift/extend is requested for a
        format other than bigwig, if bigwig output is requested
        without an output file name, if no pairs/reads are available
        for merging/scaling, or if the output format has no writer in
        the pileup workflow.
    OSError
        If ``wigToBigWig`` is not found in the path.
    NotImplementedError
        If scaling is requested in the pileup workflow.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed",
                               "bigwig", "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s", "--shift-size", dest="shift", type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p", "--wiggle-span", dest="span", type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base", dest="scale_base", type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method", dest="scale_method", type="choice",
                      choices=("none", "reads",),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)

    # everything below runs inside try/finally so that the temporary
    # directory is also removed on errors and on early returns.
    try:
        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # Create dictionary of contig sizes
        contig_sizes = dict(zip(samfile.references, samfile.lengths))

        # write contig sizes
        outfile_size = IOTools.open_file(tmpfile_sizes, "w")
        for contig, size in sorted(contig_sizes.items()):
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        # Shift and extend only available for bigwig format
        if options.shift or options.extend:
            if options.output_format != "bigwig":
                raise ValueError(
                    "shift and extend only available for bigwig output")

        if options.output_format == "bigwig":
            # Output filename required for bigwig computation
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify an output file for bigwig computation.")

            # check required executable file is in the path
            executable_name = "wigToBigWig"
            executable = IOTools.which(executable_name)
            if not executable:
                raise OSError("could not find %s in path." % executable_name)

        # All formats are first written to the same temporary file;
        # only the log message differs.
        outfile = IOTools.open_file(tmpfile_wig, "w")
        if options.output_format == "bigwig":
            E.info("starting output to %s" % tmpfile_wig)
        else:
            E.info("starting output to stdout")

        # Set up output write functions
        if options.output_format in ("wiggle", "bigwig"):
            # wiggle is one-based, so add 1, also step-size is 1, so need
            # to output all bases
            if options.span == 1:
                def outf(outfile, contig, start, end, val):
                    outfile.write(
                        "".join(["%i\t%i\n" % (x, val)
                                 for x in range(start + 1, end + 1)]))
            else:
                outf = SpanWriter(options.span)
        elif options.output_format == "bedgraph":
            # bed is 0-based, open-closed
            def outf(outfile, contig, start, end, val):
                outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))
        else:
            # "bed" and "bigbed" are accepted by the parser, but there
            # is no writer for them. This is reported in the pileup
            # workflow below instead of failing with a NameError.
            outf = None

        # initialise counters
        ninput, nskipped, ncontigs = 0, 0, 0

        # set output file name
        output_filename_pattern = options.output_filename_pattern

        # shift and extend or merge pairs. Output temporay bed file
        if options.shift > 0 or options.extend > 0 or options.merge_pairs:
            # Workflow 1: convert to bed intervals and use bedtools
            # genomecov to build a coverage file.
            # Convert to bigwig with UCSC tools bedGraph2BigWig

            if options.merge_pairs:
                # merge pairs using bam2bed
                E.info("merging pairs to temporary file")
                counter = _bam2bed.merge_pairs(
                    samfile,
                    outfile,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    bed_format=3)
                E.info("merging results: {}".format(counter))
                if counter.output == 0:
                    raise ValueError("no pairs output after merging")
            else:
                # create bed file with shifted/extended tags
                shift, extend = options.shift, options.extend
                shift_extend = shift + extend
                counter = E.Counter()

                for contig in samfile.references:
                    E.debug("output for %s" % contig)
                    lcontig = contig_sizes[contig]

                    for read in samfile.fetch(contig):
                        if read.is_reverse:
                            start = max(
                                0, read.pos + read.alen - shift_extend)
                        else:
                            start = max(0, read.pos + shift)

                        # intervals extending beyond contig are removed
                        if start >= lcontig:
                            continue

                        end = min(lcontig, start + extend)
                        outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                        counter.output += 1

            outfile.close()

            if options.scale_method == "reads":
                # without any output there is nothing to normalize by;
                # fail with a clear message instead of dividing by zero.
                if counter.output == 0:
                    raise ValueError(
                        "no reads output, can not scale by number of reads")
                scale_factor = float(options.scale_base) / counter.output

                E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                       (options.scale_method,
                        counter.output,
                        scale_factor))
                scale = "-scale %f" % scale_factor
            else:
                scale = ""

            # Convert bed file to coverage file (bedgraph)
            tmpfile_bed = os.path.join(tmpdir, "bed")
            E.info("computing coverage")
            # calculate coverage - format is bedgraph
            statement = (
                "bedtools genomecov -bg -i %s %s -g %s > %s" %
                (tmpfile_wig, scale, tmpfile_sizes, tmpfile_bed))
            E.run(statement)

            # Convert bedgraph to bigwig
            E.info("converting to bigwig")
            tmpfile_sorted = os.path.join(tmpdir, "sorted")
            statement = (
                "sort -k 1,1 -k2,2n %s > %s;"
                "bedGraphToBigWig %s %s %s" %
                (tmpfile_bed, tmpfile_sorted,
                 tmpfile_sorted, tmpfile_sizes, output_filename_pattern))
            E.run(statement)

        else:
            # Workflow 2: use pysam column iterator to build a
            # wig file. Then convert to bigwig of bedgraph file
            # with UCSC tools.
            def column_iter(iterator):
                """group adjacent pileup columns of equal depth.

                Yields tuples of (start, end, n), where start and end
                are the positions of the first and last column of a
                run and n is the depth shared by all columns in it.
                Nothing is yielded for an empty iterator.
                """
                start = None
                end = 0
                n = None
                for t in iterator:
                    if t.pos - end > 1 or n != t.n:
                        if start is not None:
                            yield start, end, n
                        start = t.pos
                        end = t.pos
                        n = t.n
                    end = t.pos
                # a contig without pileup columns has no run to
                # report; yielding (None, 0, None) would fail in the
                # comparison ``val > 0`` under python 3.
                if start is not None:
                    yield start, end, n

            if options.scale_method != "none":
                raise NotImplementedError(
                    "scaling not implemented for pileup method")

            if outf is None:
                raise ValueError(
                    "output format `%s` not supported for pileup method" %
                    options.output_format)

            # Bedgraph track definition
            if options.output_format == "bedgraph":
                outfile.write("track type=bedGraph\n")

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                # Write wiggle header
                if options.output_format in ("wiggle", "bigwig"):
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, options.span))

                # Generate pileup per contig using pysam and iterate
                # over columns.
                # NOTE(review): column_iter reports ``end`` as the last
                # covered position, while the writers above treat it
                # as an exclusive end - confirm this is intended.
                for start, end, val in column_iter(samfile.pileup(contig)):
                    # patch: there was a problem with bam files and reads
                    # overextending at the end. These are usually Ns, but
                    # need to check as otherwise wigToBigWig fails.
                    if lcontig <= end:
                        E.warn("read extending beyond contig: %s: %i > %i" %
                               (contig, end, lcontig))
                        end = lcontig
                        if start >= end:
                            continue

                    if val > 0:
                        outf(outfile, contig, start, end, val)
                ncontigs += 1

            # Give a SpanWriter the chance to write pending output. The
            # previous test ``type(outf) == type(SpanWriter)`` compared
            # against ``type`` itself and thus never matched.
            if isinstance(outf, SpanWriter):
                outf.flush(outfile)
            # Close output file before it is read again below
            outfile.close()

            E.info("finished output")

            # Report counters
            E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
                   (ninput, ncontigs, nskipped))

            # Convert to binary formats
            if options.output_format == "bigwig":
                E.info("starting %s conversion" % executable)
                try:
                    # argument list instead of a shell string, so that
                    # file names with spaces or shell characters work.
                    retcode = subprocess.call(
                        [executable,
                         tmpfile_wig,
                         tmpfile_sizes,
                         output_filename_pattern])
                    if retcode != 0:
                        E.warn("%s terminated with signal: %i" %
                               (executable, -retcode))
                        return -retcode
                except OSError as msg:
                    E.warn("Error while executing bigwig: %s" % msg)
                    return 1
                E.info("finished bigwig conversion")
            else:
                with open(tmpfile_wig) as inf:
                    sys.stdout.write(inf.read())
    finally:
        # Cleanup temp files
        shutil.rmtree(tmpdir)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts a BAM file into bed formatted intervals that are written
    to ``options.stdout``. The BAM file is the first positional
    argument; without one, "-" is passed to pysam as the file name.

    With ``--merge-pairs`` the conversion is delegated to
    ``_bam2bed.merge_pairs``. Otherwise one interval is written per
    mapped read; its length is the sum of the match, deletion and
    reference-skip operations in the read's cigar.

    ``--bed-format`` selects the number of columns in both modes. For
    per-read output the columns are, in order: contig, start, end,
    read name, mapping quality and strand, of which the first
    ``bed_format`` are written.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads and output interval "
                      "for entire fragment [default=%default]. ")

    parser.add_option("--max-insert-size", dest="max_insert_size", type="int",
                      help="only merge paired-end reads if they are less than "
                      "# bases apart. "
                      " 0 turns off this filter [default=%default]. ")

    parser.add_option("--min-insert-size", dest="min_insert_size", type="int",
                      help="only merge paired-end reads if they are at "
                      "least # bases apart. "
                      " 0 turns off this filter [default=%default]. ")

    parser.add_option("--bed-format", dest="bed_format", type="choice",
                      choices=('3', '4', '5', '6'),
                      help="bed format to output. "
                      " [default=%default]")

    parser.set_defaults(
        region=None,
        call_peaks=None,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        bed_format='6',
    )

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    samfile = pysam.AlignmentFile(args[0], "rb")

    # the option is a "choice" and thus parsed as a string
    options.bed_format = int(options.bed_format)

    if options.merge_pairs is not None:
        counter = _bam2bed.merge_pairs(
            samfile,
            options.stdout,
            min_insert_size=options.min_insert_size,
            max_insert_size=options.max_insert_size,
            bed_format=options.bed_format)

        E.info("category\tcounts\n%s\n" % counter.asTable())

    else:
        # use until_eof. Files from stdin have no index
        it = samfile.fetch(until_eof=True)

        # more comfortable cigar parsing will
        # come with the next pysam release
        BAM_CMATCH = 0
        BAM_CDEL = 2
        BAM_CREF_SKIP = 3
        take = (BAM_CMATCH, BAM_CDEL, BAM_CREF_SKIP)

        # Honour --bed-format for per-read output as well: build the
        # row template once from the first ``bed_format`` columns. For
        # the default of 6 this is the full bed6 line as before.
        column_formats = ("%s", "%d", "%d", "%s", "%d", "%c")
        ncolumns = options.bed_format
        row_format = "\t".join(column_formats[:ncolumns]) + "\n"

        outfile = options.stdout

        for read in it:
            if read.is_unmapped:
                continue

            # number of reference bases covered by the alignment
            t = sum(l for op, l in read.cigar if op in take)

            strand = "-" if read.is_reverse else "+"

            # IMS: converted rname to reference name
            fields = (samfile.getrname(read.rname),
                      read.pos,
                      read.pos + t,
                      read.qname,
                      read.mapq,
                      strand)
            outfile.write(row_format % fields[:ncolumns])

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts a BAM file into bed formatted intervals that are written
    to ``options.stdout``. The BAM file is the first positional
    argument; without one, "-" is passed to pysam as the file name.

    With ``--merge-pairs`` the conversion is delegated to
    ``_bam2bed.merge_pairs``. Otherwise one interval is written per
    mapped read; its length is the sum of the match, deletion and
    reference-skip operations in the read's cigar.

    ``--bed-format`` selects the number of columns in both modes. For
    per-read output the columns are, in order: contig, start, end,
    read name, mapping quality and strand, of which the first
    ``bed_format`` are written.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads and output interval "
                      "for entire fragment [default=%default]. ")

    parser.add_option("--max-insert-size", dest="max_insert_size", type="int",
                      help="only merge paired-end reads if they are less than "
                      "# bases apart. "
                      " 0 turns off this filter [default=%default]. ")

    parser.add_option("--min-insert-size", dest="min_insert_size", type="int",
                      help="only merge paired-end reads if they are at "
                      "least # bases apart. "
                      " 0 turns off this filter [default=%default]. ")

    parser.add_option("--bed-format", dest="bed_format", type="choice",
                      choices=('3', '4', '5', '6'),
                      help="bed format to output. "
                      " [default=%default]")

    parser.set_defaults(
        region=None,
        call_peaks=None,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        bed_format='6',
    )

    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    samfile = pysam.Samfile(args[0], "rb")

    # the option is a "choice" and thus parsed as a string
    options.bed_format = int(options.bed_format)

    if options.merge_pairs is not None:
        counter = _bam2bed.merge_pairs(
            samfile,
            options.stdout,
            min_insert_size=options.min_insert_size,
            max_insert_size=options.max_insert_size,
            bed_format=options.bed_format)

        E.info("category\tcounts\n%s\n" % counter.asTable())

    else:
        # use until_eof. Files from stdin have no index
        it = samfile.fetch(until_eof=True)

        # more comfortable cigar parsing will
        # come with the next pysam release
        BAM_CMATCH = 0
        BAM_CDEL = 2
        BAM_CREF_SKIP = 3
        take = (BAM_CMATCH, BAM_CDEL, BAM_CREF_SKIP)

        # Honour --bed-format for per-read output as well: build the
        # row template once from the first ``bed_format`` columns. For
        # the default of 6 this is the full bed6 line as before.
        column_formats = ("%s", "%d", "%d", "%s", "%d", "%c")
        ncolumns = options.bed_format
        row_format = "\t".join(column_formats[:ncolumns]) + "\n"

        outfile = options.stdout

        for read in it:
            if read.is_unmapped:
                continue

            # number of reference bases covered by the alignment
            t = sum(l for op, l in read.cigar if op in take)

            strand = "-" if read.is_reverse else "+"

            # IMS: converted rname to reference name
            fields = (samfile.getrname(read.rname),
                      read.pos,
                      read.pos + t,
                      read.qname,
                      read.mapq,
                      strand)
            outfile.write(row_format % fields[:ncolumns])

    E.Stop()
def main(argv=None):
    """script main.

    Convert a BAM file into a coverage track.

    Parses command line options in sys.argv, unless *argv* is given.
    The first positional argument is the BAM file; an optional second
    positional argument is stored as the output file name, which is
    required for bigwig output.

    Two workflows are implemented:

    1. With ``--shift-size``, ``--extend`` or ``--merge-pairs``, bed
       intervals are written to a temporary file, coverage is computed
       with ``bedtools genomecov`` and the result is converted with
       ``bedGraphToBigWig``.
    2. Otherwise the pysam pileup is written as a wiggle or bedgraph
       track to a temporary file. For bigwig output the track is
       converted with ``wigToBigWig``, otherwise the temporary file is
       copied to stdout.

    The temporary directory is removed when the function exits,
    whether it succeeds, returns early or raises.

    Returns
    -------
    None on success. If ``wigToBigWig`` exits with a non-zero return
    code, the negated return code is returned; if it can not be
    executed, 1 is returned.

    Raises
    ------
    ValueError
        If no BAM file is given, if shift/extend is requested for a
        format other than bigwig, if bigwig output is requested
        without an output file name, if no pairs/reads are available
        for merging/scaling, or if the output format has no writer in
        the pileup workflow.
    OSError
        If ``wigToBigWig`` is not found in the path.
    NotImplementedError
        If scaling is requested in the pileup workflow.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=(
                          "bedgraph", "wiggle", "bigbed", "bigwig", "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s", "--shift-size", dest="shift", type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p", "--wiggle-span", dest="span", type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base", dest="scale_base", type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method", dest="scale_method", type="choice",
                      choices=("none", "reads",),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.Samfile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)

    # everything below runs inside try/finally so that the temporary
    # directory is also removed on errors and on early returns.
    try:
        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # Create dictionary of contig sizes
        contig_sizes = dict(zip(samfile.references, samfile.lengths))

        # write contig sizes
        outfile_size = IOTools.openFile(tmpfile_sizes, "w")
        for contig, size in sorted(contig_sizes.items()):
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        # Shift and extend only available for bigwig format
        if options.shift or options.extend:
            if options.output_format != "bigwig":
                raise ValueError(
                    "shift and extend only available for bigwig output")

        if options.output_format == "bigwig":
            # Output filename required for bigwig computation
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify an output file for bigwig computation.")

            # check required executable file is in the path
            executable_name = "wigToBigWig"
            executable = IOTools.which(executable_name)
            if not executable:
                raise OSError("could not find %s in path." % executable_name)

        # All formats are first written to the same temporary file;
        # only the log message differs.
        outfile = IOTools.openFile(tmpfile_wig, "w")
        if options.output_format == "bigwig":
            E.info("starting output to %s" % tmpfile_wig)
        else:
            E.info("starting output to stdout")

        # Set up output write functions
        if options.output_format in ("wiggle", "bigwig"):
            # wiggle is one-based, so add 1, also step-size is 1, so need
            # to output all bases
            if options.span == 1:
                def outf(outfile, contig, start, end, val):
                    outfile.write(
                        "".join(["%i\t%i\n" % (x, val)
                                 for x in range(start + 1, end + 1)]))
            else:
                outf = SpanWriter(options.span)
        elif options.output_format == "bedgraph":
            # bed is 0-based, open-closed
            def outf(outfile, contig, start, end, val):
                outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))
        else:
            # "bed" and "bigbed" are accepted by the parser, but there
            # is no writer for them. This is reported in the pileup
            # workflow below instead of failing with a NameError.
            outf = None

        # initialise counters
        ninput, nskipped, ncontigs = 0, 0, 0

        # set output file name
        output_filename_pattern = options.output_filename_pattern

        # shift and extend or merge pairs. Output temporay bed file
        if options.shift > 0 or options.extend > 0 or options.merge_pairs:
            # Workflow 1: convert to bed intervals and use bedtools
            # genomecov to build a coverage file.
            # Convert to bigwig with UCSC tools bedGraph2BigWig

            if options.merge_pairs:
                # merge pairs using bam2bed
                E.info("merging pairs to temporary file")
                counter = _bam2bed.merge_pairs(
                    samfile,
                    outfile,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    bed_format=3)
                E.info("merging results: {}".format(counter))
                if counter.output == 0:
                    raise ValueError("no pairs output after merging")
            else:
                # create bed file with shifted/extended tags
                shift, extend = options.shift, options.extend
                shift_extend = shift + extend
                counter = E.Counter()

                for contig in samfile.references:
                    E.debug("output for %s" % contig)
                    lcontig = contig_sizes[contig]

                    for read in samfile.fetch(contig):
                        if read.is_reverse:
                            start = max(
                                0, read.pos + read.alen - shift_extend)
                        else:
                            start = max(0, read.pos + shift)

                        # intervals extending beyond contig are removed
                        if start >= lcontig:
                            continue

                        end = min(lcontig, start + extend)
                        outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                        counter.output += 1

            outfile.close()

            if options.scale_method == "reads":
                # without any output there is nothing to normalize by;
                # fail with a clear message instead of dividing by zero.
                if counter.output == 0:
                    raise ValueError(
                        "no reads output, can not scale by number of reads")
                scale_factor = float(options.scale_base) / counter.output

                E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                       (options.scale_method,
                        counter.output,
                        scale_factor))
                scale = "-scale %f" % scale_factor
            else:
                scale = ""

            # Convert bed file to coverage file (bedgraph)
            tmpfile_bed = os.path.join(tmpdir, "bed")
            E.info("computing coverage")
            # calculate coverage - format is bedgraph
            statement = (
                "bedtools genomecov -bg -i %s %s -g %s > %s" %
                (tmpfile_wig, scale, tmpfile_sizes, tmpfile_bed))
            E.run(statement)

            # Convert bedgraph to bigwig
            E.info("converting to bigwig")
            tmpfile_sorted = os.path.join(tmpdir, "sorted")
            statement = (
                "sort -k 1,1 -k2,2n %s > %s;"
                "bedGraphToBigWig %s %s %s" %
                (tmpfile_bed, tmpfile_sorted,
                 tmpfile_sorted, tmpfile_sizes, output_filename_pattern))
            E.run(statement)

        else:
            # Workflow 2: use pysam column iterator to build a
            # wig file. Then convert to bigwig of bedgraph file
            # with UCSC tools.
            def column_iter(iterator):
                """group adjacent pileup columns of equal depth.

                Yields tuples of (start, end, n), where start and end
                are the positions of the first and last column of a
                run and n is the depth shared by all columns in it.
                Nothing is yielded for an empty iterator.
                """
                start = None
                end = 0
                n = None
                for t in iterator:
                    if t.pos - end > 1 or n != t.n:
                        if start is not None:
                            yield start, end, n
                        start = t.pos
                        end = t.pos
                        n = t.n
                    end = t.pos
                # a contig without pileup columns has no run to
                # report; yielding (None, 0, None) would fail in the
                # comparison ``val > 0`` under python 3.
                if start is not None:
                    yield start, end, n

            if options.scale_method != "none":
                raise NotImplementedError(
                    "scaling not implemented for pileup method")

            if outf is None:
                raise ValueError(
                    "output format `%s` not supported for pileup method" %
                    options.output_format)

            # Bedgraph track definition
            if options.output_format == "bedgraph":
                outfile.write("track type=bedGraph\n")

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                # Write wiggle header
                if options.output_format in ("wiggle", "bigwig"):
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, options.span))

                # Generate pileup per contig using pysam and iterate
                # over columns.
                # NOTE(review): column_iter reports ``end`` as the last
                # covered position, while the writers above treat it
                # as an exclusive end - confirm this is intended.
                for start, end, val in column_iter(samfile.pileup(contig)):
                    # patch: there was a problem with bam files and reads
                    # overextending at the end. These are usually Ns, but
                    # need to check as otherwise wigToBigWig fails.
                    if lcontig <= end:
                        E.warn("read extending beyond contig: %s: %i > %i" %
                               (contig, end, lcontig))
                        end = lcontig
                        if start >= end:
                            continue

                    if val > 0:
                        outf(outfile, contig, start, end, val)
                ncontigs += 1

            # Give a SpanWriter the chance to write pending output. The
            # previous test ``type(outf) == type(SpanWriter)`` compared
            # against ``type`` itself and thus never matched.
            if isinstance(outf, SpanWriter):
                outf.flush(outfile)
            # Close output file before it is read again below
            outfile.close()

            E.info("finished output")

            # Report counters
            E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
                   (ninput, ncontigs, nskipped))

            # Convert to binary formats
            if options.output_format == "bigwig":
                E.info("starting %s conversion" % executable)
                try:
                    # argument list instead of a shell string, so that
                    # file names with spaces or shell characters work.
                    retcode = subprocess.call(
                        [executable,
                         tmpfile_wig,
                         tmpfile_sizes,
                         output_filename_pattern])
                    if retcode != 0:
                        E.warn("%s terminated with signal: %i" %
                               (executable, -retcode))
                        return -retcode
                except OSError as msg:
                    E.warn("Error while executing bigwig: %s" % msg)
                    return 1
                E.info("finished bigwig conversion")
            else:
                with open(tmpfile_wig) as inf:
                    sys.stdout.write(inf.read())
    finally:
        # Cleanup temp files
        shutil.rmtree(tmpdir)

    E.Stop()