def main(argv=None):
    """Script main: apply the selected methods to one or more BAM files.

    Parses command line options in sys.argv, unless *argv* is given.

    Input is either the file given through ``options.stdin`` (or ``-``
    for stdin), in which case the result is written to stdout, or -
    with ``--inplace`` - the BAM files given as positional arguments.
    In-place files are written to a temporary file which is then moved
    over the original and re-indexed.

    The ``filter`` method is handled by ``_bam2bam.filter_bam``; all
    other methods are chained as generators over the reads of the
    input file and the result is written read by read.

    Raises:
        ValueError: if ``--inplace`` is given without BAM files or with
            ``-``, if filtering by mismatches lacks a reference BAM
            file, if unstripping lacks a fastq file or sees a duplicate
            read name, or if downsampling lacks a size.
        OSError: if a fastq file used for unstripping does not exist.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--methods", dest="methods", type="choice",
        action="append",
        choices=("filter",
                 "keep-first-base",
                 "set-nh",
                 "set-sequence",
                 "strip-sequence",
                 "strip-quality",
                 "unstrip",
                 "unset-unmapped-mapq",
                 "downsample-single",
                 "downsample-paired"),
        help="methods to apply [%default]")

    parser.add_option(
        "--strip-method", dest="strip_method", type="choice",
        choices=("all", "match"),
        help="define which sequences/qualities to strip. "
        "match means that stripping only applies to entries "
        "without mismatches (requires NM tag to be present). "
        "[%default]")

    parser.add_option(
        "--filter-method", dest="filter_methods",
        action="append", type="choice",
        choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
        help="filter method to apply to remove alignments "
        "from a bam file. Multiple methods can be supplied "
        "[%default]")

    parser.add_option(
        "--reference-bam-file", dest="reference_bam", type="string",
        help="bam-file to filter with [%default]")

    parser.add_option(
        "--force-output", dest="force", action="store_true",
        help="force processing. Some methods such "
        "as strip/unstrip will stop processing if "
        "they think it not necessary "
        "[%default]")

    parser.add_option(
        "--output-sam", dest="output_sam", action="store_true",
        help="output in sam format [%default]")

    parser.add_option(
        "--inplace", dest="inplace", action="store_true",
        help="modify bam files in-place. Bam files need "
        "to be given "
        "as arguments. Temporary bam files are written "
        "to /tmp [%default]")

    parser.add_option(
        "--first-fastq-file", "-1", dest="fastq_pair1", type="string",
        help="fastq file with read information for first "
        "in pair or unpaired. Used for unstripping sequence "
        "and quality scores [%default]")

    parser.add_option(
        "--second-fastq-file", "-2", dest="fastq_pair2", type="string",
        help="fastq file with read information for second "
        "in pair. Used for unstripping sequence "
        "and quality scores [%default]")

    parser.add_option(
        "--downsample", dest="downsample", type="int",
        help="Number of reads to downsample to")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        downsample=None,
        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    # a file supplied through options.stdin is processed first and its
    # result is written to stdout
    if options.stdin != sys.stdin:
        from_stdin = True
        bamfiles.append(options.stdin.name)
    else:
        from_stdin = False

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)
        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")

        # tmpfile stays None whenever the output of this file goes to
        # stdout, so that the in-place finalisation below never picks
        # up a temporary file belonging to a previous iteration.
        tmpfile = None

        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            out_mode = "wh" if options.output_sam else "wb"
            pysam_out = pysam.AlignmentFile("-", out_mode, template=pysam_in)
        else:
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name,
                                            "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream.
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start; returns a
            # message if processing is deemed unnecessary, else None
            def pre_check_f(reads):
                return None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if "strip-sequence" in options.methods or \
               "strip-quality" in options.methods:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    # an input without any reads gives nothing to
                    # check, so do not index into an empty list
                    if reads and reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads and reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        # reads without NM tag are treated as having
                        # mismatches and keep their sequence
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # Note: the method name must match the "keep-first-base"
            # choice declared for --methods.
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('proccessing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # the temporary file was created with
                        # delete=False, so remove it explicitly
                        if tmpfile is not None:
                            os.unlink(tmpfile.name)
                        continue

            if "downsample-single" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=None,
                                 single_end=True,
                                 random_seed=options.random_seed)
                it = down.downsample_single()

            if "downsample-paired" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=True,
                                 single_end=None,
                                 random_seed=options.random_seed)
                it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        # only files that were written to a temporary file can be
        # replaced in-place; output sent to stdout has no tmpfile
        if options.inplace and tmpfile is not None:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script main: modify or filter BAM files.

    Parses command line options in sys.argv, unless *argv* is given.

    Without ``--inplace`` a BAM file is read from stdin and the result
    is written to stdout. With ``--inplace`` the BAM files given as
    positional arguments are written to a temporary file which is then
    moved over the original and re-indexed.

    If ``--filter`` is given, the file is passed through
    ``_bam2bam.filter_bam``; otherwise the selected modifications are
    chained as generators over the reads and written read by read.

    Raises:
        ValueError: if ``--inplace`` is given without BAM files or with
            ``-``, if filtering by mismatches lacks a reference BAM
            file, or if unstripping lacks a fastq file or sees a
            duplicate read name.
        OSError: if a fastq file used for unstripping does not exist.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--set-nh", dest="set_nh", action="store_true",
        help="sets the NH flag. The file needs to be sorted by "
        "readname [%default]")

    parser.add_option(
        "--unset-unmapped-mapq", dest="unset_unmapped_mapq",
        action="store_true",
        help="sets the mapping quality of unmapped reads to 0 [%default]")

    parser.add_option(
        "--set-sequence", dest="set_sequence", action="store_true",
        help="sets the sequence to 'A's (a valid base) and the quality to 'F's"
        ",which is defined in all fastq scoring schemes "
        "[%default]")

    parser.add_option(
        "--strip", dest="strip", type="choice",
        choices=("sequence", "quality", "match"),
        help="remove parts of the bam-file. Note that stripping the "
        "sequence will "
        "also strip the quality values [%default]")

    parser.add_option(
        "--unstrip", dest="unstrip", action="store_true",
        help="add sequence and quality into bam file [%default]")

    parser.add_option(
        "--filter", dest="filter", action="append", type="choice",
        choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
        help="filter bam file. The option denotes the property that is "
        "used to determine better match [%default]")

    parser.add_option(
        "--reference-bam", dest="reference_bam", type="string",
        help="bam-file to filter with [%default]")

    parser.add_option(
        "--sam", dest="output_sam", action="store_true",
        help="output in sam format [%default]")

    parser.add_option(
        "--inplace", dest="inplace", action="store_true",
        help="modify bam files in-place. Bam files need to be given "
        "as arguments. Temporary bam files are written to /tmp [%default]")

    parser.add_option(
        "--fastq1", "-1", dest="fastq_pair1", type="string",
        help="fastq file with read information for first in pair or "
        "unpaired [%default]")

    parser.add_option(
        "--fastq2", "-2", dest="fastq_pair2", type="string",
        help="fastq file with read information for second in pair [%default]")

    # the two help fragments need a separating space, otherwise the
    # message reads "... consider thefirst base ..."
    parser.add_option(
        "--keep-first-base", dest="keep_first_base", action="store_true",
        help="keep first base of reads such that gtf2table.py will only "
        "consider the "
        "first base in its counts")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.inplace:
        bamfiles = args
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")
    else:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            out_mode = "wh" if options.output_sam else "wb"
            pysam_out = pysam.Samfile("-", out_mode, template=pysam_in)
        else:
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            # release the reference file once filtering is done
            if pysam_ref:
                pysam_ref.close()

            # Do not write the summary into the middle of a SAM/BAM
            # stream: it is only safe if the alignments go to a
            # temporary file or the log is a stream other than stdout.
            if bamfile != "-" or options.stdlog != options.stdout:
                options.stdlog.write(
                    "category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if options.strip is not None:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def strip_match(i):
                    for read in i:
                        # reads without NM tag are treated as having
                        # mismatches and keep their sequence
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                elif options.strip == "quality":
                    it = strip_quality(it)
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read and output
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()