def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    The BAM input is read from the first positional argument if one is
    present, otherwise from stdin (or from ``options.stdin`` when that has
    been redirected). All counting is delegated to ``bam2stats_count``; this
    function only formats the results:

    * a tab-separated summary table with the columns ``category``,
      ``counts``, ``percent`` and ``of`` is written to ``options.stdout``;
    * the auxiliary tables ``nm``, ``nh_all``, ``nh`` and ``mapq`` are
      written through ``E.open_output_file`` when data for them exists;
    * ``summaries`` and ``histogram`` are written when ``bam2stats_count``
      returns a per-read details data frame.

    Raises
    ------
    IOError
        If ``--fastq-file`` is given but the file does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--mask-bed-file", "--mask-gff-file", dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f", "--ignore-masked-reads", dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i", "--num-reads", dest="input_reads", type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d", "--output-details", dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "--output-readmap", dest="output_readmap",
        action="store_true",
        help="output map between read name and "
        "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details", dest="add_alignment_details",
        action="store_true",
        help="add alignment details to per-read details. Implies --output-details "
        "[%default]")

    parser.add_option(
        "-q", "--fastq-file", dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts", dest="detailed_count",
        action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    # alignment details are part of the per-read details file
    if options.add_alignment_details:
        options.output_details = True

    # is_stdin stays True unless a real file name is given; it decides below
    # whether per-read counters can be used or read counts must be inferred.
    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected"
               % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        """write one summary row; the percentage is formatted by iotools."""
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text,
                                         numerator,
                                         percent,
                                         base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs,
           "alignments_total",
           counter.alignments_input,
           counter.alignments_input,
           "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs,
           "alignments_mapped",
           nalignments_mapped,
           counter.alignments_input,
           'alignments_total')
    _write(outs,
           "alignments_unmapped",
           nalignments_unmapped,
           counter.alignments_input,
           'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    # "unmapped" has been reported above relative to all alignments; the
    # remaining flags are reported relative to mapped alignments.
    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs,
               'alignments_' + flag,
               counts,
               nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs,
               "alignments_masked",
               counter.alignments_masked,
               nalignments_mapped,
               'alignments_mapped')
        _write(outs,
               "alignments_notmasked",
               counter.alignments_notmasked,
               nalignments_mapped,
               'alignments_mapped')

    _write(outs,
           "alignments_filtered",
           counter.alignments_filtered,
           nalignments_mapped,
           "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs,
               "alignments_duplicates",
               counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)
        # unique = filtered alignments that are not flagged as duplicates
        _write(outs,
               "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs,
               "reads_total",
               counter.total_read,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_unmapped",
               counter.total_read_is_unmapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped",
               counter.total_read_is_mapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_missing",
               counter.total_read_is_missing,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped_unique",
               counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_multimapping",
               counter.total_read_is_mmap,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped,
               'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped,
                 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped,
                        100.0 * counter.total_pair_is_unmapped /
                        counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq,
                 100.0 * counter.total_pair_is_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq,
                 100.0 * counter.total_pair_is_incomplete_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap,
                 100.0 * counter.total_pair_is_incomplete_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate,
                 100.0 * counter.total_pair_is_proper_duplicate /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap,
                 100.0 * counter.total_pair_is_proper_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq,
                 100.0 * counter.total_pair_not_proper_uniq /
                 counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other,
                        100.0 * counter.total_pair_is_other /
                        counter.total_pairs))

            # NOTE(review): the read1/read2 multimapping rows below reuse the
            # label "reads_multimapping" and the read*_missing rows divide by
            # the mapped count while naming read*_total as base. This looks
            # unintended but is kept as-is because downstream parsers may
            # depend on the current output - confirm before changing.
            nread1_total = counter.total_read1
            _write(outs,
                   "read1_total",
                   counter.total_read1,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_unmapped",
                   counter.total_read1_is_unmapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped",
                   counter.total_read1_is_mapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            _write(outs,
                   "reads_multimapping",
                   counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            _write(outs,
                   "read1_missing",
                   counter.total_read1_is_missing,
                   counter.total_read1_is_mapped,
                   'read1_total')

            nread2_total = counter.total_read2
            _write(outs,
                   "read2_total",
                   counter.total_read2,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_unmapped",
                   counter.total_read2_is_unmapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped",
                   counter.total_read2_is_mapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            _write(outs,
                   "reads_multimapping",
                   counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            _write(outs,
                   "read2_missing",
                   counter.total_read2_is_missing,
                   counter.total_read2_is_mapped,
                   'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs,
                   "pairs_total",
                   pairs_total,
                   pairs_total,
                   "pairs_total")
            _write(outs,
                   "pairs_mapped",
                   pairs_mapped,
                   pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    # Auxiliary tables. The context manager makes sure each file is closed
    # even if writing a row fails.
    if options.force_output or len(nm_filtered) > 0:
        with E.open_output_file("nm", "w") as outfile:
            outfile.write("NM\talignments\n")
            if len(nm_filtered) > 0:
                for x in range(0, max(nm_filtered.keys()) + 1):
                    outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
            else:
                outfile.write("0\t%i\n" % (counter.filtered))

    if options.force_output or len(nh_all) > 1:
        with E.open_output_file("nh_all", "w") as outfile:
            outfile.write("NH\treads\n")
            if len(nh_all) > 0:
                writeNH(outfile, nh_all, max_hi)
            else:
                # assume all are unique if NH flag not set
                outfile.write("1\t%i\n" % (counter.mapped_reads))

    if options.force_output or len(nh_filtered) > 1:
        with E.open_output_file("nh", "w") as outfile:
            outfile.write("NH\treads\n")
            if len(nh_filtered) > 0:
                writeNH(outfile, nh_filtered, max_hi)
            else:
                # assume all are unique if NH flag not set
                outfile.write("1\t%i\n" % (counter.filtered))

    if options.force_output or len(mapq_all) > 1:
        with E.open_output_file("mapq", "w") as outfile:
            outfile.write("mapq\tall_reads\tfiltered_reads\n")
            for x in range(0, max(mapq_all.keys()) + 1):
                outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))

    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(
                outf, sep="\t", index_label="metric")

        # 100 bins of width 0.01 covering [0, 1]
        bins = numpy.arange(0, 1.01, 0.01)
        # DataFrame.from_items() no longer exists in pandas >= 1.0; build the
        # frame from a dict and pass ``columns`` to keep the column order.
        column_names = list(details_df.columns)
        histogram_df = pandas.DataFrame(
            {x: numpy.histogram(details_df[x].dropna(), bins=bins)[0]
             for x in column_names},
            columns=column_names)

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        # drop bins that are empty in every column
        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads alignments in BAM format from stdin and compares each mapped read
    against the exons loaded from ``--exons-file``. Reads are split into
    spliced reads (CIGAR contains operation code 3) and unspliced reads, and
    for each class the function counts how many overlap exons and by how
    many bases they run over (or fall short of) the exon boundaries.

    Two outputs are produced: a histogram of over-/underrun sizes written to
    the ``overrun`` output file, and a ``category``/``counts`` summary table
    written to ``args.stdout``.

    Raises
    ------
    ValueError
        If no exons file has been supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e", "--exons-file", "--gtf-file",
                        dest="filename_exons", type=str, metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    # fail early with a clear message instead of trying to open ``None``
    if not args.filename_exons:
        raise ValueError(
            "no exons file given, please supply --exons-file")

    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    ninput = 0
    nunmapped = 0

    # Histograms. The spliced histogram stores underruns (negative values)
    # below ``overrun_offset`` and overruns at or above it; index
    # ``overrun_offset`` itself is the exact-match bin.
    overrun_offset = args.read_length + 10
    nspliced_overrun = [0] * (2 * overrun_offset)
    nunspliced_overrun = [0] * overrun_offset
    last_spliced_bin = len(nspliced_overrun) - 1
    last_unspliced_bin = len(nunspliced_overrun) - 1

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    try:
        for read in pysam_in:
            ninput += 1
            if read.is_unmapped:
                nunmapped += 1
                continue

            # check for BAM_CREF_SKIP code in cigar string
            cigar = read.cigar
            is_spliced = 3 in [x[0] for x in cigar]

            contig = pysam_in.getrname(read.tid)
            start = read.pos
            end = read.aend
            if is_spliced:
                # count both ends
                nspliced += 1

                # only the simple case (three CIGAR operations) is analysed
                if len(cigar) != 3:
                    nspliced_ignored += 1
                    continue

                start5, end5 = start, start + cigar[0][1]
                start3, end3 = end - cigar[2][1], end
                try:
                    overlap3 = list(exons.get(contig, start3, end3))
                    overlap5 = list(exons.get(contig, start5, end5))
                except KeyError:
                    # contig is not present in the exon index
                    overlap3 = overlap5 = []

                ovl3 = len(overlap3)
                ovl5 = len(overlap5)
                o3 = o5 = None
                if not ovl3 and not ovl5:
                    nspliced_nooverlap += 1
                elif ovl3 and not ovl5:
                    nspliced_halfoverlap += 1
                    o3 = _splice_overrun(start3, end3, overlap3)
                elif ovl5 and not ovl3:
                    nspliced_halfoverlap += 1
                    o5 = _splice_overrun(start5, end5, overlap5)
                else:
                    # both overlap
                    nspliced_bothoverlap += 1
                    o3 = _splice_overrun(start3, end3, overlap3)
                    o5 = _splice_overrun(start5, end5, overlap5)

                for end_overrun in (o3, o5):
                    if end_overrun is None:
                        continue
                    if end_overrun == 0:
                        nspliced_exact += 1
                    else:
                        nspliced_inexact += 1
                    # clamp at both ends so that values larger than the
                    # histogram range land in the outermost bin instead of
                    # raising an IndexError
                    spliced_bin = min(last_spliced_bin,
                                      max(0, overrun_offset + end_overrun))
                    nspliced_overrun[spliced_bin] += 1
            else:
                nunspliced += 1
                try:
                    overlap = list(exons.get(contig, start, end))
                except KeyError:
                    # contig is not present in the exon index
                    overlap = []

                if len(overlap) == 0:
                    nunspliced_nooverlap += 1
                elif len(overlap) >= 1:
                    nunspliced_overlap += 1
                    # multiple overlap - merge exons (usually: small introns)
                    exon_start = min([x[0] for x in overlap])
                    exon_end = max([x[1] for x in overlap])
                    ostart = max(0, exon_start - start)
                    oend = max(0, end - exon_end)
                    overrun = ostart + oend
                    # overruns beyond the histogram range go into the last bin
                    nunspliced_overrun[min(last_unspliced_bin, overrun)] += 1
    finally:
        pysam_in.close()

    # output histograms
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    # reversed so that index i holds the count for an underrun of i bases
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()

    with E.open_output_file("overrun") as outfile:
        outfile.write(
            "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
        )
        for x, v in enumerate(
                zip(nunspliced_overrun, _nspliced_overrun,
                    _nspliced_underrun)):
            outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()