def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None, threads=None):
    ''' Align the reads in inReads against refDb with the "mem" subcommand.

        The reads are written to two temporary fastq files with
        samtools.bam2fq, aligned to a temporary SAM file, optionally filtered
        with self.filter_sam_on_alignment_score, and sorted into outAlign.
        outAlign is indexed only when its name ends in ".bam" or ".cram".

        options: extra command-line arguments for the aligner. "-t <threads>"
            is appended unless "-t" is already present. The caller's list is
            never modified.
        min_score_to_filter: when truthy, the threshold handed to
            filter_sam_on_alignment_score.
        threads: thread count used for the sort (and for the aligner when
            "-t" was not supplied); defaults to
            util.misc.available_cpu_count().
    '''
    # Work on a private copy so repeated calls with the same list stay clean.
    options = list(options) if options else []
    threads = threads or util.misc.available_cpu_count()
    if '-t' not in options:
        options.extend(('-t', str(threads)))

    samtools = tools.samtools.SamtoolsTool()
    fq1 = util.file.mkstempfname('.1.fastq')
    fq2 = util.file.mkstempfname('.2.fastq')
    aln_sam = util.file.mkstempfname('.sam')

    samtools.bam2fq(inReads, fq1, fq2)
    self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)
    os.unlink(fq1)
    os.unlink(fq2)

    if min_score_to_filter:
        # Filter reads in the alignment based on their alignment score
        aln_sam_filtered = util.file.mkstempfname('.sam')
        self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered, min_score_to_filter, options)
        # the unfiltered alignment is no longer needed
        os.unlink(aln_sam)
    else:
        aln_sam_filtered = aln_sam

    samtools.sort(aln_sam_filtered, outAlign, threads=threads)
    # removes whichever temporary SAM fed the sort (filtered or not)
    os.unlink(aln_sam_filtered)

    # cannot index sam files; only do so if a bam/cram is desired
    if outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def mem(self, inReads, refDb, outAlign, options=None, min_qual=None, threads=None):
    ''' Align the reads in inReads against refDb with the "mem" subcommand.

        The reads are written to two temporary fastq files with
        samtools.bam2fq, aligned to a temporary SAM file, and sorted into
        outAlign. outAlign is indexed only when its name ends in ".bam" or
        ".cram".

        options: extra command-line arguments for the aligner. "-t <threads>"
            and "-T <min_qual>" are appended unless those flags are already
            present. The caller's list is never modified.
        min_qual: value for "-T" when the flag is not in options; falls back
            to 30 when falsy.
        threads: thread count used for the sort (and for the aligner when
            "-t" was not supplied); defaults to
            util.misc.available_cpu_count().
    '''
    # Work on a private copy so repeated calls with the same list stay clean.
    options = list(options) if options else []
    threads = threads or util.misc.available_cpu_count()
    if '-t' not in options:
        options.extend(('-t', str(threads)))
    if '-T' not in options:
        min_qual = min_qual or 30
        options.extend(('-T', str(min_qual)))

    samtools = tools.samtools.SamtoolsTool()
    fq1 = util.file.mkstempfname('.1.fastq')
    fq2 = util.file.mkstempfname('.2.fastq')
    aln_sam = util.file.mkstempfname('.sam')

    samtools.bam2fq(inReads, fq1, fq2)
    self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)
    os.unlink(fq1)
    os.unlink(fq2)

    samtools.sort(aln_sam, outAlign, threads=threads)
    os.unlink(aln_sam)

    # cannot index sam files; only do so if a bam/cram is desired
    if outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None, threads=None, invert_filter=False, should_index=True):
    ''' Align the reads in inReads against refDb with the "mem" subcommand.

        Reads are streamed from samtools.bam2fq_pipe into the aligner's stdin
        (the aligner is invoked with "-p" and "-" as the reads argument), the
        alignment is optionally filtered with
        self.filter_sam_on_alignment_score, and the result is sorted into
        outAlign.

        options: extra command-line arguments for the aligner. "-t <threads>"
            is appended unless "-t" is already present. The caller's list is
            never modified.
        min_score_to_filter: when truthy, the threshold handed to
            filter_sam_on_alignment_score.
        threads: passed through util.misc.sanitize_thread_count.
        invert_filter: forwarded to filter_sam_on_alignment_score.
        should_index: when true, index outAlign if it ends in ".bam"/".cram".

        Raises subprocess.CalledProcessError if the fastq producer exits
        with a non-zero status.
    '''
    # Work on a private copy so repeated calls with the same list stay clean.
    options = list(options) if options else []

    threads = util.misc.sanitize_thread_count(threads)
    if '-t' not in options:
        options.extend(('-t', str(threads)))

    samtools = tools.samtools.SamtoolsTool()

    aln_sam = util.file.mkstempfname('.aligned.sam')
    fastq_pipe = samtools.bam2fq_pipe(inReads)
    self.execute('mem', options + ['-p', refDb, '-'], stdout=aln_sam, stdin=fastq_pipe.stdout)

    # wait() rather than poll(): poll() yields None while the producer is
    # still shutting down, which would let a failing exit code go unnoticed.
    # NOTE(review): assumes fastq_pipe is a subprocess.Popen-like object.
    if fastq_pipe.wait():
        raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))

    if min_score_to_filter:
        # Filter reads in the alignment based on their alignment score
        aln_sam_filtered = util.file.mkstempfname('.sam')
        self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered, min_score_to_filter, options, invert_filter=invert_filter)
        os.unlink(aln_sam)
    else:
        aln_sam_filtered = aln_sam

    samtools.sort(aln_sam_filtered, outAlign, threads=threads)
    os.unlink(aln_sam_filtered)

    # cannot index sam files; only do so if a bam/cram is desired
    if should_index and outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def align_cmd(self, inReads, refDb, outAlign, options=None, threads=None):
    ''' Align the reads in inReads against refDb and write a sorted alignment.

        Reads are streamed from samtools.bam2fq_pipe into the aligner's
        stdin; the aligner writes SAM ("-a") to a temporary file which is
        then sorted into outAlign. outAlign is indexed only when its name
        ends in ".bam" or ".cram".

        options: extra command-line arguments. "-t <threads>" and "-2" are
            appended unless already present. The caller's list is never
            modified.
        threads: passed through util.misc.sanitize_thread_count.

        Raises subprocess.CalledProcessError if the fastq producer exits
        with a non-zero status.
    '''
    # Copy first: the positional arguments and "-o <tmp>" appended below must
    # not leak into a list the caller may reuse for another call.
    options = list(options) if options else []

    threads = util.misc.sanitize_thread_count(threads)
    if '-t' not in options:
        options.extend(('-t', str(threads)))

    if '-2' not in options:
        options.append('-2')

    samtools = tools.samtools.SamtoolsTool()

    with util.file.tempfname('.aligned.sam') as aln_sam:
        fastq_pipe = samtools.bam2fq_pipe(inReads)
        options.extend(('-a', refDb, '-', '-o', aln_sam))
        self.execute(options, stdin=fastq_pipe.stdout)
        if fastq_pipe.wait():
            raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))
        samtools.sort(aln_sam, outAlign, threads=threads)

    # cannot index sam files; only do so if a bam/cram is desired
    if outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None, threads=None, invert_filter=False):
    ''' Align the reads in inReads against refDb with the "mem" subcommand.

        The reads are exposed as two fastq files by the samtools.bam2fq_tmp
        context manager, aligned to a temporary SAM file, optionally filtered
        with self.filter_sam_on_alignment_score, and sorted into outAlign.
        outAlign is indexed only when its name ends in ".bam" or ".cram".

        options: extra command-line arguments for the aligner. "-t <threads>"
            is appended unless "-t" is already present. The caller's list is
            never modified.
        min_score_to_filter: when truthy, the threshold handed to
            filter_sam_on_alignment_score.
        threads: passed through util.misc.sanitize_thread_count.
        invert_filter: forwarded to filter_sam_on_alignment_score.
    '''
    # Work on a private copy so repeated calls with the same list stay clean.
    options = list(options) if options else []

    threads = util.misc.sanitize_thread_count(threads)
    if '-t' not in options:
        options.extend(('-t', str(threads)))

    samtools = tools.samtools.SamtoolsTool()

    aln_sam = util.file.mkstempfname('.aligned.sam')
    with samtools.bam2fq_tmp(inReads) as (fq1, fq2):
        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

    if min_score_to_filter:
        # Filter reads in the alignment based on their alignment score
        aln_sam_filtered = util.file.mkstempfname('.sam')
        self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered, min_score_to_filter, options, invert_filter=invert_filter)
        os.unlink(aln_sam)
    else:
        aln_sam_filtered = aln_sam

    samtools.sort(aln_sam_filtered, outAlign, threads=threads)
    os.unlink(aln_sam_filtered)

    # cannot index sam files; only do so if a bam/cram is desired
    if outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def scaffold(self, contigs_fasta, ref_fasta, outAlign, divergence=20, options=None, threads=None):
    ''' Align contigs_fasta against ref_fasta and write a sorted alignment.

        An "-x" preset is chosen from divergence: "asm20" for >= 20,
        "asm10" for >= 10, otherwise "asm5". The aligner writes SAM ("-a")
        to a temporary file which is then sorted into outAlign. outAlign is
        indexed only when its name ends in ".bam" or ".cram".

        options: extra command-line arguments. "-t <threads>" and "-2" are
            appended unless already present. The caller's list is never
            modified.
        threads: passed through util.misc.sanitize_thread_count.
    '''
    # Copy first: the preset, positional arguments and "-o <tmp>" appended
    # below must not leak into a list the caller may reuse.
    options = list(options) if options else []

    threads = util.misc.sanitize_thread_count(threads)
    if '-t' not in options:
        options.extend(('-t', str(threads)))

    if '-2' not in options:
        options.append('-2')

    if divergence >= 20:
        preset = 'asm20'
    elif divergence >= 10:
        preset = 'asm10'
    else:
        preset = 'asm5'
    options.extend(('-x', preset))

    # The sort/index steps below need a samtools wrapper; it was previously
    # referenced without ever being created in this function.
    samtools = tools.samtools.SamtoolsTool()

    with util.file.tempfname('.aligned.sam') as aln_sam:
        options.extend(('-a', ref_fasta, contigs_fasta, '-o', aln_sam))
        self.execute(options)
        samtools.sort(aln_sam, outAlign, threads=threads)

    # cannot index sam files; only do so if a bam/cram is desired
    if outAlign.endswith((".bam", ".cram")):
        samtools.index(outAlign)
def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title, plot_x_limits, plot_y_limits, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, in_bam, ref_fasta, out_bam=None, sensitive=False, excludeDuplicates=False, bin_large_plots=False, binning_summary_statistic="max", JVMmemory=None, picardOptions=None, min_score_to_filter=None, aligner="bwa", aligner_options='', novoalign_license_path=None):
    ''' Take reads, align to reference with bwa or novoalign, and generate
        a coverage plot.

        in_bam is aligned against a private, freshly indexed copy of
        ref_fasta. When excludeDuplicates is set the alignment is run through
        Picard MarkDuplicates (picardOptions, JVMmemory) first. The result is
        sorted, indexed and handed to plot_coverage together with all
        plot_* / threshold / binning arguments. The aligned bam is kept only
        when out_bam is given.

        aligner must be "bwa" or "novoalign". Built-in default aligner
        options are applied only when aligner_options is None.
        sensitive adds "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    # NOTE(review): the signature default is '' rather than None, so these
    # defaults only apply when a caller passes None explicitly -- confirm
    # that this is intended.
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            # hidden option to work around kernel/cpu bug; disables
            # multithreaded file read: https://github.com/lh3/bwa/issues/102
            aligner_options = '-1'

    samtools = tools.samtools.SamtoolsTool()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()
        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts,
                          min_score_to_filter=min_score_to_filter)
    elif aligner == "novoalign":
        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)
        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam, ref_indexed, aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError
        opts = list(picardOptions or [])
        # only reserve a temp path when MarkDuplicates will actually write it
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed, dupe_removal_out_metrics,
            picardOptions=opts, JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  plot_x_limits, plot_y_limits, base_q_threshold,
                  mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, bin_large_plots,
                  binning_summary_statistic, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title, plot_x_limits, plot_y_limits, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, plot_only_non_duplicates=False, bin_large_plots=False, binning_summary_statistic="max", out_summary=None):
    ''' Generate a coverage plot from an aligned bam file

        Per-position depth is written to a tab-separated file (out_summary,
        or a temporary file that is removed afterwards) using
        "samtools depth" when plot_only_non_duplicates is set and
        "bedtools genomecov -d" otherwise. Each reference segment is drawn
        in its own color, laid end to end along the x axis.

        plot_data_style: "filled", "line" or "dots"; other values draw nothing.
        plot_x_limits / plot_y_limits: optional (min, max) pairs.
        bin_large_plots: summarise depth in bins sized so that roughly one
            bin maps to one horizontal pixel.
        binning_summary_statistic: "min", "max", "mean" or "median";
            unrecognised values fall back to max.

        Raises Exception when in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s""" % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        # reserve the temp path only when a sort will actually write it
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # in this case we are passing through the original in_bam directly
            should_remove_sorted = False

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = ['-aa']  # report coverage at "absolutely all" positions
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, value in depth_filters:
        if value:
            if not plot_only_non_duplicates:
                log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
            opts += [flag, str(value)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0  # total number of positions; drives the bin size below
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        if bin_large_plots:
            # Bin locations and take summary value (maximum or minimum) in each bin
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            # fall back to the max *function*; a string default is not callable
            binning_action = binning_fn.get(binning_summary_statistic, max)

            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            # width of actual plot (sans whitespace and y axis text)
            inner_plot_width_px = inner_plot_width_inches * fig.dpi
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max / (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting
        # get the colors for this style
        colors = list(
            matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])
        domain_max = 0
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            # scale bin indices back to base-pair coordinates
            x_values = [x * bin_size for x in range(prior_domain_max, domain_max)]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(
                    x_values, position_depths, [0] * len(position_depths),
                    linewidth=0, antialiased=True, color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(
                    x_values, position_depths,
                    antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                matplotlib.pyplot.plot(
                    x_values, position_depths, 'ro',
                    antialiased=True, color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=binning_summary_statistic)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format, dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
def align_and_plot_coverage(
    out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
    base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, in_bam,
    ref_fasta, out_bam=None, sensitive=False, excludeDuplicates=False, JVMmemory=None, picardOptions=None,
    min_score_to_output=None
):
    ''' Take reads, align to reference with BWA-MEM, and generate a coverage plot

        in_bam is aligned against a private, freshly indexed copy of
        ref_fasta with "-T <threshold>" (min_score_to_output, or 30 when
        falsy); sensitive additionally passes "-k 12 -A 1 -B 1 -O 1 -E 1".
        The alignment is filtered with "samtools view -q <threshold>",
        optionally run through Picard MarkDuplicates (excludeDuplicates,
        picardOptions, JVMmemory), sorted, indexed and handed to
        plot_coverage. The aligned bam is kept only when out_bam is given.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    # "+=" is required here: a bare "bwa_opts + [...]" builds a new list and
    # throws it away, so neither the sensitive flags nor "-T" reached bwa.
    bwa_opts = []
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    bwa_opts += ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')

    # NOTE(review): the keyword is "opts" here -- confirm it matches the
    # parameter name of the Bwa.mem implementation in use.
    bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError
        opts = list(picardOptions or [])
        # only reserve a temp path when MarkDuplicates will actually write it
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered], aln_bam_dupe_processed,
            dupe_removal_out_metrics, picardOptions=opts,
            JVMmemory=JVMmemory
        )
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height,
        plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth,
        read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def plot_coverage(
    in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
    base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold,
    plot_only_non_duplicates=False, out_summary=None
):
    ''' Generate a coverage plot from an aligned bam file

        Per-position depth is computed with "samtools depth -aa" and written
        to a tab-separated file (out_summary, or a temporary file that is
        removed afterwards). Each reference segment is drawn in its own
        color, laid end to end along the x axis.

        plot_data_style: "filled", "line" or "dots"; other values draw nothing.
        base_q_threshold / mapping_q_threshold / max_coverage_depth /
        read_length_threshold: when truthy, passed to "samtools depth" as
            -q / -Q / -m / -l.
        plot_only_non_duplicates: drop reads flagged 1024 (PCR or optical
            duplicates) before computing depth.

        Raises Exception when in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" % in_bam
        )

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        # (the temp path is reserved only when it will actually be written)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = ['-aa']  # report coverage at "absolutely all" positions
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, value in depth_filters:
        if value:
            opts += [flag, str(value)]

    samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # get the colors for this style (looked up once, inside the style context)
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        domain_max = 0  # running x offset so segments are laid end to end
        for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]
            x_values = range(prior_domain_max, domain_max)

            if plot_data_style == "filled":
                plt.fill_between(
                    x_values, position_depths, [0] * len(position_depths),
                    linewidth=0, antialiased=True, color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(x_values, position_depths, 'ro', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, in_bam, ref_fasta, out_bam=None, sensitive=False, excludeDuplicates=False, JVMmemory=None, picardOptions=None, min_score_to_output=None, aligner="bwa", aligner_options='', novoalign_license_path=None):
    ''' Take reads, align to reference with bwa or novoalign, and generate
        a coverage plot.

        in_bam is aligned against a private, freshly indexed copy of
        ref_fasta. For bwa, the quality threshold handed to align_mem_bam is
        the value following "-T" in aligner_options when present, otherwise
        min_score_to_output, otherwise 30. When excludeDuplicates is set the
        alignment is run through Picard MarkDuplicates (picardOptions,
        JVMmemory). The result is sorted, indexed and handed to
        plot_coverage. The aligned bam is kept only when out_bam is given.

        aligner must be "bwa" or "novoalign". Built-in default aligner
        options are applied only when aligner_options is None.
        sensitive adds "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    # NOTE(review): the signature default is '' rather than None, so these
    # defaults only apply when a caller passes None explicitly -- confirm
    # that this is intended.
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    samtools = tools.samtools.SamtoolsTool()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()
        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            # "+=" is required: a bare "bwa_opts + [...]" discards its result
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = min_score_to_output or 30
        if '-T' in bwa_opts:
            threshold_pos = bwa_opts.index("-T") + 1
            # strict "<": a trailing "-T" has no value to read
            if threshold_pos < len(bwa_opts):
                bwa_map_threshold = int(bwa_opts[threshold_pos])

        bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts,
                          min_qual=bwa_map_threshold)
    elif aligner == "novoalign":
        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)
        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam, ref_indexed, aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError
        opts = list(picardOptions or [])
        # only reserve a temp path when MarkDuplicates will actually write it
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed, dupe_removal_out_metrics,
            picardOptions=opts, JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, plot_only_non_duplicates=False, out_summary=None):
    ''' Generate a coverage plot from an aligned bam file

        Per-position depth is written to a tab-separated file (out_summary,
        or a temporary file that is removed afterwards) using
        "samtools depth" when plot_only_non_duplicates is set and
        "bedtools genomecov -d" otherwise. Each reference segment is drawn
        in its own color, laid end to end along the x axis.

        plot_data_style: "filled", "line" or "dots"; other values draw nothing.
        base_q_threshold / mapping_q_threshold / max_coverage_depth /
        read_length_threshold: -q / -Q / -m / -l for "samtools depth"; a
            warning is logged for each one set while the bedtools path is
            in use.

        Raises Exception when in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        # (the temp path is reserved only when it will actually be written)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = ['-aa']  # report coverage at "absolutely all" positions
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, value in depth_filters:
        if value:
            if not plot_only_non_duplicates:
                log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
            opts += [flag, str(value)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # get the colors for this style (looked up once, inside the style context)
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        domain_max = 0  # running x offset so segments are laid end to end
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]
            x_values = range(prior_domain_max, domain_max)

            if plot_data_style == "filled":
                plt.fill_between(
                    x_values, position_depths, [0] * len(position_depths),
                    linewidth=0, antialiased=True, color=segment_color)
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths,
                         antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(x_values, position_depths, 'ro',
                         antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
def align_mem_bam(self, inBam, refDb, outBam, options=None, min_score_to_filter=None, threads=None, JVMmemory=None, invert_filter=False):
    ''' Align the reads of inBam against refDb, one read group at a time.

        With a single read group the work is delegated directly to
        self.align_mem_one_rg. With several, each read group is aligned to
        its own temporary bam, and the non-empty results are merged into
        outBam with Picard MergeSamFiles (coordinate sorted). If every read
        group yields an empty alignment, outBam is created as an empty file.

        options, min_score_to_filter, threads, invert_filter: forwarded to
            align_mem_one_rg.
        JVMmemory: forwarded to the Picard merge.

        Raises InvalidBamHeaderError when inBam has no read groups.
    '''
    options = options or []

    samtools = tools.samtools.SamtoolsTool()

    # fetch list of RGs
    rgs = list(samtools.getReadGroups(inBam).keys())

    if len(rgs) == 0:
        # Can't do this
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

    elif len(rgs) == 1:
        # Only one RG, keep it simple
        self.align_mem_one_rg(inBam, refDb, outBam, options=options,
                              min_score_to_filter=min_score_to_filter,
                              threads=threads, invert_filter=invert_filter)

    else:
        # Multiple RGs, align one at a time and merge
        align_bams = []
        for rg in rgs:
            tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
            self.align_mem_one_rg(inBam, refDb, tmp_bam, rgid=rg,
                                  options=options,
                                  min_score_to_filter=min_score_to_filter,
                                  threads=threads,
                                  invert_filter=invert_filter)
            if os.path.getsize(tmp_bam) > 0:
                align_bams.append(tmp_bam)
            else:
                log.warning(
                    "No alignment output for RG %s in file %s against %s",
                    rg, inBam, refDb)
                # an empty result is not merged, so drop it right away
                # instead of leaving the temp file behind
                os.unlink(tmp_bam)

        if len(align_bams) == 0:
            util.file.touch(outBam)
        else:
            # Merge BAMs, sort, and index
            tools.picard.MergeSamFilesTool().execute(
                align_bams, outBam,
                picardOptions=[
                    'SORT_ORDER=coordinate', 'USE_THREADING=true',
                    'CREATE_INDEX=true'
                ],
                JVMmemory=JVMmemory)
            if outBam.endswith((".bam", ".cram")):
                samtools.index(outBam)
            for bam in align_bams:
                os.unlink(bam)