def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' Generate a coverage plot from an aligned bam file.

        The bam is (optionally) stripped of duplicate-flagged reads, sorted,
        indexed and run through "samtools depth"; the resulting per-position
        table is then drawn with one colour per reference segment.

        Args:
            in_bam: path of the aligned bam file to plot.
            out_plot_file: path the figure is saved to.
            plot_format: value passed to savefig() as ``format``.
            plot_data_style: "filled", "line" or "dots"; any other value
                draws no data series.
            plot_style: matplotlib style used as the style context.
            plot_width, plot_height: figure size in pixels (divided by the
                DPI to obtain inches).
            plot_dpi: DPI of the figure; when falsy the current figure's DPI
                is used.
            plot_title: title drawn above the plot.
            base_q_threshold, mapping_q_threshold, max_coverage_depth,
            read_length_threshold: when truthy, forwarded to
                "samtools depth" as -q, -Q, -m and -l respectively.
            plot_only_non_duplicates: when true, reads with the 1024 flag
                are removed before depth is computed.
            out_summary: path to keep the depth table at; when not given a
                temporary file is used and deleted afterwards.

        Raises:
            Exception: if in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        message = (
            "The bam file specified appears to have zero mapped reads. "
            "'plot_coverage' requires an aligned bam file. "
            "You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. "
            "\n File: %s"
        )
        raise Exception(message % in_bam)

    # Use one test for both "where do we write" and "do we clean up" so the
    # two decisions can never disagree (e.g. for an empty-string out_summary).
    keep_summary = bool(out_summary)
    if keep_summary:
        coverage_tsv_file = out_summary
    else:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')

    if plot_only_non_duplicates:
        # Only ask for a scratch name when it is actually written to.
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
    if plot_only_non_duplicates:
        # the filtered copy is an intermediate; in_bam itself is never removed
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = ['-aa']  # report coverage at "absolutely all" positions
    optional_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in optional_filters:
        if threshold:
            opts += [flag, str(threshold)]

    samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    # (column 0 is the segment name, column 2 the depth)
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # the palette depends only on the active style, so look it up once
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        # segments are laid end to end along the x axis
        domain_max = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            x_values = range(prior_domain_max, domain_max)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(
                    x_values,
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the colour comes from the keyword, so a
                # colour letter in the format string would only be redundant
                plt.plot(x_values, position_depths, 'o', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: %s", out_plot_file)

    if not keep_summary:
        os.unlink(coverage_tsv_file)
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    plot_x_limits,
    plot_y_limits,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    bin_large_plots=False,
    binning_summary_statistic="max",
    out_summary=None
):
    ''' Generate a coverage plot from an aligned bam file.

        Depth is computed with "bedtools genomecov -d" when duplicates are
        included and with "samtools depth" when they are excluded. The
        per-position table is optionally binned and then drawn with one
        colour per reference segment.

        Args:
            in_bam: path of the aligned bam file to plot.
            out_plot_file: path the figure is saved to.
            plot_format: value passed to savefig() as ``format``.
            plot_data_style: "filled", "line" or "dots"; any other value
                draws no data series.
            plot_style: matplotlib style used as the style context.
            plot_width, plot_height: figure size in pixels (divided by the
                DPI to obtain inches).
            plot_dpi: DPI of the figure; when falsy the current figure's DPI
                is used.
            plot_title: title drawn above the plot.
            plot_x_limits, plot_y_limits: (min, max) pairs applied to the
                axes, or None to leave the axis limits alone.
            base_q_threshold, mapping_q_threshold, max_coverage_depth,
            read_length_threshold: when truthy, forwarded to
                "samtools depth" as -q, -Q, -m and -l. They have no effect
                (a warning is logged) unless plot_only_non_duplicates is set.
            plot_only_non_duplicates: when true, reads with the 1024 flag
                are removed and samtools is used to compute depth.
            bin_large_plots: when true, positions are grouped into bins
                sized from the pixel width of the plot area.
            binning_summary_statistic: "min", "max", "mean" or "median";
                the value plotted for each bin. Unrecognised names fall back
                to "max".
            out_summary: path to keep the depth table at; when not given a
                temporary file is used and deleted afterwards.

        Raises:
            Exception: if in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        message = (
            "The bam file specified appears to have zero mapped reads. "
            "'plot_coverage' requires an aligned bam file. "
            "You can try 'align_and_plot_coverage' if the plot input bam file contains reads "
            "and you don't mind a simple bwa alignment. "
            "\n File: %s"
        )
        raise Exception(message % in_bam)

    # Use one test for both "where do we write" and "do we clean up" so the
    # two decisions can never disagree (e.g. for an empty-string out_summary).
    keep_summary = bool(out_summary)
    if keep_summary:
        coverage_tsv_file = out_summary
    else:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')

    if plot_only_non_duplicates:
        # Only ask for a scratch name when it is actually written to.
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # in this case we are passing through the original in_bam directly
            should_remove_sorted = False

    # call samtools index
    # NOTE(review): when the input is already sorted this indexes in_bam
    # itself; presumably an index file is left next to it -- confirm.
    samtools.index(bam_sorted)

    # Build the "samtools depth" options.
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates are excluded,
    # "samtools depth" is used, which does support quality filtering. We use
    # either samtools or bedtools because the former ignores marked duplicates
    # in its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    optional_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in optional_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    # (column 0 is the segment name, column 2 the depth)
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        summary_name = binning_summary_statistic
        if bin_large_plots:
            # Bin locations and take a summary value in each bin
            binning_fn = {"min": min, "max": max, "mean": mean, "median": median}
            if summary_name not in binning_fn:
                # fall back to the max *function*; a string default here
                # would fail as soon as it is called
                summary_name = "max"
            binning_action = binning_fn[summary_name]

            # width of actual plot (sans whitespace and y axis text)
            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi

            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            total_positions = sum(len(depths) for depths in segment_depths.values())
            bin_size = 1 + int(total_positions / (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting
        # the palette depends only on the active style, so look it up once
        colors = list(matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])

        # segments are laid end to end along the x axis
        domain_max = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            # scale bin indices back to base-pair coordinates
            x_values = [x * bin_size for x in range(prior_domain_max, domain_max)]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(
                    x_values,
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(
                    x_values, position_depths, antialiased=True, color=segment_color
                )
            elif plot_data_style == "dots":
                # marker-only format: the colour comes from the keyword, so a
                # colour letter in the format string would only be redundant
                matplotlib.pyplot.plot(
                    x_values, position_depths, 'o', antialiased=True, color=segment_color
                )

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=summary_name)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: %s", out_plot_file)

    if not keep_summary:
        os.unlink(coverage_tsv_file)
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' Generate a coverage plot from an aligned bam file.

        Depth is computed with "bedtools genomecov -d" when duplicates are
        included and with "samtools depth" when they are excluded; the
        per-position table is then drawn with one colour per reference
        segment.

        Args:
            in_bam: path of the aligned bam file to plot.
            out_plot_file: path the figure is saved to.
            plot_format: value passed to savefig() as ``format``.
            plot_data_style: "filled", "line" or "dots"; any other value
                draws no data series.
            plot_style: matplotlib style used as the style context.
            plot_width, plot_height: figure size in pixels (divided by the
                DPI to obtain inches).
            plot_dpi: DPI of the figure; when falsy the current figure's DPI
                is used.
            plot_title: title drawn above the plot.
            base_q_threshold, mapping_q_threshold, max_coverage_depth,
            read_length_threshold: when truthy, forwarded to
                "samtools depth" as -q, -Q, -m and -l. They have no effect
                (a warning is logged) unless plot_only_non_duplicates is set.
            plot_only_non_duplicates: when true, reads with the 1024 flag
                are removed and samtools is used to compute depth.
            out_summary: path to keep the depth table at; when not given a
                temporary file is used and deleted afterwards.

        Raises:
            Exception: if in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        message = (
            "The bam file specified appears to have zero mapped reads. "
            "'plot_coverage' requires an aligned bam file. "
            "You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. "
            "\n File: %s"
        )
        raise Exception(message % in_bam)

    # Use one test for both "where do we write" and "do we clean up" so the
    # two decisions can never disagree (e.g. for an empty-string out_summary).
    keep_summary = bool(out_summary)
    if keep_summary:
        coverage_tsv_file = out_summary
    else:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')

    if plot_only_non_duplicates:
        # Only ask for a scratch name when it is actually written to.
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
    if plot_only_non_duplicates:
        # the filtered copy is an intermediate; in_bam itself is never removed
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # Build the "samtools depth" options.
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates are excluded,
    # "samtools depth" is used, which does support quality filtering. We use
    # either samtools or bedtools because the former ignores marked duplicates
    # in its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    optional_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in optional_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    # (column 0 is the segment name, column 2 the depth)
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # the palette depends only on the active style, so look it up once
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        # segments are laid end to end along the x axis
        domain_max = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            x_values = range(prior_domain_max, domain_max)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(
                    x_values,
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the colour comes from the keyword, so a
                # colour letter in the format string would only be redundant
                plt.plot(x_values, position_depths, 'o', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: %s", out_plot_file)

    if not keep_summary:
        os.unlink(coverage_tsv_file)