示例#1
0
    def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None,
            threads=None):
        ''' Run "bwa mem" on the reads in inReads against refDb and write a
            sorted alignment to outAlign.

            inReads: read file handed to samtools.bam2fq(), which fills the two
                temporary FASTQ files given to bwa.
            refDb: reference argument passed straight to "bwa mem".
            outAlign: output path; it is indexed afterwards only when it ends
                in ".bam" or ".cram".
            options: extra "bwa mem" arguments. "-t <threads>" is appended
                unless "-t" is already present. The caller's list is copied,
                never modified.
            min_score_to_filter: when truthy, the raw SAM is passed through
                self.filter_sam_on_alignment_score() before sorting.
            threads: thread count for bwa and samtools sort; falls back to
                util.misc.available_cpu_count() when falsy.
        '''
        # Copy so the '-t' flag added below does not leak into the caller's list.
        options = list(options) if options else []

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            options.extend(('-t', str(threads)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)
        os.unlink(fq1)
        os.unlink(fq2)

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered,
                                               min_score_to_filter, options)
            # the unfiltered SAM is no longer needed once the filtered copy exists
            os.unlink(aln_sam)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        # remove whichever SAM was sorted (previously the filtered one was left behind)
        os.unlink(aln_sam_filtered)

        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
示例#2
0
    def mem(self,
            inReads,
            refDb,
            outAlign,
            options=None,
            min_qual=None,
            threads=None):
        ''' Run "bwa mem" on the reads in inReads against refDb and write a
            sorted alignment to outAlign.

            inReads: read file handed to samtools.bam2fq(), which fills the two
                temporary FASTQ files given to bwa.
            refDb: reference argument passed straight to "bwa mem".
            outAlign: output path; it is indexed afterwards only when it ends
                in ".bam" or ".cram".
            options: extra "bwa mem" arguments. "-t" and "-T" are appended
                only when not already present. The caller's list is copied,
                never modified.
            min_qual: value for the "-T" flag when the caller did not supply
                one in options; defaults to 30.
            threads: thread count for bwa and samtools sort; falls back to
                util.misc.available_cpu_count() when falsy.
        '''
        # Copy so the flags added below do not leak into the caller's list.
        options = list(options) if options else []

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            options.extend(('-t', str(threads)))

        if '-T' not in options:
            min_qual = min_qual or 30
            options.extend(('-T', str(min_qual)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        os.unlink(fq1)
        os.unlink(fq2)
        samtools.sort(aln_sam, outAlign, threads=threads)
        os.unlink(aln_sam)
        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
示例#3
0
文件: bwa.py 项目: yesimon/viral-ngs
    def mem(self, inReads, refDb, outAlign, options=None, min_qual=None, threads=None):
        ''' Align inReads to refDb with "bwa mem", then sort (and, for
            bam/cram outputs, index) the result at outAlign.

            options: extra "bwa mem" arguments; copied before use so the
                caller's list is left untouched. "-t <threads>" and
                "-T <min_qual>" are appended only when the corresponding flag
                is not already present.
            min_qual: value used for "-T" when not given in options (default 30).
            threads: thread count for bwa and samtools sort; falls back to
                util.misc.available_cpu_count() when falsy.
        '''
        # Copy so the flags added below do not leak into the caller's list.
        options = list(options) if options else []

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        # bwa is given the two FASTQ files written here
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            options.extend(('-t', str(threads)))

        if '-T' not in options:
            min_qual = min_qual or 30
            options.extend(('-T', str(min_qual)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        # the FASTQ intermediates are only needed for the bwa call itself
        os.unlink(fq1)
        os.unlink(fq2)
        samtools.sort(aln_sam, outAlign, threads=threads)
        os.unlink(aln_sam)
        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
示例#4
0
文件: bwa.py 项目: xuwei684/viral-ngs
    def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None,
            threads=None, invert_filter=False, should_index=True):
        ''' Stream the reads of inReads into "bwa mem -p" and write a sorted
            alignment to outAlign.

            inReads: read file given to samtools.bam2fq_pipe(); the stdout of
                the returned process is connected to bwa's stdin.
            refDb: reference argument passed straight to "bwa mem".
            outAlign: output path; indexed only when should_index is true and
                the name ends in ".bam" or ".cram".
            options: extra "bwa mem" arguments. "-t <threads>" is appended
                unless "-t" is already present. The caller's list is copied,
                never modified.
            min_score_to_filter: when truthy, the raw SAM is passed through
                self.filter_sam_on_alignment_score() before sorting.
            threads: thread count, normalised by util.misc.sanitize_thread_count().
            invert_filter: forwarded to filter_sam_on_alignment_score().
            should_index: set to False to skip indexing of bam/cram outputs.

            Raises subprocess.CalledProcessError if the bam2fq pipe process
            exits with a non-zero status.
        '''
        # Copy so the '-t' flag added below does not leak into the caller's list.
        options = list(options) if options else []

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))

        samtools = tools.samtools.SamtoolsTool()

        aln_sam = util.file.mkstempfname('.aligned.sam')
        fastq_pipe = samtools.bam2fq_pipe(inReads)
        self.execute('mem', options + ['-p', refDb, '-'], stdout=aln_sam, stdin=fastq_pipe.stdout)

        # wait() rather than poll(): poll() returns None while the producer is
        # still shutting down, which would let a failing exit code go unnoticed.
        if fastq_pipe.wait():
            raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered,
                                               min_score_to_filter, options, invert_filter=invert_filter)
            os.unlink(aln_sam)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        os.unlink(aln_sam_filtered)

        # cannot index sam files; only do so if a bam/cram is desired
        if should_index and (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
示例#5
0
    def align_cmd(self, inReads, refDb, outAlign, options=None, threads=None):
        ''' Stream the reads of inReads into the wrapped aligner and write a
            sorted alignment to outAlign.

            inReads: read file given to samtools.bam2fq_pipe(); the stdout of
                the returned process becomes the aligner's stdin.
            refDb: reference argument placed after "-a" on the command line.
            outAlign: output path; indexed afterwards only when it ends in
                ".bam" or ".cram".
            options: extra aligner arguments. "-t <threads>" and "-2" are
                appended when absent. The caller's list is copied, never
                modified, so it can safely be reused between calls.
            threads: thread count, normalised by util.misc.sanitize_thread_count().

            Raises subprocess.CalledProcessError if the bam2fq pipe process
            exits with a non-zero status.
        '''
        # Copy: the original extended the caller's list with the positional
        # arguments and the temp-file path, corrupting it for any later reuse.
        options = list(options) if options else []

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))
        if '-2' not in options:
            options.append('-2')

        samtools = tools.samtools.SamtoolsTool()

        with util.file.tempfname('.aligned.sam') as aln_sam:
            fastq_pipe = samtools.bam2fq_pipe(inReads)
            # '-' makes the aligner read the piped reads from stdin
            self.execute(options + ['-a', refDb, '-', '-o', aln_sam], stdin=fastq_pipe.stdout)
            if fastq_pipe.wait():
                raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))
            samtools.sort(aln_sam, outAlign, threads=threads)

        # cannot index sam files; only do so if a bam/cram is desired
        if (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
示例#6
0
    def mem(self,
            inReads,
            refDb,
            outAlign,
            options=None,
            min_score_to_filter=None,
            threads=None,
            invert_filter=False):
        ''' Run "bwa mem" on the reads in inReads against refDb and write a
            sorted alignment to outAlign.

            inReads: read file given to samtools.bam2fq_tmp(), a context
                manager yielding the two FASTQ paths handed to bwa.
            refDb: reference argument passed straight to "bwa mem".
            outAlign: output path; it is indexed afterwards only when it ends
                in ".bam" or ".cram".
            options: extra "bwa mem" arguments. "-t <threads>" is appended
                unless "-t" is already present. The caller's list is copied,
                never modified.
            min_score_to_filter: when truthy, the raw SAM is passed through
                self.filter_sam_on_alignment_score() before sorting.
            threads: thread count, normalised by util.misc.sanitize_thread_count().
            invert_filter: forwarded to filter_sam_on_alignment_score().
        '''
        # Copy so the '-t' flag added below does not leak into the caller's list.
        options = list(options) if options else []

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))

        samtools = tools.samtools.SamtoolsTool()

        aln_sam = util.file.mkstempfname('.aligned.sam')
        with samtools.bam2fq_tmp(inReads) as (fq1, fq2):
            self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam,
                                               aln_sam_filtered,
                                               min_score_to_filter,
                                               options,
                                               invert_filter=invert_filter)
            os.unlink(aln_sam)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        os.unlink(aln_sam_filtered)

        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
示例#7
0
    def scaffold(self, contigs_fasta, ref_fasta, outAlign, divergence=20, options=None, threads=None):
        ''' Align contigs_fasta against ref_fasta with the wrapped aligner and
            write a sorted alignment to outAlign.

            contigs_fasta: query sequences, placed after the reference on the
                command line.
            ref_fasta: reference sequences, placed after "-a".
            outAlign: output path; indexed afterwards only when it ends in
                ".bam" or ".cram".
            divergence: selects the "-x" preset: "asm20" for values >= 20,
                "asm10" for values >= 10, otherwise "asm5".
            options: extra aligner arguments. "-t <threads>" and "-2" are
                appended when absent. The caller's list is copied, never
                modified.
            threads: thread count, normalised by util.misc.sanitize_thread_count().
        '''
        # Copy so the flags and paths added below do not leak into the caller's list.
        options = list(options) if options else []

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))
        if '-2' not in options:
            options.append('-2')

        if divergence >= 20:
            options.extend(('-x', 'asm20'))
        elif divergence >= 10:
            options.extend(('-x', 'asm10'))
        else:
            options.extend(('-x', 'asm5'))

        # The original referenced `samtools` without ever binding it in this
        # method; create the tool wrapper locally, as align_cmd() does.
        samtools = tools.samtools.SamtoolsTool()

        with util.file.tempfname('.aligned.sam') as aln_sam:
            self.execute(options + ['-a', ref_fasta, contigs_fasta, '-o', aln_sam])
            samtools.sort(aln_sam, outAlign, threads=threads)

        # cannot index sam files; only do so if a bam/cram is desired
        if (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
示例#8
0
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            plot_x_limits,
                            plot_y_limits,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            bin_large_plots=False,
                            binning_summary_statistic="max",
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_filter=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    ''' 
        Take reads, align to reference with BWA-MEM (or Novoalign), and
        generate a coverage plot.

        The plot_* arguments, the *_threshold arguments, max_coverage_depth,
        bin_large_plots, binning_summary_statistic and out_summary are
        forwarded unchanged to plot_coverage().

        in_bam: reads to align.
        ref_fasta: reference; it is copied to a temp file before indexing so
            the index files are not written next to the original.
        out_bam: where to keep the sorted, indexed alignment; when None a temp
            file is used and removed after plotting.
        sensitive: bwa only; appends "-k 12 -A 1 -B 1 -O 1 -E 1" to the options.
        excludeDuplicates: run Picard MarkDuplicates on the alignment first and
            plot only non-duplicates.
        picardOptions: options for MarkDuplicates; None is treated as empty.
        min_score_to_filter: bwa only; forwarded to Bwa.align_mem_bam().
        aligner: "bwa" or "novoalign".
        aligner_options: whitespace-separated option string; per-aligner
            defaults are applied only when None is passed explicitly.
        novoalign_license_path: forwarded to NovoalignTool.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    # NOTE: the signature default is '' (not None), so these defaults are only
    # used when a caller passes aligner_options=None explicitly.
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-1'  # hidden option to work around kernel/cpu bug; disables multithreaded file read: https://github.com/lh3/bwa/issues/102

    samtools = tools.samtools.SamtoolsTool()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_score_to_filter=min_score_to_filter)
    elif aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam,
                ref_indexed,
                aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    if excludeDuplicates:
        # only allocate the extra temp file when duplicates are actually processed
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError
        opts = list(picardOptions or [])
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute([aln_bam],
                                                  aln_bam_dupe_processed,
                                                  dupe_removal_out_metrics,
                                                  picardOptions=opts,
                                                  JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  plot_x_limits, plot_y_limits, base_q_threshold,
                  mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, bin_large_plots,
                  binning_summary_statistic, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temp copy of the fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
示例#9
0
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  plot_x_limits,
                  plot_y_limits,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  bin_large_plots=False,
                  binning_summary_statistic="max",
                  out_summary=None):
    ''' 
        Generate a coverage plot from an aligned bam file

        in_bam: aligned bam; an Exception is raised if it has no mapped reads.
        out_plot_file / plot_format: destination and format given to savefig.
        plot_data_style: "filled", "line" or "dots"; any other value draws nothing.
        plot_style: matplotlib style name applied while plotting.
        plot_width / plot_height: figure size in pixels (divided by the DPI).
        plot_dpi: DPI for the figure; falls back to the current figure's DPI.
        plot_title: title text.
        plot_x_limits / plot_y_limits: (min, max) pairs, or None to autoscale.
        base_q_threshold, mapping_q_threshold, max_coverage_depth,
        read_length_threshold: passed to "samtools depth" as -q/-Q/-m/-l; only
            effective together with plot_only_non_duplicates, because
            otherwise depth comes from bedtools genomecov.
        plot_only_non_duplicates: drop reads with flag 1024 and compute depth
            with samtools instead of bedtools.
        bin_large_plots: summarise depths in bins sized from the plot width.
        binning_summary_statistic: "min", "max", "mean" or "median"; unknown
            names fall back to max.
        out_summary: path for the per-position depth table; when None a temp
            file is used and removed afterwards.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # in this case we are passing through the original in_bam directly
            should_remove_sorted = False

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']  # report coverage at "absolutely all" positions
    if base_q_threshold:
        if not plot_only_non_duplicates:
            # Note: "bedtools genomecov" will count depth including duplicates, but does
            # not expose options for filtering by quality. When duplicates
            # are excluded, "samtools depth" is used which does support quality filtering
            # We use either samtools or bedtools, because the former ignores marked duplicates
            # from its depth count while bedtools includes them.
            log.warning("'-q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-Q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        if not plot_only_non_duplicates:
            log.warning("'-m' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-l' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-l", str(read_length_threshold)]

    # add option here for bedtools to report coverage w/ duplicates
    # (and then samtools for no-dups)
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # column 0 is the segment name and column 2 the depth; rows of one segment
    # are kept in file order
    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot(
        )  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        if bin_large_plots:
            # Bin locations and take summary value (maximum or minimum) in each bin
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            # fall back to the max *function*; the former string default "max"
            # was not callable and raised TypeError for unknown statistic names
            binning_action = binning_fn.get(binning_summary_statistic, max)

            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi  # width of actual plot (sans whitespace and y axis text)
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max /
                               (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting
        # the style's color cycle does not change between segments, so look it up once
        colors = list(
            matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])

        # from here on domain_max counts (possibly binned) points already plotted
        domain_max = 0
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            # scale bin indices back to base-pair positions
            x_values = [
                x * bin_size for x in range(prior_domain_max, domain_max)
            ]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths,
                                               [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       antialiased=True,
                                       color=segment_color)
            elif plot_data_style == "dots":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       'ro',
                                       antialiased=True,
                                       color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if (bin_size > 1):
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=binning_summary_statistic)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format,
                                  dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
示例#10
0
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    ''' 
        Take reads, align to reference with BWA-MEM, and generate a coverage plot

        The plot_* arguments, the *_threshold arguments, max_coverage_depth
        and out_summary are forwarded unchanged to plot_coverage().

        in_bam: reads to align.
        ref_fasta: reference; it is copied to a temp file before indexing so
            the index files are not written next to the original.
        out_bam: where to keep the sorted, indexed alignment; when None a temp
            file is used and removed after plotting.
        sensitive: appends "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
        excludeDuplicates: run Picard MarkDuplicates on the alignment first and
            plot only non-duplicates.
        picardOptions: options for MarkDuplicates; None is treated as empty.
        min_score_to_output: threshold used both for bwa's "-T" flag and for
            the "samtools view -q" filter applied afterwards (default 30).
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    # The original wrote `bwa_opts + [...]`, which builds a new list and throws
    # it away, so neither the sensitive settings nor -T ever reached bwa.
    bwa_opts = []
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    bwa_opts += ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')

    # Pass the options positionally (4th parameter of Bwa.mem): the former
    # `opts=` keyword does not match a parameter named `options`.
    bwa.mem(in_bam, ref_indexed, aln_bam, bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    if excludeDuplicates:
        # only allocate the extra temp file when duplicates are actually processed
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError
        opts = list(picardOptions or [])
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered], aln_bam_dupe_processed,
            dupe_removal_out_metrics, picardOptions=opts,
            JVMmemory=JVMmemory
        )
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
        base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temp copy of the fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
示例#11
0
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' 
        Draw a read-depth plot for an aligned bam file.

        The bam is optionally stripped of duplicate-flagged reads, sorted and
        indexed; "samtools depth" then writes a per-position table from which
        one series per reference segment is drawn. The table is kept only when
        out_summary is given. Raises Exception if in_bam has no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # an unaligned bam cannot be plotted: bail out if no read is mapped
    if samtools.count(in_bam, opts=["-F", "4"]) == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam
        )

    coverage_tsv_file = util.file.mkstempfname('.summary.tsv') if out_summary is None else out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if not plot_only_non_duplicates:
        bam_dupe_processed = in_bam
    else:
        # copy the bam without reads carrying flag 1024 (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)

    # sort, then discard the intermediate duplicate-free copy if one was made
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    samtools.index(bam_sorted)

    # assemble the "samtools depth" arguments; -aa reports "absolutely all" positions
    depth_args = ['-aa']
    optional_flags = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, value in optional_flags:
        if value:
            depth_args.extend((flag, str(value)))

    samtools.depth(bam_sorted, coverage_tsv_file, depth_args)
    os.unlink(bam_sorted)

    # ---- read the depth table: column 0 = segment name, column 2 = depth ----
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_name = row[0]
            if segment_name not in segment_depths:
                segment_depths[segment_name] = []
            segment_depths[segment_name].append(int(row[2]))

    # ---- draw the plot ----
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        dots_per_inch = float(DPI)
        fig.set_size_inches(float(plot_width) / dots_per_inch, float(plot_height) / dots_per_inch)

        font_size = (2.5 * plot_height) / dots_per_inch

        axes = plt.subplot()    # creates an empty plot to hang the tick labels on

        # apply the font size to the tick labels of both axes
        for tick_label in (axes.get_xticklabels() + axes.get_yticklabels()):
            tick_label.set_fontsize(font_size)

        # segments are laid end to end along the x axis
        offset = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            segment_length = len(position_depths)
            x_values = range(offset, offset + segment_length)
            offset += segment_length

            # cycle through the colors of the active style, one per segment
            palette = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])
            segment_color = palette[segment_num % len(palette)]

            if plot_data_style == "filled":
                plt.fill_between(
                    x_values,
                    position_depths, [0] * segment_length,
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(x_values, position_depths, 'ro', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        axis_label_size = font_size * 1.1
        plt.xlabel("bp", fontsize=axis_label_size)
        plt.ylabel("read depth", fontsize=axis_label_size)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    # the depth table was only a scratch file unless the caller asked for it
    if not out_summary:
        os.unlink(coverage_tsv_file)
示例#12
0
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_output=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    '''
        Take reads, align them to a reference with the selected aligner
        (BWA-MEM or Novoalign), and generate a coverage plot.

        The plot_* arguments, the *_threshold arguments, max_coverage_depth
        and out_summary are forwarded unchanged to plot_coverage().

        in_bam:                 reads to align.
        ref_fasta:              reference sequence; it is copied to a temp
                                file and the copy is indexed, so the caller's
                                file is never modified.
        out_bam:                if given, the sorted and indexed alignment is
                                written here and kept; otherwise a temp file
                                is used and removed before returning.
        sensitive:              bwa only; appends "-k 12 -A 1 -B 1 -O 1 -E 1"
                                to the aligner options.
        excludeDuplicates:      run Picard MarkDuplicates on the alignment and
                                ask plot_coverage() to plot non-duplicates only.
        JVMmemory:              forwarded to the Novoalign and Picard tools.
        picardOptions:          iterable of options for MarkDuplicates
                                (None means no extra options).
        min_score_to_output:    bwa only; score threshold used when the
                                options carry no "-T <n>" (30 if falsy).
        aligner:                "bwa" or "novoalign".
        aligner_options:        whitespace-separated option string; None
                                selects per-aligner defaults.
        novoalign_license_path: forwarded to NovoalignTool.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    # validate before any temp files are allocated
    assert aligner in ["bwa", "novoalign"]
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    samtools = tools.samtools.SamtoolsTool()

    # index a private copy of the reference so index files do not land
    # next to the caller's fasta
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            # in-place extend; a bare "+" would build and discard a new list
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = min_score_to_output or 30
        if '-T' in bwa_opts:
            threshold_idx = bwa_opts.index('-T') + 1
            # only read the value if "-T" is actually followed by one
            if threshold_idx < len(bwa_opts):
                bwa_map_threshold = int(bwa_opts[threshold_idx])

        # NOTE(review): the keyword name below must match the signature of
        # Bwa.align_mem_bam in the tools.bwa version in use -- confirm
        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_qual=bwa_map_threshold)
    elif aligner == "novoalign":
        novoalign = tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path)
        novoalign.index_fasta(ref_indexed)
        novoalign.execute(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=aligner_options.split(),
                          JVMmemory=JVMmemory)

    if excludeDuplicates:
        # temp outputs are only allocated when duplicate marking really runs
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam],
            aln_bam_dupe_processed,
            dupe_removal_out_metrics,
            picardOptions=list(picardOptions or []),
            JVMmemory=JVMmemory)
        # the metrics report is not used any further
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension removes the temporary copy of the reference itself
    # (ref_indexed), not the caller's ref_fasta.
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
示例#13
0
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  out_summary=None):
    '''
        Generate a coverage plot from an aligned bam file

        in_bam:                   aligned bam; an Exception is raised if it
                                  contains zero mapped reads.
        out_plot_file:            path the figure is saved to.
        plot_format:              passed to savefig as "format".
        plot_data_style:          "filled", "line" or "dots"; any other value
                                  draws no data.
        plot_style:               matplotlib style applied while plotting.
        plot_width, plot_height:  divided by the DPI to obtain the figure
                                  size in inches.
        plot_dpi:                 output DPI; falls back to the figure's DPI
                                  when falsy.
        plot_title:               figure title.
        base_q_threshold, mapping_q_threshold, max_coverage_depth,
        read_length_threshold:    passed to "samtools depth" as -q/-Q/-m/-l;
                                  only honored when plot_only_non_duplicates
                                  is set (a warning is logged otherwise).
        plot_only_non_duplicates: drop reads flagged 1024 and compute depth
                                  with "samtools depth"; otherwise depth comes
                                  from "bedtools genomecov -d".
        out_summary:              if given, the per-position depth table is
                                  written here and kept; otherwise a temp
                                  file is used and removed.
    '''

    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        # (the temp file is only allocated when it is actually written)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # Build the "samtools depth" options.
    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, value in depth_filters:
        if not value:
            continue
        if not plot_only_non_duplicates:
            log.warning(
                "'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
        opts += [flag, str(value)]

    # bedtools reports coverage w/ duplicates, samtools w/o duplicates
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # column 0 is the segment name, column 2 the depth at that position;
    # depths are grouped per segment in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            if not row:
                continue  # tolerate blank lines
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    # running x offset so segments are laid out end to end
    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # get the colors for this style (loop-invariant, so fetched once)
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        for segment_num, position_depths in enumerate(
                segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            positions = range(prior_domain_max, domain_max)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(positions,
                                 position_depths,
                                 0,
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(positions,
                         position_depths,
                         antialiased=True,
                         color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the color comes from the keyword, so
                # a color code in the format string would just conflict
                plt.plot(positions,
                         position_depths,
                         'o',
                         antialiased=True,
                         color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format,
                    dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

        # release the figure so a later call does not draw on top of it
        plt.close(fig)

    if out_summary is None:
        os.unlink(coverage_tsv_file)
示例#14
0
    def align_mem_bam(self,
                      inBam,
                      refDb,
                      outBam,
                      options=None,
                      min_score_to_filter=None,
                      threads=None,
                      JVMmemory=None,
                      invert_filter=False):
        '''
            Align the reads of inBam against refDb, one read group at a time.

            A single read group is aligned straight into outBam. With several
            read groups, each one is aligned into its own temp bam and the
            non-empty results are merged into outBam with Picard
            MergeSamFiles; if every read group yields an empty file, outBam
            is only touched.

            options, min_score_to_filter, threads and invert_filter are
            forwarded to align_mem_one_rg(); JVMmemory is used for the merge.

            Raises InvalidBamHeaderError when inBam has no read groups.
        '''
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # fetch list of RGs
        rgs = list(samtools.getReadGroups(inBam).keys())

        if not rgs:
            # Can't do this
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        elif len(rgs) == 1:
            # Only one RG, keep it simple
            self.align_mem_one_rg(inBam,
                                  refDb,
                                  outBam,
                                  options=options,
                                  min_score_to_filter=min_score_to_filter,
                                  threads=threads,
                                  invert_filter=invert_filter)

        else:
            # Multiple RGs, align one at a time and merge
            align_bams = []
            for rg in rgs:
                tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
                self.align_mem_one_rg(inBam,
                                      refDb,
                                      tmp_bam,
                                      rgid=rg,
                                      options=options,
                                      min_score_to_filter=min_score_to_filter,
                                      threads=threads,
                                      invert_filter=invert_filter)
                if os.path.getsize(tmp_bam) > 0:
                    align_bams.append(tmp_bam)
                else:
                    log.warning(
                        "No alignment output for RG %s in file %s against %s",
                        rg, inBam, refDb)
                    # empty outputs are not merged, so drop them right away
                    # instead of leaving them behind in the temp directory
                    os.unlink(tmp_bam)

            if not align_bams:
                util.file.touch(outBam)
            else:
                # Merge BAMs, sort, and index
                tools.picard.MergeSamFilesTool().execute(
                    align_bams,
                    outBam,
                    picardOptions=[
                        'SORT_ORDER=coordinate', 'USE_THREADING=true',
                        'CREATE_INDEX=true'
                    ],
                    JVMmemory=JVMmemory)
                # only bam/cram outputs are indexed
                if outBam.endswith((".bam", ".cram")):
                    samtools.index(outBam)
                for bam in align_bams:
                    os.unlink(bam)