Example #1
    def mem(self, inReads, refDb, outAlign, options=None, min_qual=None, threads=None):
        options = [] if not options else options

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            threads = threads or util.misc.available_cpu_count()
            options.extend(('-t', str(threads)))

        if '-T' not in options:
            min_qual = min_qual or 30
            options.extend(('-T', str(min_qual)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        os.unlink(fq1)
        os.unlink(fq2)
        samtools.sort(aln_sam, outAlign, threads=threads)
        os.unlink(aln_sam)
        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
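
A minimal, hypothetical usage sketch for the method above, assuming the enclosing class is the BWA wrapper instantiated elsewhere in these examples as tools.bwa.Bwa(), and that all file paths are placeholders:

import tools.bwa

bwa = tools.bwa.Bwa()
bwa.index('ref.fasta')  # build the BWA index for the reference (placeholder path)
# align reads from an unaligned BAM, sort the result, and (because the output is a .bam) index it;
# min_qual is forwarded to bwa mem as -T, the minimum alignment score to output
bwa.mem('reads.bam', 'ref.fasta', 'out.aligned.bam', min_qual=30, threads=4)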
Example #2
    def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None,
            threads=None):
        options = [] if not options else options

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            threads = threads or util.misc.available_cpu_count()
            options.extend(('-t', str(threads)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)
        os.unlink(fq1)
        os.unlink(fq2)

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered,
                                               min_score_to_filter, options)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        os.unlink(aln_sam)
        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
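
The filter_sam_on_alignment_score helper called above is not shown in these examples. As a rough, hypothetical illustration of the idea, the sketch below keeps only records whose AS (alignment score) tag meets a threshold, using pysam; the function name, the invert argument, and the pysam dependency are assumptions rather than the project's actual implementation:

import pysam

def filter_sam_by_alignment_score(in_sam, out_sam, min_score, invert=False):
    # Keep reads whose AS tag is >= min_score (or the complement when invert=True).
    with pysam.AlignmentFile(in_sam, "r") as inf, \
         pysam.AlignmentFile(out_sam, "w", template=inf) as outf:
        for read in inf:
            score = read.get_tag("AS") if read.has_tag("AS") else float("-inf")
            keep = score >= min_score
            if keep != invert:
                outf.write(read)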
Example #3
    def mem(self,
            inReads,
            refDb,
            outAlign,
            options=None,
            min_qual=None,
            threads=None):
        options = [] if not options else options

        threads = threads or util.misc.available_cpu_count()
        samtools = tools.samtools.SamtoolsTool()
        fq1 = util.file.mkstempfname('.1.fastq')
        fq2 = util.file.mkstempfname('.2.fastq')
        aln_sam = util.file.mkstempfname('.sam')
        samtools.bam2fq(inReads, fq1, fq2)

        if '-t' not in options:
            threads = threads or util.misc.available_cpu_count()
            options.extend(('-t', str(threads)))

        if '-T' not in options:
            min_qual = min_qual or 30
            options.extend(('-T', str(min_qual)))

        self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        os.unlink(fq1)
        os.unlink(fq2)
        samtools.sort(aln_sam, outAlign, threads=threads)
        os.unlink(aln_sam)
        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
Example #4
    def mem(self, inReads, refDb, outAlign, options=None, min_score_to_filter=None,
            threads=None, invert_filter=False, should_index=True):
        options = [] if not options else options

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))

        samtools = tools.samtools.SamtoolsTool()

        aln_sam = util.file.mkstempfname('.aligned.sam')
        fastq_pipe = samtools.bam2fq_pipe(inReads)
        self.execute('mem', options + ['-p', refDb, '-'], stdout=aln_sam, stdin=fastq_pipe.stdout)

        if fastq_pipe.poll():
            raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam, aln_sam_filtered,
                                               min_score_to_filter, options, invert_filter=invert_filter)
            os.unlink(aln_sam)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        os.unlink(aln_sam_filtered)

        # cannot index sam files; only do so if a bam/cram is desired
        if should_index and (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
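
Unlike the earlier variants, this version streams reads from the input BAM into bwa mem through a pipe instead of writing intermediate FASTQ files. Below is a standalone sketch of that piping pattern using plain subprocess calls; the file paths are placeholders, and this only illustrates the technique, not the project's bam2fq_pipe() implementation:

import subprocess

# "samtools fastq" writes interleaved FASTQ to stdout, which bwa mem consumes
# directly via "-p" (smart pairing) and "-" (read from stdin).
fastq_proc = subprocess.Popen(["samtools", "fastq", "reads.bam"], stdout=subprocess.PIPE)
with open("aligned.sam", "wb") as out_sam:
    subprocess.check_call(["bwa", "mem", "-t", "4", "-p", "ref.fasta", "-"],
                          stdin=fastq_proc.stdout, stdout=out_sam)
fastq_proc.stdout.close()
if fastq_proc.wait():
    raise subprocess.CalledProcessError(fastq_proc.returncode, "samtools fastq reads.bam")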
Example #5
    def align_bam(self, inBam, refDb, outBam, options=None,
                      threads=None, JVMmemory=None):
        options = options or []

        samtools = tools.samtools.SamtoolsTool()
        threads = util.misc.sanitize_thread_count(threads)

        # fetch list of RGs
        rgs = list(samtools.getReadGroups(inBam).keys())

        if len(rgs) == 0:
            # Can't do this
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        elif len(rgs) == 1:
            # Only one RG, keep it simple
            self.align_one_rg(inBam, refDb, outBam, options=options, threads=threads)

        else:
            # Multiple RGs, align one at a time and merge
            align_bams = []

            for rg in rgs:
                tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
                self.align_one_rg(
                    inBam,
                    refDb,
                    tmp_bam,
                    rgid=rg,
                    options=options,
                    threads=threads
                )
                if not samtools.isEmpty(tmp_bam):
                    align_bams.append(tmp_bam)
                else:
                    log.warning("No alignment output for RG %s in file %s against %s", rg, inBam, refDb)

            if len(align_bams) == 0:
                with util.file.tempfname('.empty.sam') as empty_sam:
                    samtools.dumpHeader(inBam, empty_sam)
                    samtools.sort(empty_sam, outBam)
            else:
                # Merge BAMs, sort, and index
                picardOptions = ['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true']
                tools.picard.MergeSamFilesTool().execute(
                    align_bams,
                    outBam,
                    picardOptions=picardOptions,
                    JVMmemory=JVMmemory
                )
                for bam in align_bams:
                    os.unlink(bam)
Example #6
    def align_cmd(self, inReads, refDb, outAlign, options=None, threads=None):
        options = [] if not options else options

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))
        if '-2' not in options:
            options.append('-2')

        samtools = tools.samtools.SamtoolsTool()

        with util.file.tempfname('.aligned.sam') as aln_sam:
            fastq_pipe = samtools.bam2fq_pipe(inReads)
            options.extend(('-a', refDb, '-', '-o', aln_sam))
            self.execute(options, stdin=fastq_pipe.stdout)
            if fastq_pipe.wait():
                raise subprocess.CalledProcessError(fastq_pipe.returncode, "samtools.bam2fq_pipe() for {}".format(inReads))
            samtools.sort(aln_sam, outAlign, threads=threads)

        # cannot index sam files; only do so if a bam/cram is desired
        if (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
Example #7
    def mem(self,
            inReads,
            refDb,
            outAlign,
            options=None,
            min_score_to_filter=None,
            threads=None,
            invert_filter=False):
        options = [] if not options else options

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))

        samtools = tools.samtools.SamtoolsTool()

        aln_sam = util.file.mkstempfname('.aligned.sam')
        with samtools.bam2fq_tmp(inReads) as (fq1, fq2):
            self.execute('mem', options + [refDb, fq1, fq2], stdout=aln_sam)

        if min_score_to_filter:
            # Filter reads in the alignment based on their alignment score
            aln_sam_filtered = util.file.mkstempfname('.sam')
            self.filter_sam_on_alignment_score(aln_sam,
                                               aln_sam_filtered,
                                               min_score_to_filter,
                                               options,
                                               invert_filter=invert_filter)
            os.unlink(aln_sam)
        else:
            aln_sam_filtered = aln_sam

        samtools.sort(aln_sam_filtered, outAlign, threads=threads)
        os.unlink(aln_sam_filtered)

        # cannot index sam files; only do so if a bam/cram is desired
        if outAlign.endswith(".bam") or outAlign.endswith(".cram"):
            samtools.index(outAlign)
Example #8
    def scaffold(self, contigs_fasta, ref_fasta, outAlign, divergence=20, options=None, threads=None):
        options = [] if not options else options

        threads = util.misc.sanitize_thread_count(threads)
        if '-t' not in options:
            options.extend(('-t', str(threads)))
        if '-2' not in options:
            options.append('-2')

        if divergence >= 20:
            options.extend(('-x', 'asm20'))
        elif divergence >= 10:
            options.extend(('-x', 'asm10'))
        else:
            options.extend(('-x', 'asm5'))

        samtools = tools.samtools.SamtoolsTool()

        with util.file.tempfname('.aligned.sam') as aln_sam:
            options.extend(('-a', ref_fasta, contigs_fasta, '-o', aln_sam))
            self.execute(options)
            samtools.sort(aln_sam, outAlign, threads=threads)

        # cannot index sam files; only do so if a bam/cram is desired
        if (outAlign.endswith(".bam") or outAlign.endswith(".cram")):
            samtools.index(outAlign)
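
Here divergence selects a minimap2 assembly-to-reference preset (-x asm5/asm10/asm20). A hypothetical call, assuming this method lives on a minimap2 wrapper class (not shown in these examples; the tools.minimap2.Minimap2 name below is an assumption) and that the paths are placeholders:

import tools.minimap2  # assumed module/class name, not confirmed by these examples

mm2 = tools.minimap2.Minimap2()
# ~15% expected divergence falls in the >= 10 branch above, so '-x asm10' is used
mm2.scaffold('contigs.fasta', 'ref.fasta', 'scaffolded.bam', divergence=15, threads=4)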
Example #9
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            plot_x_limits,
                            plot_y_limits,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            bin_large_plots=False,
                            binning_summary_statistic="max",
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_filter=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    ''' 
        Take reads, align to the reference with BWA-MEM (or NovoAlign), and generate a coverage plot
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if not aligner_options:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-1'  # hidden option to work around kernel/cpu bug; disables multithreaded file read: https://github.com/lh3/bwa/issues/102

    samtools = tools.samtools.SamtoolsTool()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_score_to_filter=min_score_to_filter)
    elif aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam,
                ref_indexed,
                aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    aln_bam_dupe_processed = util.file.mkstempfname(
        '.filtered_dupe_processed.bam')
    if excludeDuplicates:
        opts = list(picardOptions)
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute([aln_bam],
                                                  aln_bam_dupe_processed,
                                                  dupe_removal_out_metrics,
                                                  picardOptions=opts,
                                                  JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  plot_x_limits, plot_y_limits, base_q_threshold,
                  mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, bin_large_plots,
                  binning_summary_statistic, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
Example #10
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  plot_x_limits,
                  plot_y_limits,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  bin_large_plots=False,
                  binning_summary_statistic="max",
                  out_summary=None):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # in this case we are passing through the original in_bam directly
            should_remove_sorted = False

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']  # report coverage at "absolutely all" positions
    if base_q_threshold:
        if not plot_only_non_duplicates:
            # Note: "bedtools genomecov" will count depth including duplicates, but does
            # not expose options for filtering by quality. When duplicates
            # are excluded, "samtools depth" is used which does support quality filtering
            # We use either samtools or bedtools, because the former ignores marked duplicates
            # from its depth count while bedtools includes them.
            log.warning("'-q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-Q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        if not plot_only_non_duplicates:
            log.warning("'-m' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-l' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-l", str(read_length_threshold)]

    # add option here for bedtools to report coverage w/ duplicates
    # (and then samtools for no-dups)
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot(
        )  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        if bin_large_plots:
            # Bin locations and take summary value (maximum or minimum) in each bin
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            binning_action = binning_fn.get(binning_summary_statistic, max)

            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi  # width of actual plot (sans whitespace and y axis text)
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max /
                               (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_num, (segment_name, position_depths) in enumerate(
                    segment_depths.items()):
                summary_depths_in_bins = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
                binned_segment_depths[segment_name] = summary_depths_in_bins
            segment_depths = binned_segment_depths

        # Plotting
        domain_max = 0
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(
                matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()
                ['color'])  # get the colors for this style
            segment_color = colors[
                segment_num %
                len(colors)]  # pick a color, offset by the segment index

            x_values = range(prior_domain_max, domain_max)
            x_values = [x * bin_size for x in x_values]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths,
                                               [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       antialiased=True,
                                       color=segment_color)
            elif plot_data_style == "dots":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       'ro',
                                       antialiased=True,
                                       color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if (bin_size > 1):
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=binning_summary_statistic)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format,
                                  dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
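
The binning block above condenses per-position depth into fixed-width bins, each summarized by min/max/mean/median, so that plots of long genomes stay legible at screen resolution. A standalone sketch of just that step (illustrative, not the code used above):

from statistics import mean, median

def bin_depths(position_depths, bin_size, statistic="max"):
    # Summarize consecutive runs of bin_size per-position depths into one value each.
    summary_fn = {"min": min, "max": max, "mean": mean, "median": median}.get(statistic, max)
    return [summary_fn(position_depths[i:i + bin_size])
            for i in range(0, len(position_depths), bin_size)]

# e.g. ten positions summarized into bins of four positions each
print(bin_depths([1, 3, 2, 8, 5, 5, 9, 2, 0, 1], bin_size=4))  # -> [8, 9, 1]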
Example #11
    def align_one_rg(self, inBam, refDb, outBam, rgid=None, preset=None, options=None,
                         threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file using minimap2.
            Emits alignments as a sorted, indexed bam file.
            inBam may contain additional read groups, but the input will be subset to the specified rgid.
            preset may be specified as a valid value for "minimap2 -x" which depends on the type of
                data (short accurate reads vs long noisy reads). If preset is set to None, we will autodetect
                based on the PL (platform) tag in the read group header (e.g. illumina, ont, pacbio)
        """
        options = list(options) if options else []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            tools.samtools.SamtoolsTool().dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-1', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.isEmpty(tmp_bam):
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    shutil.copyfile(tmp_bam, outBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True
                
                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # get the read group line to give to mm2
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line.rstrip("\r\n")
        if not readgroup_line:
            raise Exception("no @RG line found in the header of {}".format(inBam))
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to minimap2 to write out
        options.extend(('-R', readgroup_line.replace('\t','\\t')))

        # dynamically determine the mode of operation
        if '-x' not in options:
            if preset is None:
                platform = list(x for x in readgroup_line.split('\t') if x.startswith('PL:'))
                if len(platform) != 1:
                    raise Exception("cannot autodetect minimap2 aligner mode when PL: tag is not set in the read group header for {}: {}".format(inBam, readgroup_line))
                else:
                    platform = platform[0][3:].lower()
                    if platform == 'illumina':
                        preset = 'sr'
                    elif platform == 'ont':
                        preset = 'map-ont'
                    elif platform == 'pacbio':
                        preset = 'map-pb'
                    else:
                        raise Exception("PL: tag {} for read group {} in bam {} refers to a data type we do not know how to map with minimap2".format(platform, rgid, inBam))
            options.extend(('-x', preset))

        # perform actual alignment
        if samtools.isEmpty(one_rg_inBam):
            # minimap doesn't like empty inputs, so copy empty bam through
            samtools.sort(one_rg_inBam, outBam)
        else:
            self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)
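
The preset autodetection above inspects the PL (platform) field of the @RG header line. A standalone, illustrative sketch of that mapping (not the project's code):

def preset_from_rg_line(readgroup_line):
    # Pick a minimap2 preset from the PL: tag of an @RG header line
    # (mirrors the autodetection in align_one_rg above; illustrative only).
    platforms = [f[3:].lower() for f in readgroup_line.split('\t') if f.startswith('PL:')]
    if len(platforms) != 1:
        raise ValueError("expected exactly one PL: tag in: %r" % readgroup_line)
    presets = {'illumina': 'sr', 'ont': 'map-ont', 'pacbio': 'map-pb'}
    if platforms[0] not in presets:
        raise ValueError("no minimap2 preset known for platform %s" % platforms[0])
    return presets[platforms[0]]

print(preset_from_rg_line("@RG\tID:rg1\tSM:sample1\tPL:ILLUMINA"))  # -> sr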
Example #12
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    ''' 
        Take reads, align to reference with BWA-MEM, and generate a coverage plot
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    bwa_opts = []
    if sensitive:
        bwa_opts + "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    bwa_opts + ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')

    bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
    if excludeDuplicates:
        opts = list(picardOptions)
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered], aln_bam_dupe_processed,
            dupe_removal_out_metrics, picardOptions=opts,
            JVMmemory=JVMmemory
        )
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)
    
    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
        base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index. 
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
Example #13
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''

    # TODO: remove this:
    #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv"

    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam
        )

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']    # report coverage at "absolutely all" positions
    if base_q_threshold:
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        opts += ["-l", str(read_length_threshold)]

    samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))
            domain_max += 1

    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()    # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])    # get the colors for this style
            segment_color = colors[segment_num % len(colors)]    # pick a color, offset by the segment index

            if plot_data_style == "filled":
                plt.fill_between(
                    range(prior_domain_max, domain_max),
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(
                    range(prior_domain_max, domain_max),
                    position_depths,
                    'ro',
                    antialiased=True,
                    color=segment_color
                )

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)    #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
Example #14
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_output=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    ''' 
        Take reads, align to the reference with BWA-MEM (or NovoAlign), and generate a coverage plot
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if not aligner_options:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    samtools = tools.samtools.SamtoolsTool()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts + "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = min_score_to_output or 30
        if '-T' in bwa_opts:
            if bwa_opts.index("-T") + 1 <= len(bwa_opts):
                bwa_map_threshold = int(bwa_opts[bwa_opts.index("-T") + 1])

        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_qual=bwa_map_threshold)
    elif aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam,
                ref_indexed,
                aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    aln_bam_dupe_processed = util.file.mkstempfname(
        '.filtered_dupe_processed.bam')
    if excludeDuplicates:
        opts = list(picardOptions)
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute([aln_bam],
                                                  aln_bam_dupe_processed,
                                                  dupe_removal_out_metrics,
                                                  picardOptions=opts,
                                                  JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
Example #15
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  out_summary=None):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''

    # TODO: remove this:
    #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv"

    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']  # report coverage at "absolutely all" positions
    if base_q_threshold:
        if not plot_only_non_duplicates:
            # Note: "bedtools genomecov" will count depth including duplicates, but does
            # not expose options for filtering by quality. When duplicates
            # are excluded, "samtools depth" is used which does support quality filtering
            # We use either samtools or bedtools, because the former ignores marked duplicates
            # from its depth count while bedtools includes them.
            log.warning("'-q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-Q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        if not plot_only_non_duplicates:
            log.warning("'-m' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-l' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-l", str(read_length_threshold)]

    # add option here for bedtools to report coverage w/ duplicates
    # (and then samtools for no-dups)
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))
            domain_max += 1

    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(plt.rcParams['axes.prop_cycle'].by_key()
                          ['color'])  # get the colors for this style
            segment_color = colors[
                segment_num %
                len(colors)]  # pick a color, offset by the segment index

            if plot_data_style == "filled":
                plt.fill_between(range(prior_domain_max, domain_max),
                                 position_depths, [0] * len(position_depths),
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(range(prior_domain_max, domain_max),
                         position_depths,
                         antialiased=True,
                         color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(range(prior_domain_max, domain_max),
                         position_depths,
                         'ro',
                         antialiased=True,
                         color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format,
                    dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)