Exemplo n.º 1
0
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            plot_x_limits,
                            plot_y_limits,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            bin_large_plots=False,
                            binning_summary_statistic="max",
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_filter=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    ''' Take reads, align to reference with BWA-MEM or Novoalign, and
        generate a coverage plot.

        The reference is copied to a temporary file and indexed there. The
        reads are aligned, optionally passed through Picard MarkDuplicates,
        sorted and indexed with samtools, and handed to plot_coverage().

        Args:
            out_plot_file, plot_format, plot_data_style, plot_style,
            plot_width, plot_height, plot_dpi, plot_title, plot_x_limits,
            plot_y_limits, base_q_threshold, mapping_q_threshold,
            max_coverage_depth, read_length_threshold, bin_large_plots,
            binning_summary_statistic, out_summary:
                forwarded unchanged to plot_coverage().
            in_bam: reads to align.
            ref_fasta: reference sequence; only a temporary copy is indexed.
            out_bam: if given, the sorted alignment is written here and kept;
                otherwise a temporary file is used and deleted afterwards.
            sensitive: when aligning with bwa, append
                "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
            excludeDuplicates: run Picard MarkDuplicatesTool on the alignment
                (with picardOptions) before sorting; also forwarded to
                plot_coverage().
            JVMmemory: forwarded to the Picard and Novoalign tool wrappers.
            picardOptions: iterable of options for MarkDuplicatesTool; None
                is treated as no extra options.
            min_score_to_filter: forwarded to bwa.align_mem_bam().
            aligner: "bwa" or "novoalign".
            aligner_options: whitespace-separated option string; None selects
                a per-aligner default.
            novoalign_license_path: forwarded to NovoalignTool.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-1'  # hidden option to work around kernel/cpu bug; disables multithreaded file read: https://github.com/lh3/bwa/issues/102

    samtools = tools.samtools.SamtoolsTool()

    # Index a private copy so no index files are written next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_score_to_filter=min_score_to_filter)
    elif aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam,
                ref_indexed,
                aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions or [])
        # Only request the extra temp files when they are actually used, so
        # no unused temp name is left behind on the non-dedup path.
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute([aln_bam],
                                                  aln_bam_dupe_processed,
                                                  dupe_removal_out_metrics,
                                                  picardOptions=opts,
                                                  JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)
        # The metrics report is not used by this function; drop it as well.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  plot_x_limits, plot_y_limits, base_q_threshold,
                  mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, bin_large_plots,
                  binning_summary_statistic, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta file to be
    # removed (the caller's ref_fasta is never touched)
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
Exemplo n.º 2
0
def align_and_fix(inBam,
                  refFasta,
                  outBamAll=None,
                  outBamFiltered=None,
                  aligner_options='',
                  aligner="novoalign",
                  JVMmemory=None,
                  threads=1,
                  skip_mark_dupes=False,
                  gatk_path=None,
                  novoalign_license_path=None):
    ''' Take reads, align to reference with Novoalign or BWA, optionally mark
        duplicates with Picard, realign indels with GATK, and optionally
        filter the final file to mapped/non-dupe reads.

        Args:
            inBam: reads to align.
            refFasta: reference sequence; a temporary copy is indexed and
                used, so no index files are written next to the original.
            outBamAll: if set, receives a copy of the realigned BAM, which is
                then indexed with Picard BuildBamIndexTool.
            outBamFiltered: if set, receives the realigned BAM filtered with
                "samtools view -b -q 1 -F 1028", then indexed.
            aligner_options: whitespace-separated option string; None selects
                a per-aligner default.
            aligner: "novoalign" or "bwa".
            JVMmemory: forwarded to the Novoalign, Picard and GATK wrappers.
            threads: forwarded to GATK local_realign.
            skip_mark_dupes: when true, skip Picard MarkDuplicatesTool.
            gatk_path: forwarded to GATKTool.
            novoalign_license_path: forwarded to NovoalignTool.

        Returns None. If neither output path is given, a warning is logged
        and nothing is done.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias of Logger.warning.
        log.warning("are you sure you meant to do nothing?")
        return

    assert aligner in ["novoalign", "bwa"]

    refFastaCopy = mkstempfname('.ref_copy.fasta')
    shutil.copyfile(refFasta, refFastaCopy)

    tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy,
                                                        overwrite=True)
    tools.samtools.SamtoolsTool().faidx(refFastaCopy, overwrite=True)

    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random'
        elif aligner == 'bwa':
            aligner_options = ''  # use defaults

    bam_aligned = mkstempfname('.aligned.bam')
    if aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(refFastaCopy)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                inBam,
                refFastaCopy,
                bam_aligned,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)
    elif aligner == 'bwa':
        bwa = tools.bwa.Bwa()
        bwa.index(refFastaCopy)

        opts = aligner_options.split()

        bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, options=opts)

    if skip_mark_dupes:
        # bam_marked aliases bam_aligned here; it is unlinked once, below.
        bam_marked = bam_aligned
    else:
        bam_marked = mkstempfname('.mkdup.bam')
        tools.picard.MarkDuplicatesTool().execute(
            [bam_aligned],
            bam_marked,
            picardOptions=['CREATE_INDEX=true'],
            JVMmemory=JVMmemory)
        os.unlink(bam_aligned)

    tools.samtools.SamtoolsTool().index(bam_marked)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool(path=gatk_path).local_realign(bam_marked,
                                                      refFastaCopy,
                                                      bam_realigned,
                                                      JVMmemory=JVMmemory,
                                                      threads=threads)
    os.unlink(bam_marked)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        # -q 1: keep MAPQ >= 1; -F 1028: drop unmapped (4) and duplicate (1024) reads
        tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'],
                                           bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)

    # Remove the temporary reference copy together with any files sharing its
    # stem (presumably the dictionary/index files the tools above wrote next
    # to it; their exact names are not visible here). This assumes
    # mkstempfname returns a unique file name, so the prefix cannot match
    # files belonging to anyone else.
    ref_dir, ref_name = os.path.split(refFastaCopy)
    ref_prefix = os.path.splitext(ref_name)[0] + '.'
    ref_dir = ref_dir or '.'
    for entry in os.listdir(ref_dir):
        candidate = os.path.join(ref_dir, entry)
        if entry.startswith(ref_prefix) and os.path.isfile(candidate):
            os.unlink(candidate)
Exemplo n.º 3
0
def align_and_plot_coverage(out_plot_file,
                            plot_format,
                            plot_data_style,
                            plot_style,
                            plot_width,
                            plot_height,
                            plot_dpi,
                            plot_title,
                            base_q_threshold,
                            mapping_q_threshold,
                            max_coverage_depth,
                            read_length_threshold,
                            out_summary,
                            in_bam,
                            ref_fasta,
                            out_bam=None,
                            sensitive=False,
                            excludeDuplicates=False,
                            JVMmemory=None,
                            picardOptions=None,
                            min_score_to_output=None,
                            aligner="bwa",
                            aligner_options='',
                            novoalign_license_path=None):
    ''' Take reads, align to reference with BWA-MEM or Novoalign, and
        generate a coverage plot.

        The reference is copied to a temporary file and indexed there. The
        reads are aligned, optionally passed through Picard MarkDuplicates,
        sorted and indexed with samtools, and handed to plot_coverage().

        Args:
            out_plot_file, plot_format, plot_data_style, plot_style,
            plot_width, plot_height, plot_dpi, plot_title, base_q_threshold,
            mapping_q_threshold, max_coverage_depth, read_length_threshold,
            out_summary:
                forwarded unchanged to plot_coverage().
            in_bam: reads to align.
            ref_fasta: reference sequence; only a temporary copy is indexed.
            out_bam: if given, the sorted alignment is written here and kept;
                otherwise a temporary file is used and deleted afterwards.
            sensitive: when aligning with bwa, append
                "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
            excludeDuplicates: run Picard MarkDuplicatesTool on the alignment
                (with picardOptions) before sorting; also forwarded to
                plot_coverage().
            JVMmemory: forwarded to the Picard and Novoalign tool wrappers.
            picardOptions: iterable of options for MarkDuplicatesTool; None
                is treated as no extra options.
            min_score_to_output: bwa only; passed to bwa.align_mem_bam() as
                min_qual. Falsy values fall back to 30, and a "-T <n>" pair
                in aligner_options takes precedence.
            aligner: "bwa" or "novoalign".
            aligner_options: whitespace-separated option string; None selects
                a per-aligner default.
            novoalign_license_path: forwarded to NovoalignTool.
    '''

    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    samtools = tools.samtools.SamtoolsTool()

    # Index a private copy so no index files are written next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()

        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            # Must be an in-place extend: a bare "bwa_opts + [...]" builds a
            # new list and throws it away, silently ignoring `sensitive`.
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        # get the quality threshold from the opts
        # for downstream filtering
        # NOTE(review): `or` also maps an explicit 0 to 30 -- confirm intended.
        bwa_map_threshold = min_score_to_output or 30
        if '-T' in bwa_opts:
            threshold_pos = bwa_opts.index("-T") + 1
            # Strict "<": a trailing "-T" with no value must not be indexed.
            if threshold_pos < len(bwa_opts):
                bwa_map_threshold = int(bwa_opts[threshold_pos])

        bwa.align_mem_bam(in_bam,
                          ref_indexed,
                          aln_bam,
                          options=bwa_opts,
                          min_qual=bwa_map_threshold)
    elif aligner == "novoalign":

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).index_fasta(ref_indexed)

        tools.novoalign.NovoalignTool(
            license_path=novoalign_license_path).execute(
                in_bam,
                ref_indexed,
                aln_bam,
                options=aligner_options.split(),
                JVMmemory=JVMmemory)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions or [])
        # Only request the extra temp files when they are actually used, so
        # no unused temp name is left behind on the non-dedup path.
        aln_bam_dupe_processed = util.file.mkstempfname(
            '.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute([aln_bam],
                                                  aln_bam_dupe_processed,
                                                  dupe_removal_out_metrics,
                                                  picardOptions=opts,
                                                  JVMmemory=JVMmemory)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)
        # The metrics report is not used by this function; drop it as well.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, excludeDuplicates, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta file to be
    # removed (the caller's ref_fasta is never touched)
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
Exemplo n.º 4
0
def align_and_fix(
    inBam, refFasta,
    outBamAll=None,
    outBamFiltered=None,
    aligner_options='',
    aligner="novoalign",
    JVMmemory=None,
    threads=1,
    gatk_path=None,
    novoalign_license_path=None
):
    ''' Take reads, align to reference with Novoalign or BWA, mark duplicates
        with Picard, realign indels with GATK, and optionally filter
        final file to mapped/non-dupe reads.

        Args:
            inBam: reads to align.
            refFasta: reference sequence; a temporary copy is indexed and
                used, so no index files are written next to the original.
            outBamAll: if set, receives a copy of the realigned BAM, which is
                then indexed with Picard BuildBamIndexTool.
            outBamFiltered: if set, receives the realigned BAM filtered with
                "samtools view -b -q 1 -F 1028", then indexed.
            aligner_options: whitespace-separated option string; None selects
                a per-aligner default. For bwa, a "-T <n>" pair also sets the
                min_qual passed to bwa.align_mem_bam() (default 30).
            aligner: "novoalign" or "bwa".
            JVMmemory: forwarded to the Novoalign, Picard and GATK wrappers.
            threads: forwarded to GATK local_realign.
            gatk_path: forwarded to GATKTool.
            novoalign_license_path: forwarded to NovoalignTool.

        Returns None. If neither output path is given, a warning is logged
        and nothing is done.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias of Logger.warning.
        log.warning("are you sure you meant to do nothing?")
        return

    assert aligner in ["novoalign", "bwa"]

    refFastaCopy = mkstempfname('.ref_copy.fasta')
    shutil.copyfile(refFasta, refFastaCopy)

    tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy, overwrite=True)
    tools.samtools.SamtoolsTool().faidx(refFastaCopy, overwrite=True)

    if aligner_options is None:
        if aligner=="novoalign":
            aligner_options = '-r Random'
        elif aligner=='bwa':
            aligner_options = '-T 30' # quality threshold

    bam_aligned = mkstempfname('.aligned.bam')
    if aligner=="novoalign":

        tools.novoalign.NovoalignTool(license_path=novoalign_license_path).index_fasta(refFastaCopy)

        tools.novoalign.NovoalignTool(license_path=novoalign_license_path).execute(
            inBam, refFastaCopy, bam_aligned,
            options=aligner_options.split(),
            JVMmemory=JVMmemory
        )
    elif aligner=='bwa':
        bwa = tools.bwa.Bwa()
        bwa.index(refFastaCopy)

        opts = aligner_options.split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = 30
        if '-T' in opts:
            threshold_pos = opts.index("-T") + 1
            # Strict "<": a trailing "-T" with no value must not be indexed.
            if threshold_pos < len(opts):
                bwa_map_threshold = int(opts[threshold_pos])

        bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, options=opts, min_qual=bwa_map_threshold)

    bam_mkdup = mkstempfname('.mkdup.bam')
    tools.picard.MarkDuplicatesTool().execute(
        [bam_aligned], bam_mkdup, picardOptions=['CREATE_INDEX=true'],
        JVMmemory=JVMmemory
    )
    os.unlink(bam_aligned)

    tools.samtools.SamtoolsTool().index(bam_mkdup)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool(path=gatk_path).local_realign(bam_mkdup, refFastaCopy, bam_realigned, JVMmemory=JVMmemory, threads=threads)
    os.unlink(bam_mkdup)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        # -q 1: keep MAPQ >= 1; -F 1028: drop unmapped (4) and duplicate (1024) reads
        tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)

    # Remove the temporary reference copy together with any files sharing its
    # stem (presumably the dictionary/index files the tools above wrote next
    # to it; their exact names are not visible here). This assumes
    # mkstempfname returns a unique file name, so the prefix cannot match
    # files belonging to anyone else.
    ref_dir, ref_name = os.path.split(refFastaCopy)
    ref_prefix = os.path.splitext(ref_name)[0] + '.'
    ref_dir = ref_dir or '.'
    for entry in os.listdir(ref_dir):
        candidate = os.path.join(ref_dir, entry)
        if entry.startswith(ref_prefix) and os.path.isfile(candidate):
            os.unlink(candidate)
Exemplo n.º 5
0
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    ''' Take reads, align to reference with BWA-MEM, and generate a coverage plot

        The reference is copied to a temporary file and indexed there. The
        reads are aligned, optionally passed through Picard MarkDuplicates,
        sorted and indexed with samtools, and handed to plot_coverage().

        Args:
            out_plot_file, plot_format, plot_data_style, plot_style,
            plot_width, plot_height, plot_dpi, plot_title, base_q_threshold,
            mapping_q_threshold, max_coverage_depth, read_length_threshold,
            out_summary:
                forwarded unchanged to plot_coverage().
            in_bam: reads to align.
            ref_fasta: reference sequence; only a temporary copy is indexed.
            out_bam: if given, the sorted alignment is written here and kept;
                otherwise a temporary file is used and deleted afterwards.
            sensitive: append "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa options.
            excludeDuplicates: run Picard MarkDuplicatesTool on the alignment
                (with picardOptions) before sorting; also forwarded to
                plot_coverage().
            JVMmemory: forwarded to the Picard tool wrapper.
            picardOptions: iterable of options for MarkDuplicatesTool; None
                is treated as no extra options.
            min_score_to_output: passed to bwa.align_mem_bam() as min_qual;
                falsy values fall back to 30.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    # Index a private copy so no index files are written next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    bwa_opts = []
    if sensitive:
        # Must be an in-place extend: a bare "bwa_opts + [...]" builds a new
        # list and throws it away, silently ignoring `sensitive`.
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    # NOTE(review): `or` also maps an explicit 0 to 30 -- confirm intended.
    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname('.bam')

    bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts, min_qual=map_threshold)

    if excludeDuplicates:
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions or [])
        # Only request the extra temp files when they are actually used, so
        # no unused temp name is left behind on the non-dedup path.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed,
            dupe_removal_out_metrics, picardOptions=opts,
            JVMmemory=JVMmemory
        )
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)
        # The metrics report is not used by this function; drop it as well.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
        base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta file to be
    # removed (the caller's ref_fasta is never touched)
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)