Пример #1
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variants on the input BAMs with GATK's HaplotypeCaller.

    The output defaults to ``<first BAM base>-variants.vcf.gz`` and an
    existing output file is reused untouched.  After preparing the GATK
    parameters, the runner is asserted to report a "restricted" GATK type
    (the full, non open-source GATK).  If any input BAM lacks aligned reads
    in ``region``, an empty VCF is written instead of running the caller.

    Returns the path to the output file.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    # Nothing to do when a previous run already produced the output.
    if utils.file_exists(out_file):
        return out_file
    config = items[0]["config"]
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, config, assoc_files["dbsnp"], region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    if any(not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "-o", tx_out_file,
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        # Enable hardware based optimizations in GATK 3.1+
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        if gatk_version >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Пример #2
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    The output defaults to ``<first BAM base>-paired-variants.vcf``; an
    existing output file is reused.  When ``region`` is not a list/tuple and
    any input BAM has no aligned reads in it, an empty VCF is written instead
    of running MuTect.

    Returns the path to the output VCF on every code path.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file)

        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the path like every other exit.  A bare ``return``
            # here gave callers None on the first run but ``out_file`` on a
            # re-run (once the empty VCF existed).
            return out_file

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]

            broad_runner.run_mutect(params)

    return out_file
Пример #3
0
def shared_variantcall(call_fn,
                       name,
                       align_bams,
                       ref_file,
                       config,
                       assoc_files,
                       region=None,
                       out_file=None):
    """Index the input BAMs and run ``call_fn`` to produce a VCF.

    Every BAM is passed through the runner's ``picard_index`` step first.
    The output defaults to ``<first BAM base>-variants.vcf`` and an existing
    output is reused.  An empty VCF is written when the subset target regions
    name a file that does not exist, or when any BAM lacks aligned reads in
    ``region``; otherwise ``call_fn`` writes into a transactional output.

    Returns the path to the output VCF.
    """
    runner = broad.runner_from_config(config)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file
    first_bam_name = os.path.basename(align_bams[0])
    logger.info("Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=first_bam_name))
    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region,
                                            out_file)
    # Target regions given as a path that is not an existing file.
    regions_file_missing = (variant_regions is not None
                            and isinstance(target_regions, basestring)
                            and not os.path.isfile(target_regions))
    if regions_file_missing or any(
            not realign.has_aligned_reads(bam_file, region)
            for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
Пример #4
0
def unified_genotyper(align_bams,
                      ref_file,
                      config,
                      dbsnp=None,
                      region=None,
                      out_file=None):
    """Genotype the given alignment files with GATK's UnifiedGenotyper.

    The output defaults to ``<first BAM base>-variants.vcf`` and an existing
    output is reused.  When ``region`` is not a list/tuple and any BAM lacks
    aligned reads in it, an empty VCF is written instead of running GATK.

    Returns the path to the output VCF.
    """
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf" % bam_base
    if file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, config, dbsnp, region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and any(
            not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
Пример #5
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
        snp_file, intervals):
    """Step 1 of GATK recalibration: build the table of covariates.

    Writes ``<BAM base>.grp`` next to the input BAM and reuses it when it
    already exists.  A BAM without aligned reads gets a placeholder file
    containing a single comment line instead of a GATK run.

    Returns the path to the recalibration table.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "BaseRecalibrator",
                         "-o", tx_out_file,
                         "-I", dup_align_bam,
                         "-R", ref_file]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if not broad_runner.has_gatk_full():
                gatk_args.append("--disable_indel_quals")
            if snp_file:
                gatk_args.extend(["--knownSites", snp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
Пример #6
0
def write_recal_bam(data, region=None, out_file=None):
    """Step 2 of GATK recalibration: rewrite the BAM using the covariates.

    The output defaults to ``<work_bam base>-gatkrecal.bam``.  The special
    region ``"nochr"`` is handled by ``write_nochr_reads``; every other region
    goes through ``_run_recal_bam``.  When the ``quality_bin`` algorithm
    setting is ``True``, ``"postrecal"`` or a list containing ``"postrecal"``
    and the result has aligned reads, the BAM is replaced by its
    quality-binned version.

    Updates ``data["work_bam"]`` and returns ``[data]``.
    """
    config = data["config"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0]
    logger.info("Writing recalibrated BAM for %s to %s" %
                (data["name"], out_file))
    out_bam = (write_nochr_reads(data["work_bam"], out_file)
               if region == "nochr"
               else _run_recal_bam(data["work_bam"], data["prep_recal"],
                                   region, data["sam_ref"], out_file, config))
    qual_bin = config["algorithm"].get("quality_bin", None)
    wants_postrecal_binning = (
        qual_bin is True
        or qual_bin == "postrecal"
        or (isinstance(qual_bin, list) and "postrecal" in qual_bin))
    if wants_postrecal_binning and has_aligned_reads(out_bam):
        binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"],
                                            os.path.dirname(out_bam), config)
        # Keep the unbinned BAM under a side name, then put the binned one
        # in its place.
        unbinned_backup = out_bam + ".binned"
        shutil.move(out_bam, unbinned_backup)
        shutil.move(binned_bam, out_bam)
        utils.save_diskspace(unbinned_backup,
                             "Quality binned to %s" % out_bam, config)
    data["work_bam"] = out_bam
    return [data]
Пример #7
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.

    The bcftools sub-command is picked from the installed bcftools version;
    a bcftools newer than 0.1.19 combined with samtools 0.1.19 or older raises
    ``ValueError``.  Output is piped through ``bgzip`` when ``out_file`` ends
    in ``gz``.  If none of the BAMs has aligned reads in ``target_regions``
    an empty VCF is written instead of running the pipeline.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Do not name this local ``vcfutils``: that would shadow the vcfutils
    # module for the whole function and make ``vcfutils.write_empty_vcf``
    # below an attribute lookup on the program path string.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_pl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit arguments rather than **locals() so the template cannot
        # silently pick up (or lose) a name when locals are renamed.
        full_cmd = cmd.format(mpileup=mpileup,
                              bcftools=bcftools,
                              bcftools_opts=bcftools_opts,
                              vcfutils_pl=vcfutils_pl,
                              max_read_depth=max_read_depth,
                              compress_cmd=compress_cmd,
                              out_file=out_file)
        logger.info(full_cmd)
        do.run(full_cmd, "Variant calling with samtools", {})
Пример #8
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect calls are written to a ``-mutect`` side file derived from
    ``out_file`` (default ``<first BAM base>-paired-variants.vcf.gz``).
    Indels are then added:

    - With Scalpel (the path always taken while ``disable_SID`` is True):
      if Scalpel is installed its calls are combined with the MuTect calls
      into ``out_file``; otherwise the MuTect file is linked to ``out_file``.
    - With SomaticIndelDetector (only for "appistry" MuTect versions when
      SID is enabled): its calls are combined with the MuTect calls.

    When ``region`` is not a list/tuple and any BAM has no aligned reads in
    it, an empty VCF is written to ``out_file`` instead.

    Returns the path to the final VCF on every code path.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        # NOTE(review): this runner is immediately superseded by the one
        # _mutect_call_prep returns; the call is kept in case constructing
        # the runner has side effects -- confirm and drop if not.
        broad_runner = broad.runner_from_config(config, "mutect")
        has_vcf_ext = "vcf" in out_file
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if has_vcf_ext else out_file + "-mutect.vcf")
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if has_vcf_ext else out_file + "-somaticIndels.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit; a bare ``return`` here
            # gave callers None even though the empty VCF had been written.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            if scalpel.is_installed(config):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
    return out_file
Пример #9
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
        dbsnp_file, intervals):
    """Step 1 of GATK recalibration: build the table of covariates.

    Writes ``<BAM base>.grp`` (reused when present) and asks GATK to put its
    plots in ``<BAM base>-plots.pdf``.  Reads are downsampled when
    ``_get_downsample_pct`` returns a fraction.  A BAM without aligned reads
    in ``intervals`` gets a placeholder file with a single comment line.

    Returns the path to the recalibration table.
    """
    bam_base = os.path.splitext(dup_align_bam)[0]
    out_file = "%s.grp" % bam_base
    plot_file = "%s-plots.pdf" % bam_base
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "BaseRecalibrator",
                         "-o", tx_out_file,
                         "--plot_pdf_file", plot_file,
                         "-I", dup_align_bam,
                         "-R", ref_file]
            downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
            if downsample_pct:
                gatk_args.extend(["--downsample_to_fraction", str(downsample_pct),
                                  "--downsampling_type", "ALL_READS"])
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                gatk_args.append("--disable_indel_quals")
            if dbsnp_file:
                gatk_args.extend(["--knownSites", dbsnp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
Пример #10
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration: build the table of covariates.

    GATK 4 runs use local multicore Spark:
    https://github.com/broadinstitute/gatk/issues/2345

    GATK 3 runs downsample large inputs: whole genome BAM files take an
    excessively long time to recalibrate and extra reads stop helping beyond
    a certain point.  See the 'Downsampling analysis' plots in the GATK
    documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    Writes ``<BAM base>-recal.grp`` (reused when present).  A BAM without
    aligned reads in ``intervals`` gets a placeholder file with a single
    comment line.  Returns the path to the recalibration table.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s-recal.grp" % os.path.splitext(dup_align_bam)[0]
    if utils.file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ("restricted", "gatk4"), \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        gatk_args = ["-I", dup_align_bam]
        if gatk_type == "gatk4":
            spark_master = "local[%s]" % dd.get_num_cores(data)
            gatk_args.extend(["-T", "BaseRecalibratorSpark",
                              "--sparkMaster", spark_master,
                              "--output", tx_out_file,
                              "--reference", dd.get_ref_twobit(data)])
        else:
            gatk_args.extend(["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "-R", ref_file])
            downsample_pct = bam.get_downsample_pct(
                dup_align_bam, target_counts, data)
            if downsample_pct:
                gatk_args.extend(["--downsample_to_fraction", str(downsample_pct),
                                  "--downsampling_type", "ALL_READS"])
            if platform.lower() == "solid":
                gatk_args.extend(["--solid_nocall_strategy", "PURGE_READ",
                                  "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
        if dbsnp_file:
            gatk_args.extend(["--knownSites", dbsnp_file])
        if intervals:
            gatk_args.extend(["-L", intervals,
                              "--interval_set_rule", "INTERSECTION"])
        # GATK is given the transactional output's directory as its second
        # argument.
        broad_runner.run_gatk(gatk_args, os.path.dirname(tx_out_file))
    return out_file
Пример #11
0
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variants on the input BAMs with GATK's HaplotypeCaller.

    The output defaults to ``<first BAM base>-variants.vcf`` and an existing
    output is reused.  The runner is asserted to report a "restricted" GATK
    type (the full, non open-source GATK).  If any BAM lacks aligned reads in
    ``region`` an empty VCF is written instead of running the caller.

    Returns the path to the output VCF.
    """
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf" % bam_base
    if file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    if any(not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "-o", tx_out_file]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Пример #12
0
def unified_genotyper(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """Genotype the given alignment files with GATK's UnifiedGenotyper.

    The output defaults to ``<first BAM base>-variants.vcf`` and an existing
    output is reused.  When ``region`` is not a list/tuple and any BAM lacks
    aligned reads in it, an empty VCF is written instead.  Ploidy comes from
    ``ploidy.get_ploidy`` for "restricted" GATK and is fixed at 2 otherwise.

    Returns the path to the output VCF.
    """
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf" % bam_base
    if file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and any(
            not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "-ploidy", ploidy_arg,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
Пример #13
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            snp_file, intervals):
    """Step 1 of GATK recalibration: build the table of covariates.

    Writes ``<BAM base>.grp`` next to the input BAM and reuses it when it
    already exists.  A BAM without aligned reads gets a placeholder file
    containing a single comment line instead of a GATK run.

    Returns the path to the recalibration table.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "BaseRecalibrator",
                         "-o", tx_out_file,
                         "-I", dup_align_bam,
                         "-R", ref_file]
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if not broad_runner.has_gatk_full():
                gatk_args.append("--disable_indel_quals")
            if snp_file:
                gatk_args.extend(["--knownSites", snp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
Пример #14
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    The output defaults to ``<first BAM base>-paired-variants.vcf``; an
    existing output file is reused.  When ``region`` is not a list/tuple and
    any input BAM has no aligned reads in it, an empty VCF is written instead
    of running MuTect.

    Returns the path to the output VCF on every code path.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file)

        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the path like every other exit.  A bare ``return``
            # here gave callers None on the first run but ``out_file`` on a
            # re-run (once the empty VCF existed).
            return out_file

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]

            broad_runner.run_mutect(params)

    return out_file
Пример #15
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variants on the input BAMs with GATK's HaplotypeCaller.

    The output defaults to ``<first BAM base>-variants.vcf.gz`` and an
    existing output is reused.  The runner is asserted to report a
    "restricted" GATK type (the full, non open-source GATK).  If any BAM
    lacks aligned reads in ``region`` an empty VCF is written instead of
    running the caller.

    Returns the path to the output file.
    """
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if file_exists(out_file):
        return out_file
    config = items[0]["config"]
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, config, assoc_files["dbsnp"], region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    if any(not has_aligned_reads(bam_file, region) for bam_file in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller", "-o", tx_out_file]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Пример #16
0
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Index the input BAMs, run ``call_fn`` and annotate the resulting VCF.

    Every BAM is passed through the runner's ``picard_index`` step first.
    The raw output defaults to ``<first BAM base>-variants.vcf`` and is
    reused when present.  An empty VCF is written when the subset target
    regions name a file that does not exist, or when any BAM lacks aligned
    reads in ``region``.  The raw VCF is always passed to
    ``annotation.annotate_nongatk_vcf``.

    Returns the value returned by the annotation step.
    """
    runner = broad.runner_from_config(config)
    for bam_file in align_bams:
        runner.run_fn("picard_index", bam_file)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    needs_calling = not file_exists(out_file)
    if needs_calling:
        first_bam_name = os.path.basename(align_bams[0])
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # Target regions given as a path that is not an existing file.
        regions_file_missing = (variant_regions is not None
                                and isinstance(target_regions, basestring)
                                and not os.path.isfile(target_regions))
        if regions_file_missing or any(
                not realign.has_aligned_reads(bam_file, region)
                for bam_file in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions,
                        tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.dbsnp, ref_file, config)
Пример #17
0
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.

    Returns a ``(bed_file, flag)`` tuple.  The flag is False for the small
    region shortcut and, in the final fallback, when variant regions were
    supplied; presumably it tells the caller whether coverage still has to
    be calculated -- TODO confirm against the caller.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif isinstance(ready_region, (list, tuple)):
        # Must be tested before os.path.isfile: passing a list/tuple to
        # os.path.isfile raises TypeError, which previously made this
        # coordinate branch unreachable.
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
Пример #18
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Index the input BAMs, run ``call_fn`` and annotate the resulting VCF.

    The raw output defaults to a batch-named ``-paired-variants.vcf`` for
    paired analyses and to ``<first BAM base>-variants.vcf`` otherwise; an
    existing raw output is reused.  An empty VCF is written when the subset
    target regions name a file that does not exist, or when any BAM lacks
    aligned reads in ``region``.  The raw VCF is always passed to
    ``annotation.annotate_nongatk_vcf``.

    Returns the value returned by the annotation step.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            # NOTE(review): "metdata" looks like a typo for "metadata" and
            # would raise KeyError unless config really has that key --
            # confirm the intended key and where the batch name lives.
            batch_name = config["metdata"]["batch"]
            out_file = "%s-paired-variants.vcf" % batch_name
        else:
            bam_base = os.path.splitext(align_bams[0])[0]
            out_file = "%s-variants.vcf" % bam_base
    needs_calling = not file_exists(out_file)
    if needs_calling:
        first_bam_name = os.path.basename(align_bams[0])
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=first_bam_name))
        for bam_file in align_bams:
            bam.index(bam_file, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        # Target regions given as a path that is not an existing file.
        regions_file_missing = (variant_regions is not None
                                and isinstance(target_regions, basestring)
                                and not os.path.isfile(target_regions))
        if regions_file_missing or any(
                not realign.has_aligned_reads(bam_file, region)
                for bam_file in align_bams):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files["dbsnp"], ref_file,
                                           config)
Пример #19
0
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform,
        snp_file, intervals):
    """Step 1 of GATK recalibration: count covariates.

    Runs GATK's CountCovariates with the read group, quality score, cycle
    and dinucleotide covariates, writing ``<BAM base>.recal`` (reused when
    present).  A BAM without aligned reads gets a placeholder file containing
    a single comment line instead of a GATK run.

    Returns the path to the recalibration file.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            gatk_args = ["-T", "CountCovariates"]
            for covariate in ("ReadGroupCovariate", "QualityScoreCovariate",
                              "CycleCovariate", "DinucCovariate"):
                gatk_args.extend(["-cov", covariate])
            gatk_args.extend(["-recalFile", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              "-l", "INFO",
                              "-U",
                              "-OQ",
                              "--default_platform", platform])
            if snp_file:
                gatk_args.extend(["--knownSites", snp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
Пример #20
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration: build the table of covariates.

    Large whole genome BAM files take an excessively long time to recalibrate
    and extra reads stop helping beyond a certain point, so large inputs are
    downsampled.  See the 'Downsampling analysis' plots in the GATK
    documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    Writes ``<BAM base>.grp`` (reused when present).  A BAM without aligned
    reads in ``intervals`` gets a placeholder file with a single comment
    line.  Returns the path to the recalibration table.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with tx_tmpdir(data) as tmp_dir:
        with file_transaction(data, out_file) as tx_out_file:
            gatk_args = ["-T", "BaseRecalibrator",
                         "-o", tx_out_file,
                         "-I", dup_align_bam,
                         "-R", ref_file]
            downsample_pct = bam.get_downsample_pct(
                dup_align_bam, target_counts, data)
            if downsample_pct:
                gatk_args.extend(["--downsample_to_fraction", str(downsample_pct),
                                  "--downsampling_type", "ALL_READS"])
            if platform.lower() == "solid":
                gatk_args.extend(["--solid_nocall_strategy", "PURGE_READ",
                                  "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                gatk_args.append("--disable_indel_quals")
            if dbsnp_file:
                gatk_args.extend(["--knownSites", dbsnp_file])
            if intervals:
                gatk_args.extend(["-L", intervals,
                                  "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(gatk_args, tmp_dir)
    return out_file
Пример #21
0
def pipeline_summary(data):
    """Attach QC summary output to a processed sample.

    The summary is produced only when the sample has a reference
    (``sam_ref`` is not None), its ``work_bam`` is a ``.bam`` path, and
    has_aligned_reads reports aligned reads in it. The result of
    _run_qc_tools is stored under ``data["summary"]``.

    Returns the sample wrapped as ``[[data]]`` in every case.
    """
    bam_path = data.get("work_bam")
    have_reference = data["sam_ref"] is not None
    if (have_reference and bam_path and bam_path.endswith(".bam")
            and has_aligned_reads(bam_path)):
        sample_name = str(data["name"])
        logger.info("Generating summary files: %s" % sample_name)
        data["summary"] = _run_qc_tools(bam_path, data)
    return [[data]]
Пример #22
0
def pipeline_summary(data):
    """Generate QC summary files for a sample when it has usable alignments.

    Nothing is generated unless ``sam_ref`` is set, ``work_bam`` is a
    non-empty path ending in ``.bam`` and has_aligned_reads accepts it.
    On success the output of _run_qc_tools is saved as ``data["summary"]``.

    Returns ``[[data]]`` whether or not a summary was produced.
    """
    work_bam = data.get("work_bam")
    unchanged = [[data]]
    # Bail out early on each precondition, in the order they must be checked.
    if data["sam_ref"] is None:
        return unchanged
    if not work_bam or not work_bam.endswith(".bam"):
        return unchanged
    if has_aligned_reads(work_bam):
        logger.info("Generating summary files: {0}".format(str(data["name"])))
        data["summary"] = _run_qc_tools(work_bam, data)
    return [[data]]
Пример #23
0
def pipeline_summary(data):
    """Run QC summary tools on the sample's BAM file.

    The BAM is taken from ``work_bam`` when the ``merge_bamprep`` algorithm
    option is truthy (the default) and from ``callable_bam`` otherwise.
    A summary is stored in ``data["summary"]`` only if ``sam_ref`` is set
    and has_aligned_reads reports reads in the chosen BAM.

    Returns ``[[data]]``.
    """
    if data["config"]["algorithm"].get("merge_bamprep", True):
        bam_key = "work_bam"
    else:
        bam_key = "callable_bam"
    summary_bam = data.get(bam_key)
    has_reference = data["sam_ref"] is not None
    if has_reference and summary_bam and has_aligned_reads(summary_bam):
        logger.info("Generating summary files: %s" % str(data["name"]))
        data["summary"] = _run_qc_tools(summary_bam, data)
    return [[data]]
Пример #24
0
def pipeline_summary(data):
    """Produce summary information for a processed sample.

    Uses ``work_bam`` when ``merge_bamprep`` is enabled in the algorithm
    configuration (default True), otherwise ``callable_bam``. The sample is
    left untouched when there is no reference, no BAM, or no aligned reads.

    Returns ``[[data]]``; ``data["summary"]`` is filled by _run_qc_tools
    when the checks pass.
    """
    use_merged = data["config"]["algorithm"].get("merge_bamprep", True)
    bam_file = data.get("work_bam" if use_merged else "callable_bam")
    if data["sam_ref"] is None or not bam_file or not has_aligned_reads(bam_file):
        return [[data]]
    logger.info("Generating summary files: {0}".format(str(data["name"])))
    data["summary"] = _run_qc_tools(bam_file, data)
    return [[data]]
Пример #25
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Build the GATK base recalibration covariate table (recalibration step 1).

    The table is written to ``<work_dir>/align/<sample>/<bam-base>-recal.grp``
    and its path is returned. An existing table is reused without re-running.
    If has_aligned_reads finds nothing for ``intervals``, a placeholder file
    is written instead of calling GATK.

    GATK 4 runs the Spark tool on a local multicore master:
    https://github.com/broadinstitute/gatk/issues/2345

    GATK 3 takes an excessively long time on large whole genome BAM files and
    the extra reads add little beyond a certain point, so large inputs are
    downsampled. See the 'Downsampling analysis' plots in the GATK docs:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    recal_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    bam_base = utils.splitext_plus(os.path.basename(dup_align_bam))[0]
    out_file = os.path.join(recal_dir, "%s-recal.grp" % bam_base)
    if utils.file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        params = ["-I", dup_align_bam]
        cores = dd.get_num_cores(data)
        # The transactional directory doubles as scratch space for GATK.
        tx_dir = os.path.dirname(tx_out_file)
        if gatk_type == "gatk4":
            params.extend(["-T", "BaseRecalibratorSpark",
                           "--sparkMaster", "local[%s]" % cores,
                           "--output", tx_out_file,
                           "--reference", dd.get_ref_twobit(data),
                           "--conf", "spark.local.dir=%s" % tx_dir])
        else:
            params.extend(["-T", "BaseRecalibrator",
                           "-o", tx_out_file,
                           "-R", ref_file])
            downsample_pct = bam.get_downsample_pct(dup_align_bam, target_counts, data)
            if downsample_pct:
                params.extend(["--downsample_to_fraction", str(downsample_pct),
                               "--downsampling_type", "ALL_READS"])
            if platform.lower() == "solid":
                params.extend(["--solid_nocall_strategy", "PURGE_READ",
                               "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
        if dbsnp_file:
            params.extend(["--knownSites", dbsnp_file])
        if intervals:
            params.extend(["-L", intervals, "--interval_set_rule", "INTERSECTION"])
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        broad_runner.run_gatk(params, tx_dir, memscale=memscale,
                              parallel_gc=True)
    return out_file
Пример #26
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals):
    """Create the covariate table for GATK recalibration (step 1).

    The table is written next to the input BAM with a ``.grp`` extension and
    its path is returned. An existing table is reused. When has_aligned_reads
    finds no reads for ``intervals``, a placeholder file is written instead
    of running GATK.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir, file_transaction(out_file) as tx_out_file:
        params = ["-T", "BaseRecalibrator",
                  "-o", tx_out_file,
                  "-I", dup_align_bam,
                  "-R", ref_file]
        downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
        if downsample_pct:
            params.extend(["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"])
        if platform.lower() == "solid":
            params.extend(["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
        # GATK-lite does not have support for
        # insertion/deletion quality modeling
        if broad_runner.gatk_type() == "lite":
            params.append("--disable_indel_quals")
        if dbsnp_file:
            params.extend(["--knownSites", dbsnp_file])
        if intervals:
            params.extend(["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"])
        broad_runner.run_gatk(params, tmp_dir)
    return out_file
Пример #27
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    ``out_file`` defaults to ``<first-bam-base>-paired-variants.vcf.gz``.
    An existing output file is reused without re-running anything.

    When the MuTect version string contains "appistry", MuTect output goes
    to a separate ``-mutect`` file, SomaticIndelDetector is run into a
    ``-somaticIndels`` file, and the two are merged into ``out_file`` with
    vcfutils.combine_variant_files.

    Returns the path of the output VCF. This includes the case where an
    empty VCF is written because ``region`` is a single region and not
    every input BAM has aligned reads in it.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf"
                               in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path rather than None so callers get the same
            # result as on every other exit from this function.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file,
                                           assoc_files, region,
                                           out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
Пример #28
0
def unified_genotyper(align_bam, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Genotype SNPs in an alignment file with GATK's UnifiedGenotyper.

    The reference and BAM are indexed through the Broad runner first.
    The calling/emitting confidence threshold is 4.0 when the
    ``coverage_depth`` algorithm option is "low" and 30.0 otherwise.
    ``out_file`` defaults to ``<bam-base>-variants.vcf``; an existing file
    is reused. If has_aligned_reads finds no reads in ``region``, a
    header-only VCF is written instead of running GATK.

    Returns the path of the output VCF.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    algorithm = config["algorithm"]
    coverage_depth = algorithm.get("coverage_depth", "high").lower()
    variant_regions = algorithm.get("variant_regions", None)
    confidence = "4.0" if coverage_depth == "low" else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(align_bam, region):
        with open(out_file, "w") as out_handle:
            out_handle.writelines([
                "##fileformat=VCFv4.1\n",
                "## No variants; no reads aligned in region\n",
                "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"])
        return out_file
    annotations = ["QualByDepth", "HaplotypeScore",
                   "MappingQualityRankSumTest", "ReadPosRankSumTest",
                   "FisherStrand", "RMSMappingQuality", "DepthOfCoverage"]
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "UnifiedGenotyper",
                  "-I", align_bam,
                  "-R", ref_file,
                  "-o", tx_out_file]
        for annotation in annotations:
            params.extend(["--annotation", annotation])
        params.extend([
            "--genotype_likelihoods_model", "BOTH",
            "--standard_min_confidence_threshold_for_calling", confidence,
            "--standard_min_confidence_threshold_for_emitting", confidence,
            "-l", "INFO"])
        if dbsnp:
            params.extend(["--dbsnp", dbsnp])
        if region:
            params.extend(["-L", region])
        if variant_regions:
            params.extend(["-L", variant_regions,
                           "--interval_set_rule", "INTERSECTION"])
        broad_runner.run_gatk(params)
    return out_file
Пример #29
0
def unified_genotyper(align_bams, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Genotype SNPs across the given alignment files with UnifiedGenotyper.

    The runner, base parameters and output path come from
    _shared_gatk_call_prep. An existing output file is reused. If any input
    BAM lacks aligned reads in ``region``, an empty VCF is written instead
    of running GATK.

    Returns the path of the output VCF.
    """
    broad_runner, params, out_file = _shared_gatk_call_prep(
        align_bams, ref_file, config, dbsnp, region, out_file)
    if file_exists(out_file):
        return out_file
    every_bam_has_reads = all(has_aligned_reads(x, region) for x in align_bams)
    if every_bam_has_reads:
        with file_transaction(out_file) as tx_out_file:
            params.extend(["-T", "UnifiedGenotyper",
                           "-o", tx_out_file,
                           "--genotype_likelihoods_model", "BOTH"])
            broad_runner.run_gatk(params)
    else:
        write_empty_vcf(out_file)
    return out_file
Пример #30
0
def unified_genotyper(align_bam, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Genotype SNPs in a single alignment file with UnifiedGenotyper.

    _shared_gatk_call_prep supplies the runner, the base parameters and the
    output path. An existing output is reused. When has_aligned_reads finds
    no reads in ``region``, an empty VCF is written instead of running GATK.

    Returns the path of the output VCF.
    """
    broad_runner, params, out_file = _shared_gatk_call_prep(
        align_bam, ref_file, config, dbsnp, region, out_file)
    if file_exists(out_file):
        return out_file
    if has_aligned_reads(align_bam, region):
        with file_transaction(out_file) as tx_out_file:
            genotyper_args = ["-T", "UnifiedGenotyper",
                              "-o", tx_out_file,
                              "--genotype_likelihoods_model", "BOTH"]
            params += genotyper_args
            broad_runner.run_gatk(params)
    else:
        write_empty_vcf(out_file)
    return out_file
Пример #31
0
def unified_genotyper(align_bam, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Genotype SNPs in an alignment file with GATK's UnifiedGenotyper.

    The reference and BAM are indexed through the Broad runner first.
    Confidence thresholds are 4.0 for "low" ``coverage_depth`` and 30.0
    otherwise. The calling interval is whatever subset_variant_regions
    returns for the configured ``variant_regions`` and ``region``.
    ``out_file`` defaults to ``<bam-base>-variants.vcf``; an existing file
    is reused, and a header-only VCF is written when no reads are aligned
    in ``region``.

    Returns the path of the output VCF.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth == "low" else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(align_bam, region):
        empty_vcf = ("##fileformat=VCFv4.1\n"
                     "## No variants; no reads aligned in region\n"
                     "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
        with open(out_file, "w") as out_handle:
            out_handle.write(empty_vcf)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "UnifiedGenotyper",
                  "-I", align_bam,
                  "-R", ref_file,
                  "-o", tx_out_file]
        for annotation in ("QualByDepth", "HaplotypeScore",
                           "MappingQualityRankSumTest", "ReadPosRankSumTest",
                           "FisherStrand", "RMSMappingQuality",
                           "DepthOfCoverage"):
            params += ["--annotation", annotation]
        params += ["--genotype_likelihoods_model", "BOTH",
                   "--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence,
                   "-l", "INFO"]
        if dbsnp:
            params += ["--dbsnp", dbsnp]
        call_region = subset_variant_regions(variant_regions, region, tx_out_file)
        if call_region:
            params += ["-L", call_region, "--interval_set_rule", "INTERSECTION"]
        broad_runner.run_gatk(params)
    return out_file
Пример #32
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
        dbsnp_file, intervals):
    """Produce the covariate table for GATK recalibration (step 1).

    Recalibrating large whole genome BAM files is very slow and the extra
    reads stop helping beyond a certain point, so big inputs are downsampled
    to a fraction computed from a target read count. See the 'Downsampling
    analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    The table is written beside the BAM with a ``.grp`` extension and its
    path is returned. An existing table is reused, and a placeholder file is
    written when no reads are aligned in ``intervals``.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    target_counts = 1e8 # 100 million reads per read group, 20x the plotted max
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "BaseRecalibrator"]
            params.extend(["-o", tx_out_file])
            params.extend(["-I", dup_align_bam])
            params.extend(["-R", ref_file])
            fraction = bam.get_downsample_pct(broad_runner, dup_align_bam, target_counts)
            if fraction:
                params.extend(["--downsample_to_fraction", str(fraction),
                               "--downsampling_type", "ALL_READS"])
            is_solid = platform.lower() == "solid"
            if is_solid:
                params.extend(["--solid_nocall_strategy", "PURGE_READ",
                               "--solid_recal_mode", "SET_Q_ZERO_BASE_N"])
            # GATK-lite does not have support for
            # insertion/deletion quality modeling
            if broad_runner.gatk_type() == "lite":
                params.append("--disable_indel_quals")
            if dbsnp_file:
                params.extend(["--knownSites", dbsnp_file])
            if intervals:
                params.extend(["-L", intervals, "--interval_set_rule", "INTERSECTION"])
            broad_runner.run_gatk(params, tmp_dir)
    return out_file
Пример #33
0
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform,
                           snp_file, intervals):
    """Count covariates with GATK CountCovariates (recalibration step 1).

    The recalibration table is written beside the BAM with a ``.recal``
    extension and its path is returned. An existing table is reused. When
    has_aligned_reads finds no reads in the BAM, a placeholder file is
    written instead of running GATK.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if file_exists(out_file):
        return out_file
    if not has_aligned_reads(dup_align_bam):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file
    covariates = ["ReadGroupCovariate", "QualityScoreCovariate",
                  "CycleCovariate", "DinucCovariate"]
    with curdir_tmpdir() as tmp_dir, file_transaction(out_file) as tx_out_file:
        params = ["-T", "CountCovariates"]
        for covariate in covariates:
            params.extend(["-cov", covariate])
        params.extend(["-recalFile", tx_out_file,
                       "-I", dup_align_bam,
                       "-R", ref_file,
                       "-l", "INFO",
                       "-U",
                       "-OQ",
                       "--default_platform", platform])
        if snp_file:
            params.extend(["--knownSites", snp_file])
        if intervals:
            params.extend(["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"])
        broad_runner.run_gatk(params, tmp_dir)
    return out_file
Пример #34
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    ``out_file`` defaults to ``<first-bam-base>-paired-variants.vcf``. An
    existing output file is reused without re-running MuTect.

    An empty VCF is produced instead of MuTect calls in two situations:
    ``region`` is a single region and not every input BAM has aligned reads
    in it, or MuTect fails with a Java exception listed in
    ``_PASS_EXCEPTIONS``. Any other MuTect failure is re-raised.

    Returns the path of the output VCF on every exit, including both
    empty-VCF cases.
    """

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):

            vcfutils.write_empty_vcf(out_file)
            # Return the path rather than None so callers get the same
            # result as from a successful run.
            return out_file

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            try:
                broad_runner.run_mutect(params)
            except CalledProcessError as error:
                java_exception = _parse_gatk_java_error_string(error.cmd)
                #HACK: Currently MuTect bails out on certain small BAM files
                # Until the issue is fixed by Broad, this specific exception
                # will be ignored. All the other exceptions will be raised
                # correctly.
                if java_exception in _PASS_EXCEPTIONS:
                    vcfutils.write_empty_vcf(tx_out_file)
                    # Leaving the ``with`` block normally still finishes the
                    # file transaction, so out_file is the right result.
                    return out_file
                else:
                    raise

    return out_file
Пример #35
0
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Genotype SNPs across the given alignment files with UnifiedGenotyper.

    ``out_file`` defaults to ``<first-bam-base>-variants.vcf``; an existing
    file is reused. When ``region`` is not a list/tuple and some input BAM
    has no aligned reads in it, an empty VCF is written instead of running
    GATK.

    Returns the path of the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"],
        region, out_file)
    # The aligned-read check is skipped when region is a list/tuple.
    multi_region = isinstance(region, (list, tuple))
    if multi_region or all(has_aligned_reads(x, region) for x in align_bams):
        with file_transaction(out_file) as tx_out_file:
            params.extend(["-T", "UnifiedGenotyper",
                           "-o", tx_out_file,
                           "--genotype_likelihoods_model", "BOTH"])
            broad_runner.run_gatk(params)
    else:
        vcfutils.write_empty_vcf(out_file)
    return out_file
Пример #36
0
def haplotype_caller(align_bam, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Call variants in an alignment file with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK; the runner is
    asserted to report it. The runner, base parameters and output path come
    from _shared_gatk_call_prep. An existing output is reused, and an empty
    VCF is written when no reads are aligned in ``region``.

    Returns the path of the output VCF.
    """
    broad_runner, params, out_file = _shared_gatk_call_prep(
        align_bam, ref_file, config, dbsnp, region, out_file)
    assert broad_runner.has_gatk_full(), \
        "Require full version of GATK 2.0 for haplotype based calling"
    if file_exists(out_file):
        return out_file
    if has_aligned_reads(align_bam, region):
        with file_transaction(out_file) as tx_out_file:
            params.extend(["-T", "HaplotypeCaller", "-o", tx_out_file])
            broad_runner.run_gatk(params)
    else:
        write_empty_vcf(out_file)
    return out_file
Пример #37
0
def haplotype_caller(align_bam, ref_file, config, dbsnp=None,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK; the runner is
    asserted to report it.

    ``align_bam`` may be a single BAM path or a list/tuple of BAM paths.
    It is normalized to a list before use, because the body works on a
    collection of BAMs. An existing output file is reused. If any input BAM
    has no aligned reads in ``region``, an empty VCF is written instead of
    running GATK.

    Returns the path of the output VCF.
    """
    # The body previously used an undefined ``align_bams`` name, which
    # raised NameError on every call. Derive it from the actual parameter.
    if isinstance(align_bam, (list, tuple)):
        align_bams = align_bam
    else:
        align_bams = [align_bam]
    broad_runner, params, out_file = \
        _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp,
                               region, out_file)
    assert broad_runner.has_gatk_full(), \
        "Require full version of GATK 2.0 for haplotype based calling"
    if not file_exists(out_file):
        if not all(has_aligned_reads(x, region) for x in align_bams):
            write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                params += ["-T", "HaplotypeCaller",
                           "-o", tx_out_file]
                broad_runner.run_gatk(params)
    return out_file
Пример #38
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions,
                            out_file):
    """Call variants with samtools in target_regions.

    Pipes the mpileup command from prep_mpileup through bcftools and
    ``vcfutils.pl varFilter`` into ``out_file``, bgzip-compressing the
    output when ``out_file`` ends in "gz". If none of the input BAMs has
    aligned reads in ``target_regions``, an empty VCF is written instead.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.

    Raises ValueError when bcftools is newer than 0.1.19 but samtools is
    0.1.19 or older.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams,
                           ref_file,
                           max_read_depth,
                           config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError(
                "samtools calling not supported with 0.1.19 samtools and 0.20 bcftools"
            )
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Use a distinct local name for the script path: binding ``vcfutils``
    # here would shadow the vcfutils module for the whole function and make
    # the write_empty_vcf call below fail on a string.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(
            realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit keywords instead of **locals(); the command is built
        # once and reused for logging and execution.
        cmd = cmd.format(mpileup=mpileup,
                         bcftools=bcftools,
                         bcftools_opts=bcftools_opts,
                         vcfutils=vcfutils_pl,
                         max_read_depth=max_read_depth,
                         compress_cmd=compress_cmd,
                         out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
Пример #39
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    ``out_file`` defaults to ``<first-bam-base>-paired-variants.vcf.gz``.
    An existing output file is reused without re-running anything.

    When the MuTect version string contains "appistry", MuTect output goes
    to a separate ``-mutect`` file, SomaticIndelDetector is run into a
    ``-somaticIndels`` file, and the two are merged into ``out_file`` with
    vcfutils.combine_variant_files.

    Returns the path of the output VCF. This includes the case where an
    empty VCF is written because ``region`` is a single region and not
    every input BAM has aligned reads in it.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path rather than None so callers get the same
            # result as on every other exit from this function.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Пример #40
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    ``out_file`` defaults to ``<first-bam-base>-paired-variants.vcf``. An
    existing output file is reused without re-running MuTect.

    An empty VCF is produced instead of MuTect calls in two situations:
    ``region`` is a single region and not every input BAM has aligned reads
    in it, or MuTect fails with a Java exception listed in
    ``_PASS_EXCEPTIONS``. Any other MuTect failure is re-raised.

    Returns the path of the output VCF on every exit, including both
    empty-VCF cases.
    """

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]

    if not file_exists(out_file):
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file)

        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):

            vcfutils.write_empty_vcf(out_file)
            # Return the path rather than None so callers get the same
            # result as from a successful run.
            return out_file

        with file_transaction(out_file) as tx_out_file:
            # Rationale: MuTect writes another table to stdout,
            # which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            try:
                broad_runner.run_mutect(params)
            except CalledProcessError as error:
                java_exception = _parse_gatk_java_error_string(error.cmd)
                #HACK: Currently MuTect bails out on certain small BAM files
                # Until the issue is fixed by Broad, this specific exception
                # will be ignored. All the other exceptions will be raised
                # correctly.
                if java_exception in _PASS_EXCEPTIONS:
                    vcfutils.write_empty_vcf(tx_out_file)
                    # Leaving the ``with`` block normally still finishes the
                    # file transaction, so out_file is the right result.
                    return out_file
                else:
                    raise

    return out_file
Пример #41
0
def write_recal_bam(data, region=None, out_file=None):
    """Write the recalibrated BAM from the covariates (GATK recalibration step 2).

    ``out_file`` defaults to ``<work_bam-base>-gatkrecal.bam``. For the
    special region "nochr" the BAM comes from write_nochr_reads; otherwise
    _run_recal_bam is run with the sample's ``prep_recal`` table.

    If the ``quality_bin`` algorithm option is True, "postrecal", or a list
    containing "postrecal", and the result has aligned reads, the BAM is
    replaced by its quality-binned version.

    Returns ``[data]`` with ``data["work_bam"]`` pointing at the new BAM.
    """
    config = data["config"]
    source_bam = data["work_bam"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(source_bam)[0]
    logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file))
    if region == "nochr":
        out_bam = write_nochr_reads(source_bam, out_file, config)
    else:
        out_bam = _run_recal_bam(source_bam, data["prep_recal"],
                                 region, data["sam_ref"], out_file, config)
    qual_bin = config["algorithm"].get("quality_bin", None)
    wants_postrecal_bin = (
        qual_bin is True
        or qual_bin == "postrecal"
        or (isinstance(qual_bin, list) and "postrecal" in qual_bin))
    if wants_postrecal_bin and has_aligned_reads(out_bam):
        binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"],
                                            os.path.dirname(out_bam), config)
        # Keep the unbinned BAM aside, then move the binned one into place.
        unbinned_copy = out_bam + ".binned"
        shutil.move(out_bam, unbinned_copy)
        shutil.move(binned_bam, out_bam)
        utils.save_diskspace(unbinned_copy,
                             "Quality binned to %s" % out_bam, config)
    data["work_bam"] = out_bam
    return [data]
Пример #42
0
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with GATK's HaplotypeCaller.

    Needs the full (non open-source) GATK distribution. If ``out_file``
    already exists nothing is run. If any input BAM has no aligned reads in
    ``region`` an empty VCF is written instead of invoking GATK.

    Returns the path of the output VCF, which defaults to
    ``<first BAM minus extension>-variants.vcf.gz``.
    """
    if out_file is None:
        bam_base = utils.splitext_plus(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if utils.file_exists(out_file):
        return out_file

    config = items[0]["config"]
    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files["dbsnp"], region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"

    if not all(has_aligned_reads(x, region) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "-o", tx_out_file,
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        # Enable hardware based optimizations in GATK 3.1+
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        if gatk_version >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
Пример #43
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect's raw calls are written to an ``-orig`` file and post-processed by
    ``_fix_mutect_output`` into a ``-mutect`` file. Depending on the
    configured indel caller (scalpel, pindel or SomaticIndelDetector) that
    file is either combined with a ``-somaticIndels`` file into ``out_file``
    or symlinked to ``out_file``.

    Args:
        align_bams: BAM files to call on; the first one names the default
            output.
        items: Sample dictionaries; configuration is read from ``items[0]``.
        ref_file: Reference file, forwarded to the callers and, for pindel,
            to the variant combination step.
        assoc_files: Forwarded unchanged to the call preparation helpers.
        region: Region to restrict calling to. Scalpel is only run when this
            is a non-empty tuple/list whose first element passes
            ``chromhacks.is_autosomal_or_sex``.
        out_file: Final output path; defaults to
            ``<first BAM minus extension>-paired-variants.vcf.gz``.

    Returns:
        The path of the final VCF. This now also holds for the
        "no aligned reads" case, which previously returned ``None``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    # NOTE(review): this runner is replaced by the one _mutect_call_prep
    # returns below; the call is kept in case it validates the configuration.
    broad_runner = broad.runner_from_config(config, "mutect")
    out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                       if "vcf" in out_file else out_file + "-mutect.vcf")
    broad_runner, params = _mutect_call_prep(align_bams, items, ref_file,
                                             assoc_files, region,
                                             out_file_mutect)
    if (not isinstance(region, (list, tuple))
            and not all(has_aligned_reads(x, region) for x in align_bams)):
        # Nothing to call on: emit an empty VCF and still hand the path back,
        # matching what a rerun (output already present) returns.
        vcfutils.write_empty_vcf(out_file)
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    if not file_exists(out_file_orig):
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    # A normal sample among the inputs marks a tumor/normal paired run.
    is_paired = "-I:normal" in params
    if not utils.file_uptodate(out_file_mutect, out_file_orig):
        out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                             out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(config).lower()
    # Every indel caller branch writes its calls to the same derived path.
    out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                       if "vcf" in out_file
                       else out_file + "-somaticIndels.vcf")
    if ("scalpel" in indelcaller and region
            and isinstance(region, (tuple, list))
            and chromhacks.is_autosomal_or_sex(region[0])):
        # Scalpel InDels
        if scalpel.is_installed(config):
            if not is_paired:
                vcfutils.check_paired_problems(items)
                scalpel._run_scalpel_caller(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            else:
                scalpel._run_scalpel_paired(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        from bcbio.structural import pindel
        if pindel.is_installed(config):
            pindel._run_tumor_pindel_caller(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file,
                                       assoc_files, region,
                                       out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=config,
            region=region)
    else:
        # No usable indel caller: the MuTect calls are the final result.
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Пример #44
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration: build the table of covariates.

    GATK 4 is run as a local multicore Spark job:
    https://github.com/broadinstitute/gatk/issues/2345

    With GATK3, large whole genome BAM files take an excessively long time to
    recalibrate and the extra inputs stop helping beyond a certain point; see
    the 'Downsampling analysis' plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    Large files are therefore identified and a downsampling fraction is
    computed for them.

    The Spark host and timeout settings help with runs on restricted systems
    where network and timeout errors show up.

    Returns the path of the recalibration table. When the BAM has no aligned
    reads in ``intervals`` a placeholder file is written instead.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    sample_dir = os.path.join(dd.get_work_dir(data), "align",
                              dd.get_sample_name(data))
    table_name = "%s-recal.grp" % utils.splitext_plus(
        os.path.basename(dup_align_bam))[0]
    out_file = os.path.join(sample_dir, table_name)

    if utils.file_exists(out_file):
        return out_file

    if not has_aligned_reads(dup_align_bam, intervals):
        with open(out_file, "w") as out_handle:
            out_handle.write("# No aligned reads")
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+ or GATK4 for BQSR"
        params = ["-I", dup_align_bam]
        cores = dd.get_num_cores(data)
        tx_dir = os.path.dirname(tx_out_file)
        if gatk_type == "gatk4":
            params += ["-T", "BaseRecalibratorSpark",
                       "--spark-master", "local[%s]" % cores,
                       "--output", tx_out_file,
                       "--reference", dd.get_ref_twobit(data),
                       "--conf", "spark.driver.host=localhost",
                       "--conf", "spark.network.timeout=800",
                       "--conf", "spark.executor.heartbeatInterval=100",
                       "--conf", "spark.local.dir=%s" % tx_dir]
            if dbsnp_file:
                params += ["--known-sites", dbsnp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-T", "BaseRecalibrator",
                       "-o", tx_out_file,
                       "-R", ref_file]
            downsample_pct = bam.get_downsample_pct(dup_align_bam,
                                                    target_counts, data)
            if downsample_pct:
                params += ["--downsample_to_fraction", str(downsample_pct),
                           "--downsampling_type", "ALL_READS"]
            if platform.lower() == "solid":
                params += ["--solid_nocall_strategy", "PURGE_READ",
                           "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
            if dbsnp_file:
                params += ["--knownSites", dbsnp_file]
            if intervals:
                params += ["-L", intervals,
                           "--interval_set_rule", "INTERSECTION"]
        # Memory scaling is only requested for multicore runs.
        memscale = None
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        broad_runner.run_gatk(params, tx_dir,
                              memscale=memscale,
                              parallel_gc=True)
    return out_file
Пример #45
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect's raw calls are written to an ``-orig`` file and post-processed by
    ``_fix_mutect_output`` into a ``-mutect`` file. Depending on the
    configured indel caller (scalpel, pindel or SomaticIndelDetector) that
    file is either combined with a ``-somaticIndels`` file into ``out_file``
    or symlinked to ``out_file``.

    Args:
        align_bams: BAM files to call on; the first one names the default
            output.
        items: Sample dictionaries; configuration is read from ``items[0]``.
        ref_file: Reference file, forwarded to the callers and, for pindel,
            to the variant combination step.
        assoc_files: Forwarded unchanged to the call preparation helpers.
        region: Region to restrict calling to.
        out_file: Final output path; defaults to
            ``<first BAM minus extension>-paired-variants.vcf.gz``.

    Returns:
        The path of the final VCF. This now also holds for the
        "no aligned reads" case, which previously returned ``None``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    # NOTE(review): this runner is replaced by the one _mutect_call_prep
    # returns below; the call is kept in case it validates the configuration.
    broad_runner = broad.runner_from_config(config, "mutect")
    out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                       if "vcf" in out_file else out_file + "-mutect.vcf")
    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):
        # Nothing to call on: emit an empty VCF and still hand the path back,
        # matching what a rerun (output already present) returns.
        vcfutils.write_empty_vcf(out_file)
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    with file_transaction(config, out_file_orig) as tx_out_file:
        # Rationale: MuTect writes another table to stdout, which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        broad_runner.run_mutect(params)
    # A normal sample among the inputs marks a tumor/normal paired run.
    is_paired = "-I:normal" in params
    out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(config).lower()
    # Every indel caller branch writes its calls to the same derived path.
    out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                       if "vcf" in out_file else out_file + "-somaticIndels.vcf")
    if "scalpel" in indelcaller:
        # Scalpel InDels
        if scalpel.is_installed(config):
            with file_transaction(config, out_file_indels) as tx_out_file2:
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=tx_out_file2)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=tx_out_file2)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        if pindel.is_installed(config):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=ref_file,
                                                      config=config,
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                  out_file=out_file,
                                                  ref_file=items[0]["sam_ref"],
                                                  config=config,
                                                  region=region)
    else:
        # No usable indel caller: the MuTect calls are the final result.
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file