Пример #1
0
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Пример #2
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for a in annotation.get_gatk_annotations(items[0]["config"]):
                params += ["--annotation", a]
            paired = vcfutils.get_paired_bams(align_bams, items)
            params += _add_tumor_params(paired)
            params += _add_region_params(region, out_file, items)
            params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner = broad.runner_from_config(items[0]["config"])
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            cmd = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "MuTect2")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Пример #3
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    params += standard_cl_params(items)
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Пример #4
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Пример #5
0
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Пример #6
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Пример #7
0
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", "250",
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Пример #8
0
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Пример #9
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    # GATK can only downsample to a minimum of 200
    coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000))
    coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4)
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth_min < 4 else "30.0"
    region = subset_variant_regions(variant_regions, region, out_file, items)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              "--downsample_to_coverage", str(coverage_depth_max),
              "--downsampling_type", "BY_SAMPLE",
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params
Пример #10
0
def _shared_gatk_call_prep(align_bams, ref_file, config, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    coverage_depth = config["algorithm"].get("coverage_depth", "high").lower()
    variant_regions = config["algorithm"].get("variant_regions", None)
    confidence = "4.0" if coverage_depth in ["low"] else "30.0"
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    region = subset_variant_regions(variant_regions, region, out_file)

    params = ["-R", ref_file,
              "--standard_min_confidence_threshold_for_calling", confidence,
              "--standard_min_confidence_threshold_for_emitting", confidence,
              ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if region:
        params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
    return broad_runner, params, out_file
Пример #11
0
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "-R",
                ref_file, "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC"
            ]
            for a in annotation.get_gatk_annotations(
                    items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("mutect2",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params,
                                            os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(
                    tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(
                    tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file,
                                             tx_raw_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Пример #12
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--reference", ref_file]
            else:
                params += ["-R", ref_file]
            for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Пример #13
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling",
            confidence,
            "--standard_min_confidence_threshold_for_emitting",
            confidence,
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        bam.index(x, config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError(
            "Specified MuTect calling but 'tumor' phenotype not present in batch\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "pipelines.html#cancer-variant-calling\n"
            "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items])
        )
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Пример #14
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        bam.index(x, config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Пример #15
0
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    if dd.is_set_coverage_depth_max(data):
        coverage_depth_max = dd.get_coverage_depth_max(data)
        # GATK can only downsample to a minimum of 200
        coverage_depth_max = max([200, coverage_depth_max])
        params += [
            "--downsample_to_coverage",
            str(coverage_depth_max), "--downsampling_type", "BY_SAMPLE"
        ]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence,
            "--standard_min_confidence_threshold_for_emitting", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    return broad_runner, params
Пример #16
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.
    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PURECN requirement alters Mutect2 variants calling!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"], items[0])
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file), germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                tx_vcf = os.path.splitext(tx_out_file)[0]
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                # T/N pair
                if len(items) == 2:
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("You are running mutect2 in PureCN analysis in T/N mode, T-only + PON is recommended")
                else: #T only
                    params += ["-I", input_bam]
                    # adding SNV PON from background/variant
                    snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                    if snv_pon and dd.get_batch(items[0]) != "pon_build":
                        params += ["-pon", snv_pon]
                        params += ["--genotype-pon-sites"]

                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50, sometimes 100 or 200 is recommended for better sensitivity in detection
                # hom del CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]

                params += ["--max-mnp-distance", "0",
                           "--interval-padding", interval_padding,
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]

                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf, out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
                # no AF filter for PureCN variants
                out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                if all(is_paired(bam) for bam in align_bams) and (
                        "mutect2_readmodel" in utils.get_in(items[0], "config", "tools_on")):
                    orientation_filter = True
                else:
                    orientation_filter = False

                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]

                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                #params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                if gatk_type == "gatk4":
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)

                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                        tx_f1r2_file = tx_f1r2_file.format(utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner,
                                                                  f1r2_file,
                                                                  tx_f1r2_file)

                        filter_cmd = _mutect2_filter(broad_runner, items,
                                                     tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file,
                                                     tx_f1r2_file)
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items,
                                                     tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                    if orientation_filter:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                    else:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])