def _fix_gatk_header(exist_files, out_file, config):
    """Reheader the first VCF so concatenation sees consistent headers.

    Genomes that start with chrM produce a haploid first file, which lacks the
    PID phasing key/value pair in FORMAT. Concatenation does not merge headers,
    so such a first file causes errors. When the first region is haploid, the
    header is borrowed from the first later file whose region has ploidy above
    one; otherwise the first file's own header is reused. The header is applied
    to the first file with picard FixVcfHeader.

    exist_files -- sequence of (contig, vcf_file) pairs in concatenation order
    out_file -- final merged output; only its directory is used here
    config -- configuration used for ploidy lookup, picard options and indexing

    Returns the reheadered first file followed by the remaining input files.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    ploidy_items = [{"config": config}]
    replace_file = base_file
    if ploidy.get_ploidy(ploidy_items, region=(first_contig, 1, 2)) == 1:
        # Lazily scan the remaining files and stop at the first non-haploid one
        replace_file = next(
            (fname for contig, fname in exist_files[1:]
             if ploidy.get_ploidy(ploidy_items, (contig, 1, 2)) > 1),
            base_file)
    fix_name = "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file))
    base_fix_file = os.path.join(os.path.dirname(out_file), fix_name)
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s" % (replace_file, header_file),
               "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = []
        if "options" in resources:
            ropts.extend(str(opt) for opt in resources.get("options", []))
        # NOTE(review): OUTPUT targets base_fix_file rather than tx_out_file, so
        # the transaction location only receives the header -- confirm intent.
        picard_cmd = "%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" % (
            utils.get_java_clprep(), header_file, base_file, base_fix_file,
            " ".join(ropts))
        do.run(picard_cmd, "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [fname for _, fname in exist_files[1:]]
def _fix_gatk_header(exist_files, out_file, config):
    """Give the first VCF a header compatible with the files that follow it.

    A genome starting with chrM yields a haploid first file without the PID
    phasing key/value pair in FORMAT; since concatenation does not merge
    headers, that breaks the merge. If the first region is haploid, the header
    of the first subsequent non-haploid file is used as the replacement,
    otherwise the first file keeps its own header. picard FixVcfHeader writes
    the reheadered copy.

    exist_files -- sequence of (contig, vcf_file) pairs in concatenation order
    out_file -- final merged output; only its directory is used here
    config -- configuration used for ploidy lookup and indexing

    Returns the reheadered first file plus the untouched remaining files.
    """
    from bcbio.variation import ploidy
    items = [{"config": config}]
    lead_contig, base_file = exist_files[0]
    remaining = exist_files[1:]

    replace_file = base_file
    if ploidy.get_ploidy(items, region=(lead_contig, 1, 2)) == 1:
        for contig, candidate in remaining:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = candidate
                break

    out_dir = os.path.dirname(out_file)
    base_fix_file = os.path.join(
        out_dir, "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        tx_base = utils.splitext_plus(tx_out_file)[0]
        header_file = "%s-header.vcf" % tx_base
        extract_cmd = "zgrep ^# %s > %s" % (replace_file, header_file)
        do.run(extract_cmd, "Prepare header file for merging")
        reheader_cmd = "%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s" % (
            utils.get_java_clprep(), header_file, base_file, base_fix_file)
        do.run(reheader_cmd, "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [fname for _, fname in remaining]
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first
    file. Haploid calls lack the PID phasing key/value pair in FORMAT, so an
    initial chrM file causes errors during concatenation because headers are
    not merged. When the first region is haploid, the header comes from the
    first later file whose region has ploidy above one; otherwise the first
    file's own header is reused. The header is applied with `bcftools reheader`.

    exist_files -- sequence of (contig, vcf_file) pairs in concatenation order
    out_file -- final merged output; only its directory is used here
    config -- configuration used for ploidy lookup, locating bcftools and indexing

    Returns the reheadered first file followed by the remaining input files.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(first_contig, 1, 2)) == 1:
        for contig, fname in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = fname
                break
    base_fix_file = os.path.join(
        os.path.dirname(out_file),
        "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s" % (replace_file, header_file),
               "Prepare header file for merging")
        # The picard resource options that used to be collected here were never
        # passed to bcftools, so that dead lookup has been dropped.
        bcftools = config_utils.get_program("bcftools", config)
        cmd = f"{bcftools} reheader --header {header_file} --output {tx_out_file} {base_file}"
        message = f"Reheader {base_file} with header from {replace_file}."
        do.run(cmd, message)
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [fname for _, fname in exist_files[1:]]
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.

    Runs `cnvkit.py call` on out["cns"] (skipped when the call file already
    exists), then `cnvkit.py export` to BED and VCF (each skipped when already
    present), and annotates the results.

    out -- dictionary with a "cns" entry; updated in place with "call_file",
           "vrn_bed" and "vrn_file"
    data -- sample the calls belong to
    items -- batch samples, used to find compatible small variant files
    is_somatic -- when False, the clonal method is requested if small variants
                  are supplied

    Returns the updated `out` dictionary.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    # Gender is needed by both the call and the export steps. Look it up before
    # the call step, which is skipped on reruns; previously the export step then
    # hit an unbound `gender`.
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                  ["--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["--vcf", small_vrn_files[0].name,
                        "--sample-id", small_vrn_files[0].sample]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"),
                       "export", outformat,
                       "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    # Fall back to the plain VCF when effects annotation produced nothing
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK UnifiedGenotyper SNP and indel calling on the alignment files.

    The output defaults to "<first BAM base>-variants.vcf" and is left alone if
    it already exists. When `region` is not a list or tuple and any BAM has no
    aligned reads for it, an empty VCF is written instead of running GATK.
    Ploidy comes from the configuration for the restricted GATK and is fixed
    at 2 otherwise.

    Returns the path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file)
    region_is_coords = isinstance(region, (list, tuple))
    if not region_is_coords and not all(has_aligned_reads(bam, region) for bam in align_bams):
        vcfutils.write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "-ploidy", ploidy_arg,
                   "--genotype_likelihoods_model", "BOTH"]
        broad_runner.run_gatk(params)
    return out_file
def _get_ploidy(regions, items, base_file):
    """Write a VCF describing the expected copy number per region and sample.

    One <CNV> record is written per region, with a CN FORMAT value for each
    sample taken from `ploidy.get_ploidy`. The file is named
    "<base_file base>-ploidy.vcf" and is not rewritten when it, or its ".gz"
    counterpart, already exists.

    regions -- iterable of (chrom, start, end) style entries
    items -- samples; one column is written per sample name
    base_file -- path used to derive the output name

    Returns the result of bgzipping and indexing the ploidy VCF.
    """
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    samples = [dd.get_sample_name(d) for d in items]
    have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not have_output:
        header_lines = [
            "##fileformat=VCFv4.1\n",
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n',
            '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n',
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(samples) + "\n",
        ]
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as out_handle:
                for line in header_lines:
                    out_handle.write(line)
                for region in regions:
                    fixed_fields = [region[0], str(region[1]), ".", "N", "<CNV>", ".", ".",
                                    "END=%s" % region[2], "CN"]
                    sample_fields = [str(ploidy.get_ploidy([d], region)) for d in items]
                    out_handle.write("\t".join(fixed_fields + sample_fields) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    A default `--min-alternate-fraction` is added from the
    `algorithm: min_allele_fraction` setting (a percentage, default 20) unless
    the user-configured freebayes options already set it via
    `--min-alternate-fraction` or `-F`.

    items -- samples being called together
    config -- configuration holding algorithm settings and freebayes resources
    out_file -- output file, used when subsetting variant regions
    region -- optional region to restrict calling to

    Returns the list of FreeBayes command line options.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        # NOTE(review): `basestring` exists only on Python 2 -- confirm the
        # module provides an alias when running on Python 3.
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    if user_opts:
        opts += user_opts
    # Only the user-supplied options can carry the flag. Matching whole tokens
    # avoids the earlier substring search over every option, which treated a
    # target path or region containing "-F" as if the flag had been given.
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    if not any(tok.startswith(("--min-alternate-fraction", "-F")) for tok in user_tokens):
        # add minimum reportable allele frequency, for which FreeBayes defaults to 20
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """Run GATK4 GenotypeGVCFs from a merged GenomicsDB input.

    GATK is only run when `out_file` does not exist yet. User options from the
    "gatk" resources are appended to the command, and memory is scaled up by
    0.9 times the core count when more than one core is available.

    Returns the result of bgzipping and indexing `out_file`.
    """
    if not utils.file_exists(out_file):
        config = data["config"]
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(config)
            # see issue https://github.com/bcbio/bcbio-nextgen/issues/3263
            # for why --genomicsdb-use-vcf-codec is necessary
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--genomicsdb-use-vcf-codec",
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            params.extend(["-ploidy", str(ploidy.get_ploidy([data], region))])
            resources = config_utils.get_resources("gatk", config)
            params.extend(str(opt) for opt in resources.get("options", []))
            cores = dd.get_cores(data)
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK UnifiedGenotyper SNP and indel calling on the alignment files.

    The output defaults to "<first BAM base>-variants.vcf.gz"; nothing is run
    when it already exists. Ploidy is taken from the configuration for the
    restricted GATK and fixed at 2 otherwise. User options from the "gatk"
    resources are appended to the command.

    Returns the path to the output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    with file_transaction(items[0], out_file) as tx_out_file:
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params += ["-T", "UnifiedGenotyper",
                   "-o", tx_out_file,
                   "-ploidy", ploidy_arg,
                   "--genotype_likelihoods_model", "BOTH"]
        resources = config_utils.get_resources("gatk", items[0]["config"])
        if "options" in resources:
            params.extend(str(opt) for opt in resources.get("options", []))
        broad_runner.run_gatk(params)
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    Requires the restricted (full) GATK or GATK4. With GATK4 and more than one
    core, HaplotypeCallerSpark is used. gVCF output is emitted when joint
    calling is requested or "gvcf" is in a sample's tools_on.

    `-ploidy` is passed for GATK 3.3+ except for GATK4 joint/gVCF calling,
    because GenomicsDB does not support non-diploid samples there.

    align_bams -- input BAM files; the first names the default output
    items -- samples being called together
    ref_file -- reference genome
    assoc_files -- associated resource files (not used by this version)
    region -- optional region to restrict calling to
    out_file -- output path; defaults to "<first BAM base>-variants.vcf.gz"

    Returns the result of bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--spark-master", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # Only that combination is skipped. The earlier un-parenthesised
                # `not is_joint and gatk_type == "gatk4"` also dropped -ploidy
                # for every GATK 3.x run.
                if not (is_joint and gatk_type == "gatk4"):
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes command line options from the configuration.

    BED target files are merged first so overlapping regions do not make
    FreeBayes call the same positions several times. For whole genome
    coverage, high depth regions are removed from the targets; if nothing is
    left the second return value tells the caller to skip the FreeBayes run.

    Returns a tuple of (options list, no_target_regions flag).
    """
    opts = ["--genotype-qualities",
            "--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    variant_regions = bedutils.merge_overlaps(configured_regions, items[0])
    # Produce gVCF output
    wants_gvcf = any("gvcf" in dd.get_tools_on(d) for d in items)
    if wants_gvcf:
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        # NOTE(review): `basestring` exists only on Python 2 -- confirm the
        # module provides an alias when running on Python 3.
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if not target_is_file:
            opts.extend(["--region", region_to_freebayes(target)])
        else:
            genome_coverage = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if genome_coverage:
                target = shared.remove_highdepth_regions(target, items)
                no_target_regions = no_target_regions or os.path.getsize(target) == 0
            opts.extend(["--targets", target])
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    Requires the full GATK 3.5+; an assertion checks the major version. GATK
    output is piped through the post-processing command and bgzip into the
    transactional output. Nothing is run if the output already exists.

    Returns the result of bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    config = items[0]["config"]
    if not utils.file_exists(out_file):
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for extra_annotation in annotation.get_gatk_annotations(config):
                params.extend(["--annotation", extra_annotation])
            paired = vcfutils.get_paired_bams(align_bams, items)
            params.extend(_add_tumor_params(paired))
            params.extend(_add_region_params(region, out_file, items))
            params.extend(_add_assoc_params(assoc_files))
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                params.extend(str(opt) for opt in resources.get("options", []))
            broad_runner = broad.runner_from_config(config)
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = " ".join(broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)))
            pp_cmd = _post_process_cl(paired)
            pipeline = "{gatk_cmd} | {pp_cmd} | bgzip -c > {tx_out_file}".format(
                gatk_cmd=gatk_cmd, pp_cmd=pp_cmd, tx_out_file=tx_out_file)
            do.run(pipeline, "MuTect2")
    return vcfutils.bgzip_and_index(out_file, config)
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Assemble the standard FreeBayes options for a calling run.

    Population variant regions are merged so overlapping BED entries do not
    trigger repeated calls. With whole genome coverage, high depth regions are
    filtered out of the targets; an empty result is reported through the
    second return value so the caller can skip FreeBayes.

    Returns a tuple of (options list, no_target_regions flag).
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    opts.extend(["--ploidy", str(ploidy.get_ploidy(items, region))])
    population_regions = bedutils.population_variant_regions(items)
    variant_regions = bedutils.merge_overlaps(population_regions, items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        # NOTE(review): `basestring` exists only on Python 2 -- confirm the
        # module provides an alias when running on Python 3.
        if isinstance(target, basestring) and os.path.isfile(target):
            coverage_intervals = (
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower()
                for x in items)
            if any(interval == "genome" for interval in coverage_intervals):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            target_opts = ["--targets", target]
        else:
            target_opts = ["--region", region_to_freebayes(target)]
        opts.extend(target_opts)

    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK HaplotypeCaller on the alignment files.

    Requires the restricted (full) GATK or GATK4; an assertion enforces this.
    Ploidy is passed for GATK 3.3+, and gVCF output with GQ banding is
    requested for joint calling or when "gvcf" is in a sample's tools_on.
    Nothing is run if the output already exists.

    Returns the path to the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    gatk_type = broad_runner.gatk_type()
    assert gatk_type in ["restricted", "gatk4"], \
        "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
    is_gatk4 = gatk_type == "gatk4"
    with file_transaction(items[0], out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        params += ["--output" if is_gatk4 else "-o", tx_out_file]
        # Enable hardware based optimizations in GATK 3.1+
        # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
        if (LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1")
                and not is_gatk4):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        # Enable non-diploid calling in GATK 3.3+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        # Prepare gVCFs if doing joint calling
        if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
            params += ["--emitRefConfidence", "GVCF",
                       "--variant_index_type", "LINEAR",
                       "--variant_index_parameter", "128000"]
            # Set GQ banding to not be single GQ resolution
            # No recommended default but try to balance resolution and size
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            for gq_boundary in (10, 20, 30, 40, 60, 80):
                params.extend(["-GQB", str(gq_boundary)])
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params.extend(str(opt) for opt in resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK HaplotypeCaller on the alignment files.

    Requires the restricted (full) GATK; an assertion enforces this. Ploidy is
    passed for GATK 3.3+ and gVCF output is requested for joint calling.
    Nothing is run if the output already exists.

    Returns the path to the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params.extend(["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"])
        # Enable hardware based optimizations in GATK 3.1+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
            params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
        # Enable non-diploid calling in GATK 3.3+
        if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        if _joint_calling(items):
            # Prepare gVCFs if doing joint calling
            params.extend(["--emitRefConfidence", "GVCF",
                           "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"])
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params.extend(str(opt) for opt in resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    Requires the restricted (full) GATK or GATK4. With GATK4 and more than one
    core, HaplotypeCallerSpark is used. gVCF output is emitted when joint
    calling is requested or "gvcf" is in a sample's tools_on.

    `-ploidy` is passed for GATK 3.3+ except for GATK4 joint/gVCF calling,
    because GenomicsDB does not support non-diploid samples there.

    align_bams -- input BAM files; the first names the default output
    items -- samples being called together
    ref_file -- reference genome
    assoc_files -- associated resource files (not used by this version)
    region -- optional region to restrict calling to
    out_file -- output path; defaults to "<first BAM base>-variants.vcf.gz"

    Returns the result of bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--sparkMaster", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                params += ["--emitRefConfidence", "GVCF"]
                if not gatk_type == "gatk4":
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # Only that combination is skipped. The earlier un-parenthesised
                # `not is_joint and gatk_type == "gatk4"` also dropped -ploidy
                # for every GATK 3.x run.
                if not (is_joint and gatk_type == "gatk4"):
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2 (GATK 3.5+) or Mutect2 (GATK4).

    An assertion checks that the GATK major version is at least 3.5. With
    GATK4 the raw calls are written with -O and passed through the Mutect2
    filter command; otherwise GATK output is redirected to a raw file. The raw
    calls are then allele-frequency filtered into the output. Nothing is run
    if the output already exists.

    Returns the result of bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    config = items[0]["config"]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(config)
        gatk_type = broad_runner.gatk_type()
        is_gatk4 = gatk_type == "gatk4"
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "Mutect2" if is_gatk4 else "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for extra_annotation in annotation.get_gatk_annotations(
                    config, include_baseqranksum=False):
                params.extend(["--annotation", extra_annotation])
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if is_gatk4:
                params.extend(["--read-validation-stringency", "LENIENT"])
            params.extend(_add_tumor_params(paired, items, gatk_type))
            params.extend(_add_region_params(region, out_file, items, gatk_type))
            # dbSNP/Cosmic association parameters are deliberately not added so
            # they do not get fed to the variant filtering algorithm.
            # Not yet clear how this helps or hurts in a general case.
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
            resources = config_utils.get_resources("mutect2", config)
            if "options" in resources:
                params.extend(str(opt) for opt in resources.get("options", []))
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            if is_gatk4:
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file)
                full_cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                    gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                    filter_cmd=filter_cmd)
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                full_cmd = "{gatk_cmd} > {tx_raw_file}".format(
                    gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
            do.run(full_cmd, "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, config)
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    For mitochondrial regions whose ploidy is at least the base ploidy, a
    sensitive `--min-alternate-fraction 0.01` is added unless the
    user-configured freebayes options already set that flag.

    items -- samples being called together
    config -- configuration holding the freebayes resources
    out_file -- output file, used when subsetting variant regions
    region -- optional region to restrict calling to

    Returns a tuple of (options list, no_target_regions flag).
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Read the user options up front. The earlier check looked for the flag in
    # `opts` before the user options were appended, so it could never find it.
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    user_sets_min_af = any(tok.startswith(("--min-alternate-fraction", "-F"))
                           for tok in user_tokens)
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy and not user_sets_min_af):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        # NOTE(review): `basestring` exists only on Python 2 -- confirm the
        # module provides an alias when running on Python 3.
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if user_opts:
        opts += user_opts
    return opts, no_target_regions
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Run CNVkit call and export, adding VCF and BED calls to the output.

    `cnvkit.py call` is run on out["cns"] unless the call file exists, then
    `cnvkit.py export` writes BED and VCF versions that are not yet present.
    The BED is annotated with genes and the VCF with snpEff effects.

    out -- dictionary with a "cns" entry; updated in place with "call_file",
           "vrn_bed" and "vrn_file"
    data -- sample the calls belong to
    is_somatic -- when False, the clonal method is requested if small variants
                  are supplied

    Returns the updated `out` dictionary.
    """
    cnvkit = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    is_male = bool(gender) and gender.lower() == "male"
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [cnvkit, "call", "--filter", "cn",
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd.extend(["-v", small_vrn_files[0]])
                if not is_somatic:
                    cmd.extend(["-m", "clonal"])
            if gender and gender.lower() != "unknown":
                cmd.extend(["--gender", gender])
                if is_male:
                    cmd.append("--male-reference")
            do.run(cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_out_file:
            cmd = [cnvkit, "export", outformat,
                   "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_out_file, call_file]
            if is_male:
                cmd.append("--male-reference")
            do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    # Fall back to the plain VCF when effects annotation produced nothing
    out["vrn_file"] = effects_vcf if effects_vcf else calls["vcf"]
    return out
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Run CNVkit call and export, adding VCF and BED calls to the output.

    `cnvkit.py call` is run on out["cns"] unless the call file exists, then
    `cnvkit.py export` writes BED and VCF versions that are not yet present.
    The BED is annotated with genes; the VCF with snpEff effects and depth.

    out -- dictionary with a "cns" entry; updated in place with "call_file",
           "vrn_bed" and "vrn_file"
    data -- sample the calls belong to
    items -- batch samples, used for small variants, gender and depth annotation
    is_somatic -- when False, the clonal method is requested if small variants
                  are supplied

    Returns the updated `out` dictionary.
    """
    cnvkit = os.path.join(os.path.dirname(sys.executable), "cnvkit.py")
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [cnvkit, "call", "--filter", "cn",
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                first_vrn = small_vrn_files[0]
                cmd.extend(["--vcf", first_vrn.name, "--sample-id", first_vrn.sample])
                if first_vrn.normal:
                    cmd.extend(["--normal-id", first_vrn.normal])
                if not is_somatic:
                    cmd.extend(["-m", "clonal"])
            gender = _get_batch_gender(items)
            if gender:
                cmd.extend(["--sample-sex", gender])
            do.run(cmd, "CNVkit call ploidy")
    call_base = os.path.splitext(call_file)[0]
    calls = {}
    for outformat in ("bed", "vcf"):
        export_file = "%s.%s" % (call_base, outformat)
        calls[outformat] = export_file
        if os.path.exists(export_file):
            continue
        with file_transaction(data, export_file) as tx_out_file:
            cmd = [cnvkit, "export", outformat,
                   "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_out_file, call_file]
            do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    # Fall back to the plain VCF when effects annotation produced nothing
    annotated_vcf = effects_vcf if effects_vcf else calls["vcf"]
    out["vrn_file"] = annotated_vcf
    out["vrn_file"] = shared.annotate_with_depth(out["vrn_file"], items)
    return out
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.

    Returns a tuple of ``(opts, no_target_regions)``: the list of command
    line options and a flag which is True when the target BED file is empty.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Look up user supplied options up front. They get appended at the very
    # end, so checking `opts` at this point can never see a user defined
    # minimum alternate fraction; inspect the user options directly instead.
    resources = config_utils.get_resources("freebayes", config)
    user_tokens = [tok for opt in (resources.get("options") or [])
                   for tok in str(opt).split()]
    user_sets_min_af = any(tok.startswith(("--min-alternate-fraction", "-F"))
                           for tok in user_tokens)
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
            and cur_ploidy >= base_ploidy and not user_sets_min_af):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
            if os.path.getsize(target) == 0:
                no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def cnv_to_event(name, data):
    """Translate a ``cnvN`` style name into a DEL/DUP event name.

    The largest copy number encoded in the name is compared against the
    sample ploidy. Names not starting with ``cnv``, and copy numbers equal
    to the ploidy, are returned unchanged.
    """
    cur_ploidy = ploidy.get_ploidy([data])
    if not name.startswith("cnv"):
        return name
    # The part before the first "_" holds one or more ";"-separated copy numbers
    copy_numbers = name.split("_")[0].replace("cnv", "").split(";")
    num = max(int(x) for x in copy_numbers)
    if num < cur_ploidy:
        return "DEL"
    if num > cur_ploidy:
        return "DUP"
    return name
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK HaplotypeCaller on the input BAMs, returning the output VCF.

    Needs the restricted (full, non open-source) GATK distribution. An
    existing output file is returned as-is without re-running.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    assert broad_runner.gatk_type() == "restricted", "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params.extend(["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"])
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        # Hardware based optimizations are available from GATK 3.1
        if gatk_version >= LooseVersion("3.1"):
            params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
        # Non-diploid calling is available from GATK 3.3
        if gatk_version >= LooseVersion("3.3"):
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        # Emit gVCFs for joint calling or when explicitly requested via tools_on
        wants_gvcf = _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items)
        if wants_gvcf:
            params.extend(["--emitRefConfidence", "GVCF",
                           "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"])
            # Band GQ values rather than using single GQ resolution.
            # No recommended default; tries to balance resolution and size
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            for boundary in (10, 20, 30, 40, 60, 80):
                params.extend(["-GQB", str(boundary)])
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params.extend(str(x) for x in resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def _freebayes_options_from_config(items, aconfig, out_file, region=None):
    """Build FreeBayes options for ploidy and target regions.

    ``aconfig`` is read for its ``variant_regions`` entry. A target that is
    an existing file is passed with ``--targets``; any other non-empty
    target is converted with ``region_to_freebayes`` and passed with
    ``--region``.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    target = subset_variant_regions(aconfig.get("variant_regions", None), region, out_file)
    if not target:
        return opts
    if isinstance(target, basestring) and os.path.isfile(target):
        opts.extend(["--targets", target])
    else:
        opts.extend(["--region", region_to_freebayes(target)])
    # NOTE: no `call_background` file is passed via --variant-input here.
    return opts
def _get_ploidy(regions, items, base_file):
    """Write a VCF with one ``<CNV>`` record per region giving sample ploidy.

    Each record carries the region end in ``END`` and, per sample, the
    ploidy for that region in the ``CN`` FORMAT field. The file is only
    written when neither the plain nor the ``.gz`` output exists; the
    result of bgzipping and indexing the output is returned.
    """
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    sample_names = [dd.get_sample_name(d) for d in items]
    already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_done:
        fixed_columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as out_handle:
                out_handle.write("##fileformat=VCFv4.1\n")
                out_handle.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
                out_handle.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
                out_handle.write(fixed_columns + "\t".join(sample_names) + "\n")
                for region in regions:
                    record = [region[0], str(region[1]), ".", "N", "<CNV>", ".", ".",
                              "END=%s" % region[2], "CN"]
                    # One CN value per sample, in header order
                    record.extend(str(ploidy.get_ploidy([d], region)) for d in items)
                    out_handle.write("\t".join(record) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes options from ploidy, target regions and user resources.

    A target that is an existing file is passed with ``--targets``; any
    other non-empty target is converted with ``region_to_freebayes`` and
    passed with ``--region``. User options configured under the
    ``freebayes`` resources are appended last.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(configured_regions, region, out_file)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if target_is_file:
            opts.extend(["--targets", target])
        else:
            opts.extend(["--region", region_to_freebayes(target)])
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts.extend(resources["options"])
    return opts
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.

    When ``out_file`` is not given it is derived from the first BAM as
    ``<base>-variants.vcf.gz``. Calling is skipped if the output already
    exists. The raw calls are passed through ``_af_filter`` and the result
    of bgzipping and indexing the final file is returned.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            # The tool is named "Mutect2" in GATK4 and "MuTect2" in earlier versions
            params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                      "-R", ref_file,
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # User supplied options come last on the command line
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            # NOTE: the command templates below are filled in from local
            # variables via `locals()`; renaming gatk_cmd, filter_cmd,
            # tx_raw_file or tx_raw_prefilt_file breaks the formatting.
            if gatk_type == "gatk4":
                # GATK4: write raw calls with -O, then chain the filtering command
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                # Earlier GATK: redirect standard output into the raw file
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK HaplotypeCaller on the input BAMs, returning the output VCF.

    Needs the restricted (full, non open-source) GATK distribution. An
    existing output file is returned as-is without re-running.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params.extend(["-T", "HaplotypeCaller",
                       "-o", tx_out_file,
                       "--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"])
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        # Hardware based optimizations are available from GATK 3.1
        if gatk_version >= LooseVersion("3.1"):
            params.extend(["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"])
        # Non-diploid calling is available from GATK 3.3
        if gatk_version >= LooseVersion("3.3"):
            params.extend(["-ploidy", str(ploidy.get_ploidy(items, region))])
        # Joint calling needs gVCF output from each sample
        if _joint_calling(items):
            params.extend(["--emitRefConfidence", "GVCF",
                           "--variant_index_type", "LINEAR",
                           "--variant_index_parameter", "128000"])
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params.extend(str(x) for x in resources.get("options", []))
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def _prep_genome(out_dir, data):
    """Create prepped reference directory for pisces.

    Symlinks the reference file into a genome-named sub-directory of
    ``out_dir`` and writes a custom ``GenomeSize.xml`` there, with one
    ``chromosome`` entry (name, length, ploidy and MD5) per sequence in the
    reference ``.dict`` file. Returns the prepared directory.
    """
    ref_file = dd.get_ref_file(data)
    genome_name = utils.splitext_plus(os.path.basename(ref_file))[0]
    out_dir = utils.safe_makedir(os.path.join(out_dir, genome_name))
    utils.symlink_plus(ref_file, os.path.join(out_dir, os.path.basename(ref_file)))
    dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
    with open(os.path.join(out_dir, "GenomeSize.xml"), "w") as out_handle:
        out_handle.write('<sequenceSizes genomeName="%s">' % genome_name)
        # Use a context manager so the sequence dictionary handle is closed
        # once the contigs are written instead of being leaked.
        with pysam.AlignmentFile(dict_file) as dict_handle:
            for c in dict_handle.header["SQ"]:
                cur_ploidy = ploidy.get_ploidy([data], region=[c["SN"]])
                # totalBases and knownBases are both set to the contig length
                out_handle.write('<chromosome fileName="%s" contigName="%s" totalBases="%s" knownBases="%s" '
                                 'isCircular="false" ploidy="%s" md5="%s"/>' %
                                 (os.path.basename(ref_file), c["SN"], c["LN"], c["LN"], cur_ploidy, c["M5"]))
        out_handle.write('</sequenceSizes>')
    return out_dir
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Genotype the input alignments with GATK UnifiedGenotyper.

    Returns the output VCF path; an existing output is not regenerated.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
    with file_transaction(items[0], out_file) as tx_out_file:
        # Region specific ploidy is only passed to the restricted GATK;
        # every other GATK type gets a fixed ploidy of 2.
        if broad_runner.gatk_type() == "restricted":
            ploidy_arg = str(ploidy.get_ploidy(items, region))
        else:
            ploidy_arg = "2"
        params.extend(["-T", "UnifiedGenotyper",
                       "-o", tx_out_file,
                       "-ploidy", ploidy_arg,
                       "--genotype_likelihoods_model", "BOTH"])
        broad_runner.run_gatk(params)
    return out_file
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare FreeBayes command line options from the configuration.

    Adds ploidy, the target file or region, and any user options from the
    ``freebayes`` resources. Unless the user options already set it, a
    minimum alternate fraction is added from the ``min_allele_fraction``
    algorithm setting (a percentage, defaulting to 20).
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    if user_opts:
        opts += user_opts
    # Compare whole option tokens from the user options only. A substring
    # search over all joined options also matches "-F" inside unrelated
    # options or target file paths, and fails on non-string option values.
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    if not any(tok.startswith(("--min-alternate-fraction", "-F")) for tok in user_tokens):
        # add minimum reportable allele frequency, using a default of 20 (percent)
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare FreeBayes command line options from the configuration.

    Adds ploidy, the target file or region, and any user options from the
    ``freebayes`` resources. Unless the user options already set it, a
    minimum alternate fraction is added from the ``min_allele_fraction``
    algorithm setting (a percentage, defaulting to 20).
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    user_opts = resources.get("options") or []
    if user_opts:
        opts += user_opts
    # Compare whole option tokens from the user options only. A substring
    # search over all joined options also matches "-F" inside unrelated
    # options or target file paths, and fails on non-string option values.
    user_tokens = [tok for opt in user_opts for tok in str(opt).split()]
    if not any(tok.startswith(("--min-alternate-fraction", "-F")) for tok in user_tokens):
        # add minimum reportable allele frequency, using a default of 20 (percent)
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Build FreeBayes options from ploidy, target regions and user resources.

    The configured variant regions are passed through
    ``bedutils.merge_overlaps`` first, since overlapping regions cause
    FreeBayes to call multiple times. User options configured under the
    ``freebayes`` resources are appended last.
    """
    opts = ["--ploidy", str(ploidy.get_ploidy(items, region))]
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    merged_regions = bedutils.merge_overlaps(configured_regions, items[0])
    target = subset_variant_regions(merged_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if target_is_file:
            opts.extend(["--targets", target])
        else:
            opts.extend(["--region", region_to_freebayes(target)])
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts.extend(resources["options"])
    return opts
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Genotype the input alignments with GATK UnifiedGenotyper.

    Writes an empty VCF instead of running GATK when the region is not a
    list/tuple and at least one BAM has no aligned reads in it. Returns the
    output VCF path; an existing output is not regenerated.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(
        align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file)
    # Read checks are skipped entirely for list/tuple regions
    can_call = (isinstance(region, (list, tuple)) or
                all(has_aligned_reads(x, region) for x in align_bams))
    if not can_call:
        vcfutils.write_empty_vcf(out_file)
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params.extend(["-T", "UnifiedGenotyper",
                       "-o", tx_out_file,
                       "-ploidy", str(ploidy.get_ploidy(items, region)),
                       "--genotype_likelihoods_model", "BOTH"])
        broad_runner.run_gatk(params)
    return out_file
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """Run GATK4 GenotypeGVCFs on a merged GenomicsDB input.

    No core scaling -- not yet supported in GATK4. The run is skipped when
    ``out_file`` exists; the result of bgzipping and indexing ``out_file``
    is returned.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            runner = broad.runner_from_config(data["config"])
            params = [
                "-T", "GenotypeGVCFs",
                "--variant", "gendb://%s" % genomics_db,
                "-R", dd.get_ref_file(data),
                "--output", tx_out_file,
                "-L", bamprep.region_to_gatk(region),
                "-ploidy", str(ploidy.get_ploidy([data], region)),
                # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
                # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
                "--use-new-qual-calculator",
            ]
            cores = dd.get_cores(data)
            # Memory is only scaled up when more than one core is available
            memscale = None
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    Accepts the restricted GATK or GATK4, and uses HaplotypeCallerSpark when
    ``_use_spark`` selects it. When ``out_file`` is not given it is derived
    from the first BAM as ``<base>-variants.vcf.gz``. Calling is skipped if
    the output already exists. Returns the result of bgzipping and indexing
    the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            # User supplied Spark options replace the default local Spark setup below
            resources = config_utils.get_resources("gatk-spark", items[0]["config"])
            spark_opts = [str(x) for x in resources.get("options", [])]
            if _use_spark(num_cores, gatk_type, items, spark_opts):
                params += ["-T", "HaplotypeCallerSpark"]
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % num_cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800",
                               "--conf", "spark.executor.heartbeatInterval=100"]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # User options are added before the gVCF defaults so the checks
            # below can detect settings the user already supplied.
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                # If joint calling parameters not set in user options
                if not any([x in ["--emit-ref-confidence", "-ERC", "--emitRefConfidence"] for x in params]):
                    if gatk_type == "gatk4":
                        params += ["--emit-ref-confidence", "GVCF"]
                    else:
                        params += ["--emitRefConfidence", "GVCF"]
                        params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                if not any([x in ["-GQB"] for x in params]):
                    for boundary in [10, 20, 30, 40, 60, 80]:
                        params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type, items, spark_opts):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            # Memory is only scaled up when more than one core is available
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                      parallel_gc=_use_spark(num_cores, gatk_type, items, spark_opts))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                if (_use_spark(num_cores, gatk_type, items, spark_opts) and
                        str(msg).find("java.lang.UnsupportedOperationException: empty collection") >= 0 and
                        str(msg).find("at org.apache.spark.rdd.RDD") >= 0):
                    vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            # Plain VCF output (see the Spark case above) is compressed and indexed here
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])

    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        # A ".gz" file whose first line starts with the VCF header is plain text
        with open(out_file, "r") as in_handle:
            is_plain_text = in_handle.readline().startswith("##fileformat")
        if is_plain_text:
            # Rename to ".vcf" so the final bgzip_and_index call receives a plain VCF path
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    Accepts the restricted GATK or GATK4, and uses HaplotypeCallerSpark when
    ``_use_spark`` selects it. When ``out_file`` is not given it is derived
    from the first BAM as ``<base>-variants.vcf.gz``. Calling is skipped if
    the output already exists. Returns the result of bgzipping and indexing
    the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if _use_spark(num_cores, gatk_type):
                params += ["-T", "HaplotypeCallerSpark",
                           "--spark-master", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                           "--conf", "spark.driver.host=localhost",
                           "--conf", "spark.network.timeout=800",
                           "--conf", "spark.executor.heartbeatInterval=100"]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                if _supports_avx():
                    # Scale down HMM thread default to avoid overuse of cores
                    # https://github.com/bcbio/bcbio-nextgen/issues/2442
                    if gatk_type == "gatk4":
                        params += ["--native-pair-hmm-threads", "1"]
                    # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                    # GATK3 needs to be explicitly set
                    else:
                        params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += ["--variant_index_type", "LINEAR",
                               "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # Only that one combination is skipped. The previous
                # `not is_joint and gatk_type == "gatk4"` test also dropped
                # ploidy for every GATK3 run.
                if not (is_joint and gatk_type == "gatk4"):
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # User supplied options come last on the command line
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            if gatk_type == "gatk4":
                # GATK4 Spark calling does not support bgzipped output, use plain VCFs
                if is_joint and _use_spark(num_cores, gatk_type):
                    tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf")
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            broad_runner.new_resources("gatk-haplotype")
            # Memory is only scaled up when more than one core is available
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            try:
                broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                      parallel_gc=_use_spark(num_cores, gatk_type))
            except subprocess.CalledProcessError as msg:
                # Spark failing on regions without any reads, write an empty VCF instead
                # https://github.com/broadinstitute/gatk/issues/4234
                if (_use_spark(num_cores, gatk_type) and
                        str(msg).find("java.lang.UnsupportedOperationException: empty collection") >= 0 and
                        str(msg).find("at org.apache.spark.rdd.RDD") >= 0):
                    vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items])
                else:
                    raise
            # Plain VCF output (see the Spark case above) is compressed and indexed here
            if tx_out_file.endswith(".vcf"):
                vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])

    # avoid bug in GATK where files can get output as non-compressed
    if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"):
        # A ".gz" file whose first line starts with the VCF header is plain text
        with open(out_file, "r") as in_handle:
            is_plain_text = in_handle.readline().startswith("##fileformat")
        if is_plain_text:
            # Rename to ".vcf" so the final bgzip_and_index call receives a plain VCF path
            text_out_file = out_file
            out_file = out_file.replace(".vcf.gz", ".vcf")
            shutil.move(text_out_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])