def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Perform SNP genotyping on the given alignment file. """ if out_file is None: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): broad_runner, params = \ _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: params += [ "-T", "UnifiedGenotyper", "-o", tx_out_file, "-ploidy", (str(ploidy.get_ploidy(items, region)) if broad_runner.gatk_type() == "restricted" else "2"), "--genotype_likelihoods_model", "BOTH" ] broad_runner.run_gatk(params) return out_file
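# Every caller in this collection falls back to vcfutils.write_empty_vcf so that downstream
# indexing and merging steps still receive a syntactically valid VCF. The real bcbio helper is
# not shown here; the sketch below only illustrates the minimal output such a helper needs to
# produce (a fileformat line plus a #CHROM header, with optional sample columns). The name
# _write_empty_vcf_sketch is hypothetical and not part of bcbio.
def _write_empty_vcf_sketch(out_file, samples=None):
    """Write a header-only VCF so tools like tabix, GATK and bcftools accept the file."""
    columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    if samples:
        columns += ["FORMAT"] + list(samples)
    with open(out_file, "w") as out_handle:
        out_handle.write("##fileformat=VCFv4.1\n")
        out_handle.write("\t".join(columns) + "\n")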
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files, region=None, out_file=None): """Provide base functionality for prepping and indexing for variant calling. """ config = items[0]["config"] if out_file is None: if vcfutils.is_paired_analysis(align_bams, items): out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"] else: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): logger.debug("Genotyping with {name}: {region} {fname}".format( name=name, region=region, fname=os.path.basename(align_bams[0]))) variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0]) target_regions = subset_variant_regions(variant_regions, region, out_file) if (variant_regions is not None and isinstance(target_regions, basestring) and not os.path.isfile(target_regions)): vcfutils.write_empty_vcf(out_file, config) else: with file_transaction(config, out_file) as tx_out_file: call_fn(align_bams, ref_file, items, target_regions, tx_out_file) if out_file.endswith(".gz"): out_file = vcfutils.bgzip_and_index(out_file, config) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] max_read_depth = "1000" version = programs.jar_versioner("varscan", "VarScan")(config) if version < "v2.3.5": raise IOError("Please install version 2.3.5 or better of VarScan with support " "for multisample calling and indels in VCF format.") varscan_jar = config_utils.get_jar("VarScan", config_utils.get_program("varscan", config, "dir")) jvm_opts = _get_varscan_opts(config) sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = "grep -v -P '\t0\t\t$'" cmd = ("{mpileup} | {remove_zerocoverage} " "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 " " --vcf-sample-list {sample_list} --output-vcf --variants " "> {out_file}") cmd = cmd.format(**locals()) do.run(cmd, "Varscan", None, [do.file_exists(out_file)]) os.remove(sample_list) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file)
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files, region=None, out_file=None): """Provide base functionality for prepping and indexing for variant calling. """ config = items[0]["config"] if out_file is None: if vcfutils.is_paired_analysis(align_bams, items): out_file = "%s-paired-variants.vcf" % config["metadata"]["batch"] else: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): logger.info("Genotyping with {name}: {region} {fname}".format( name=name, region=region, fname=os.path.basename(align_bams[0]))) for x in align_bams: bam.index(x, config) variant_regions = config["algorithm"].get("variant_regions", None) target_regions = subset_variant_regions(variant_regions, region, out_file) if ((variant_regions is not None and isinstance(target_regions, basestring) and not os.path.isfile(target_regions)) or not all(realign.has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: call_fn(align_bams, ref_file, items, target_regions, tx_out_file) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"], ref_file, config) return ann_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): config = items[0]["config"] broad_runner, params = \ _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file) assert broad_runner.gatk_type() == "restricted", \ "Require full version of GATK 2.4+ for haplotype calling" if not all(has_aligned_reads(x, region) for x in align_bams): vcfutils.write_empty_vcf(out_file, config) else: with file_transaction(out_file) as tx_out_file: params += ["-T", "HaplotypeCaller", "-o", tx_out_file, "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC"] # Enable hardware based optimizations in GATK 3.1+ if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"): params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"] broad_runner.new_resources("gatk-haplotype") broad_runner.run_gatk(params) return out_file
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file): """Call variants with samtools in target_regions. Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra Version information from VCF header lines. """ config = items[0]["config"] max_read_depth = "1000" mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config, target_regions=target_regions) bcftools = config_utils.get_program("bcftools", config) bcftools_version = programs.get_version("bcftools", config=config) samtools_version = programs.get_version("samtools", config=config) if LooseVersion(bcftools_version) > LooseVersion("0.1.19"): if LooseVersion(samtools_version) <= LooseVersion("0.1.19"): raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools") bcftools_opts = "call -v -c" else: bcftools_opts = "view -v -c -g" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" vcfutils_cl = config_utils.get_program("vcfutils.pl", config) # XXX Check if we need this when supporting samtools 0.2.0 calling. # 0.1.9 fails on regions without reads. if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams): vcfutils.write_empty_vcf(out_file, config) else: cmd = ("{mpileup} " "| {bcftools} {bcftools_opts} - " "| {vcfutils_cl} varFilter -D {max_read_depth} " "| sed 's/,Version=3>/>/'" "{compress_cmd} > {out_file}") logger.info(cmd.format(**locals())) do.run(cmd.format(**locals()), "Variant calling with samtools", {})
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files, region=None, out_file=None): """Provide base functionality for prepping and indexing for variant calling. """ broad_runner = broad.runner_from_config(config) for x in align_bams: broad_runner.run_fn("picard_index", x) if out_file is None: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): logger.info("Genotyping with {name}: {region} {fname}".format( name=name, region=region, fname=os.path.basename(align_bams[0]))) variant_regions = config["algorithm"].get("variant_regions", None) target_regions = subset_variant_regions(variant_regions, region, out_file) if ((variant_regions is not None and isinstance(target_regions, basestring) and not os.path.isfile(target_regions)) or not all( realign.has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: call_fn(align_bams, ref_file, config, target_regions, tx_out_file) return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, config, assoc_files, region=None, out_file=None): """Provide base functionality for prepping and indexing for variant calling. """ broad_runner = broad.runner_from_config(config) for x in align_bams: broad_runner.run_fn("picard_index", x) if out_file is None: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): logger.info("Genotyping with {name}: {region} {fname}".format(name=name, region=region, fname=os.path.basename(align_bams[0]))) variant_regions = config["algorithm"].get("variant_regions", None) target_regions = subset_variant_regions(variant_regions, region, out_file) if ((variant_regions is not None and isinstance(target_regions, basestring) and not os.path.isfile(target_regions)) or not all(realign.has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: call_fn(align_bams, ref_file, config, target_regions, tx_out_file) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp, ref_file, config) return ann_file
def combine_calls(batch_id, samples, data): """Combine multiple callsets into a final set of merged calls. """ logger.info("Ensemble consensus calls for {0}: {1}".format( batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"]))) edata = copy.deepcopy(data) base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id)) caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id) exist_variants = False for tmp_vrn_file in vrn_files: if vcfutils.vcf_has_variants(tmp_vrn_file): exist_variants = True break if exist_variants: if "classifiers" not in edata["config"]["algorithm"]["ensemble"]: callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata) else: config_file = _write_config_file(batch_id, caller_names, base_dir, edata) callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir, edata["sam_ref"], edata) callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"]) edata["config"]["algorithm"]["variantcaller"] = "ensemble" edata["vrn_file"] = callinfo["vrn_file"] edata["ensemble_bed"] = callinfo["bed_file"] callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate") else: out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id)) vcfutils.write_empty_vcf(out_vcf_file) callinfo = {"variantcaller": "ensemble", "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]), "bed_file": None} return [[batch_id, callinfo]]
def _run_delly(bam_files, sv_type, ref_file, work_dir, items): """Run delly, calling structural variations for the specified type. """ out_file = os.path.join( work_dir, "%s-svs%s.vcf" % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type)) cores = min( utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1), len(bam_files)) if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"): with file_transaction(out_file) as tx_out_file: sv_exclude_bed = get_sv_exclude_file(items) exclude = ["-x", sv_exclude_bed] if sv_exclude_bed else [] cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file ] + exclude + bam_files multi_cmd = "export OMP_NUM_THREADS=%s && " % cores try: do.run(multi_cmd + " ".join(cmd), "delly structural variant") except subprocess.CalledProcessError as msg: # delly returns an error exit code if there are no variants if "No structural variants found" in str(msg): vcfutils.write_empty_vcf(out_file) else: raise
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): config = items[0]["config"] broad_runner, params = \ _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file) assert broad_runner.gatk_type() == "restricted", \ "Require full version of GATK 2.4+ for haplotype calling" if not all(has_aligned_reads(x, region) for x in align_bams): vcfutils.write_empty_vcf(out_file, config) else: with file_transaction(out_file) as tx_out_file: params += ["-T", "HaplotypeCaller", "-o", tx_out_file] #params = _gatk_location_hack(params) broad_runner.new_resources("gatk-haplotype") broad_runner.run_gatk(params) return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files, region=None, out_file=None): """Provide base functionality for prepping and indexing for variant calling. """ config = items[0]["config"] if out_file is None: if vcfutils.is_paired_analysis(align_bams, items): out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"] else: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): logger.debug("Genotyping with {name}: {region} {fname}".format( name=name, region=region, fname=os.path.basename(align_bams[0]))) variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0]) target_regions = subset_variant_regions(variant_regions, region, out_file, items=items) if (variant_regions is not None and isinstance(target_regions, basestring) and not os.path.isfile(target_regions)): vcfutils.write_empty_vcf(out_file, config) else: with file_transaction(config, out_file) as tx_out_file: call_fn(align_bams, ref_file, items, target_regions, tx_out_file) if out_file.endswith(".gz"): out_file = vcfutils.bgzip_and_index(out_file, config) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm.""" if out_file is None: out_file = "%s-paired-variants.vcf" % os.path.splitext( align_bams[0])[0] if not file_exists(out_file): broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return with file_transaction(out_file) as tx_out_file: # Rationale: MuTect writes another table to stdout, # which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] broad_runner.run_mutect(params) return out_file
def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Top level entry to regional de-novo based variant calling with cortex_var. """ if len(align_bams) == 1: align_bam = align_bams[0] config = items[0]["config"] else: raise NotImplementedError("Need to add multisample calling for cortex_var") if out_file is None: out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0] if region is not None: work_dir = safe_makedir(os.path.join(os.path.dirname(out_file), region.replace(".", "_"))) else: work_dir = os.path.dirname(out_file) if not file_exists(out_file): bam.index(align_bam, config) variant_regions = config["algorithm"].get("variant_regions", None) if not variant_regions: raise ValueError("Only support regional variant calling with cortex_var: set variant_regions") target_regions = subset_variant_regions(variant_regions, region, out_file) if os.path.isfile(target_regions): with open(target_regions) as in_handle: regional_vcfs = [ _run_cortex_on_region(x.strip().split("\t")[:3], align_bam, ref_file, work_dir, out_file, config) for x in in_handle ] combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file)) _combine_variants(regional_vcfs, combine_file, ref_file, config) _select_final_variants(combine_file, out_file, config) else: vcfutils.write_empty_vcf(out_file) return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): broad_runner, params = \ _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file) assert broad_runner.gatk_type() == "restricted", \ "Require full version of GATK 2.4+ for haplotype calling" if not all(has_aligned_reads(x, region) for x in align_bams): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: params += ["-T", "HaplotypeCaller", "-o", tx_out_file] #params = _gatk_location_hack(params) broad_runner.new_resources("gatk-haplotype") broad_runner.run_gatk(params) return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): base_config = items[0]["config"] broad_runner = broad.runner_from_config(base_config, "mutect") out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf" in out_file else out_file + "-mutect.vcf") broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_mutect) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return with file_transaction(out_file_mutect) as tx_out_file: # Rationale: MuTect writes another table to stdout, which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] broad_runner.run_mutect(params) _rename_allelic_fraction_field(out_file_mutect,config) disable_SID = True # SID isn't great, so use Scalpel instead if "appistry" not in broad_runner.get_mutect_version() or disable_SID: # Scalpel InDels is_paired = "-I:normal" in params out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") if scalpel.is_installed(items[0]["config"]): with file_transaction(out_file_indels) as tx_out_file2: if not is_paired: scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files, region=region, out_file=tx_out_file2) else: scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=region, out_file=tx_out_file2) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) else: utils.symlink_plus(out_file_mutect, out_file) else: # SomaticIndelDetector modifications out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_indels) with file_transaction(out_file_indels) as tx_out_file: params_indels += ["-o", tx_out_file] broad_runner.run_mutect(params_indels) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) return out_file
def run_cortex(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Top level entry to regional de-novo based variant calling with cortex_var. """ raise NotImplementedError("Cortex currently out of date and needs reworking.") if len(align_bams) == 1: align_bam = align_bams[0] config = items[0]["config"] else: raise NotImplementedError("Need to add multisample calling for cortex_var") if out_file is None: out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0] if region is not None: work_dir = safe_makedir(os.path.join(os.path.dirname(out_file), region.replace(".", "_"))) else: work_dir = os.path.dirname(out_file) if not file_exists(out_file): bam.index(align_bam, config) variant_regions = config["algorithm"].get("variant_regions", None) if not variant_regions: raise ValueError("Only support regional variant calling with cortex_var: set variant_regions") target_regions = subset_variant_regions(variant_regions, region, out_file) if os.path.isfile(target_regions): with open(target_regions) as in_handle: regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam, ref_file, work_dir, out_file, config) for x in in_handle] combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file)) _combine_variants(regional_vcfs, combine_file, ref_file, config) _select_final_variants(combine_file, out_file, config) else: vcfutils.write_empty_vcf(out_file) return out_file
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir): if not utils.file_exists(out_file): with file_transaction(paired.tumor_data, work_dir) as tx_work_dir: workflow_file = _configure_somatic(paired, ref_file, region, out_file, tx_work_dir) if workflow_file: has_variants = True _run_workflow(paired.tumor_data, workflow_file, tx_work_dir) else: has_variants = False vcfutils.write_empty_vcf( out_file, paired.tumor_data["config"], [ dd.get_sample_name(d) for d in [paired.tumor_data, paired.normal_data] ]) if has_variants: var_dir = os.path.join(work_dir, "results", "variants") vcfutils.combine_variant_files([ _postprocess_somatic(os.path.join(var_dir, f), paired) for f in ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"] ], out_file, ref_file, paired.tumor_data["config"], region=region) return out_file
def combine_calls(batch_id, samples, data): """Combine multiple callsets into a final set of merged calls. """ logger.info("Ensemble consensus calls for {0}: {1}".format( batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"]))) edata = copy.deepcopy(data) base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id)) caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id) exist_variants = False for tmp_vrn_file in vrn_files: if vcfutils.vcf_has_variants(tmp_vrn_file): exist_variants = True break if exist_variants: if "classifiers" not in edata["config"]["algorithm"]["ensemble"]: callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata) else: config_file = _write_config_file(batch_id, caller_names, base_dir, edata) callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir, edata["sam_ref"], edata) edata["config"]["algorithm"]["variantcaller"] = "ensemble" edata["vrn_file"] = callinfo["vrn_file"] edata["ensemble_bed"] = callinfo["bed_file"] callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate") else: out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id)) vcfutils.write_empty_vcf(out_vcf_file) callinfo = {"variantcaller": "ensemble", "vrn_file": out_vcf_file, "bed_file": None} return [[batch_id, callinfo]]
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with FreeBayes for paired tumor/normal samples. Sources of options for FreeBayes: mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916 sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/ sga_generate_varcall_makefile.pl#L299 """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: paired = get_paired_bams(align_bams, items) assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering" freebayes = config_utils.get_program("freebayes", config) opts, no_target_regions = _freebayes_options_from_config( items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf( tx_out_file, config, samples=[ x for x in [paired.tumor_name, paired.normal_name] if x ]) else: opts = " ".join(opts) opts += " --min-repeat-entropy 1" opts += " --no-partial-observations" opts = _add_somatic_opts(opts, paired) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" fix_ambig = vcfutils.fix_ambiguous_cl() clean_fmt_cmd = _clean_freebayes_fmt_cl() py_cl = os.path.join(os.path.dirname(sys.executable), "py") cl = ( "{freebayes} -f {ref_file} {opts} " "{paired.tumor_bam} {paired.normal_bam} " """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """ "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' " "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - | vcfuniqalleles " "{compress_cmd} > {tx_out_file}") do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {}) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None, somatic=None): """Detect SNPs and indels with FreeBayes. Performs post-filtering to remove very low quality variants which can cause issues feeding into GATK. Breaks variants into individual allelic primitives for analysis and evaluation. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: freebayes = config_utils.get_program("freebayes", config) input_bams = " ".join("-b %s" % x for x in align_bams) opts, no_target_regions = _freebayes_options_from_config( items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf( tx_out_file, config, samples=[dd.get_sample_name(d) for d in items]) else: opts = " ".join(opts) # Recommended options from 1000 genomes low-complexity evaluation # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ opts += " --min-repeat-entropy 1" # Remove partial observations, which cause a preference for heterozygote calls # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765 opts += " --no-partial-observations" if somatic: opts = _add_somatic_opts(opts, somatic) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" # For multi-sample outputs, ensure consistent order samples = ("-s" + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else "" fix_ambig = vcfutils.fix_ambiguous_cl() py_cl = os.path.join(os.path.dirname(sys.executable), "py") cmd = ( "{freebayes} -f {ref_file} {opts} {input_bams} " """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """ "| {fix_ambig} | bcftools view {samples} -a - | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {}) return out_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] orig_out_file = out_file out_file = orig_out_file.replace(".vcf.gz", ".vcf") max_read_depth = "1000" version = programs.jar_versioner("varscan", "VarScan")(config) if version < "v2.3.6": raise IOError("Please install version 2.3.6 or better of VarScan" " with support for multisample calling and indels" " in VCF format.") varscan_jar = config_utils.get_jar( "VarScan", config_utils.get_program("varscan", config, "dir")) sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = "grep -v -P '\t0\t\t$'" # write a temporary mpileup file so we can check if empty mpfile = "%s.mpileup" % os.path.splitext(out_file)[0] with file_transaction(config, mpfile) as mpfile_tx: cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}") do.run(cmd.format(**locals()), "mpileup for Varscan") if os.path.getsize(mpfile) == 0: write_empty_vcf(out_file) else: with tx_tmpdir(items[0]) as tmp_dir: jvm_opts = _get_varscan_opts(config, tmp_dir) fix_ambig = vcfutils.fix_ambiguous_cl() cmd = ( "cat {mpfile} " "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 " " --vcf-sample-list {sample_list} --output-vcf --variants " "| {fix_ambig} | vcfuniqalleles > {out_file}") do.run(cmd.format(**locals()), "Varscan", None, [do.file_exists(out_file)]) os.remove(sample_list) os.remove(mpfile) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) else: freebayes.clean_vcf_output(out_file, _clean_varscan_line, config) if orig_out_file.endswith(".gz"): vcfutils.bgzip_and_index(out_file, config)
def combine_calls(*args): """Combine multiple callsets into a final set of merged calls. """ if len(args) == 3: is_cwl = False batch_id, samples, data = args caller_names, vrn_files = _organize_variants(samples, batch_id) else: is_cwl = True samples = [utils.to_single_data(x) for x in args] samples = [cwlutils.unpack_tarballs(x, x) for x in samples] data = samples[0] batch_id = data["batch_id"] caller_names = data["variants"]["variantcallers"] vrn_files = data["variants"]["calls"] logger.info("Ensemble consensus calls for {0}: {1}".format( batch_id, ",".join(caller_names))) edata = copy.deepcopy(data) base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id)) if any([vcfutils.vcf_has_variants(f) for f in vrn_files]): # Decompose multiallelic variants and normalize passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False) vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True, nonrefonly=True, work_dir=utils.safe_makedir(os.path.join(base_dir, c))) for c, f in zip(caller_names, vrn_files)] if "classifiers" not in (dd.get_ensemble(edata) or {}): callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata) else: config_file = _write_config_file(batch_id, caller_names, base_dir, edata) callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir, dd.get_ref_file(edata), edata) callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"]) # After decomposing multiallelic variants and normalizing, re-evaluate effects ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data) if ann_ma_file: callinfo["vrn_file"] = ann_ma_file edata["config"]["algorithm"]["variantcaller"] = "ensemble" edata["vrn_file"] = callinfo["vrn_file"] edata["ensemble_bed"] = callinfo["bed_file"] callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate") else: out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id)) vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples]) callinfo = {"variantcaller": "ensemble", "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]), "bed_file": None} if is_cwl: callinfo["batch_samples"] = data["batch_samples"] callinfo["batch_id"] = batch_id return [{"ensemble": callinfo}] else: return [[batch_id, callinfo]]
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] orig_out_file = out_file out_file = orig_out_file.replace(".vcf.gz", ".vcf") max_read_depth = "1000" version = programs.jar_versioner("varscan", "VarScan")(config) if version < "v2.3.6": raise IOError("Please install version 2.3.6 or better of VarScan" " with support for multisample calling and indels" " in VCF format.") varscan_jar = config_utils.get_jar("VarScan", config_utils.get_program("varscan", config, "dir")) sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = "grep -v -P '\t0\t\t$'" # write a temporary mpileup file so we can check if empty mpfile = "%s.mpileup" % os.path.splitext(out_file)[0] with file_transaction(config, mpfile) as mpfile_tx: cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}") do.run(cmd.format(**locals()), "mpileup for Varscan") if os.path.getsize(mpfile) == 0: write_empty_vcf(out_file) else: with tx_tmpdir(items[0]) as tmp_dir: jvm_opts = _get_varscan_opts(config, tmp_dir) fix_ambig = vcfutils.fix_ambiguous_cl() cmd = ("cat {mpfile} " "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 " " --vcf-sample-list {sample_list} --output-vcf --variants " "| {fix_ambig} | vcfuniqalleles > {out_file}") do.run(cmd.format(**locals()), "Varscan", None, [do.file_exists(out_file)]) os.remove(sample_list) os.remove(mpfile) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) else: freebayes.clean_vcf_output(out_file, _clean_varscan_line, config) if orig_out_file.endswith(".gz"): vcfutils.bgzip_and_index(out_file, config)
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None, somatic=None): """Detect SNPs and indels with FreeBayes. Performs post-filtering to remove very low quality variants which can cause issues feeding into GATK. Breaks variants into individual allelic primitives for analysis and evaluation. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: freebayes = config_utils.get_program("freebayes", config) vcffilter = config_utils.get_program("vcffilter", config) input_bams = " ".join("-b %s" % x for x in align_bams) opts, no_target_regions = _freebayes_options_from_config( items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf( tx_out_file, config, samples=[dd.get_sample_name(d) for d in items]) else: opts = " ".join(opts) # Recommended options from 1000 genomes low-complexity evaluation # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ opts += " --min-repeat-entropy 1" if somatic: opts = _add_somatic_opts(opts, somatic) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" fix_ambig = vcfutils.fix_ambiguous_cl() py_cl = os.path.join(os.path.dirname(sys.executable), "py") cmd = ( "{freebayes} -f {ref_file} {opts} {input_bams} | " "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | " "bcftools view -a - 2> /dev/null | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {}) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] orig_out_file = out_file out_file = orig_out_file.replace(".vcf.gz", ".vcf") max_read_depth = "1000" sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }" # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html with tx_tmpdir(items[0]) as tmp_dir: jvm_opts = _get_jvm_opts(config, tmp_dir) opts = " ".join(_varscan_options_from_config(config)) min_af = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) py_cl = os.path.join(os.path.dirname(sys.executable), "py") export = utils.local_path_export() cmd = ( "{export} {mpileup} | {remove_zerocoverage} | " "ifne varscan {jvm_opts} mpileup2cns {opts} " "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | " """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """ "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | " "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}" ) do.run(cmd.format(**locals()), "Varscan", None, [do.file_exists(out_file)]) os.remove(sample_list) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) if orig_out_file.endswith(".gz"): vcfutils.bgzip_and_index(out_file, config)
def clean_ras(): """Add VCF headers to RAS VCFs and bgzip and tabix. """ for in_vcf in (x for x in glob.glob(os.path.join(in_vcf_dir, "*.vcf")) if x.find("NA1287") == -1): out_vcf = os.path.join(out_vcf_dir, os.path.basename(in_vcf).replace(".txt", "")) if not os.path.exists(out_vcf) and not os.path.exists(out_vcf + ".gz"): vcfutils.write_empty_vcf(out_vcf, samples=[os.path.basename(in_vcf).replace(".txt.vcf", "")]) with open(out_vcf, "a") as out_handle: with open(in_vcf) as in_handle: for line in in_handle: if line.startswith("chr"): line = line[3:] out_handle.write(line) vcfutils.bgzip_and_index(out_vcf)
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir): if not utils.file_exists(out_file): with file_transaction(items[0], work_dir) as tx_work_dir: workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir) if workflow_file: _run_workflow(items[0], workflow_file, tx_work_dir) else: vcfutils.write_empty_vcf(out_file, items[0]["config"], [dd.get_sample_name(d) for d in items]) raw_file = os.path.join(work_dir, "results", "variants", "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz") utils.copy_plus(raw_file, out_file) # Remove files with relative symlinks utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz")) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _bgzip_and_clean(bcf_file, items): """Create a clean bgzipped VCF output file from bcf for downstream processing. Also corrects problems with missing likelihoods: https://github.com/dellytools/delly/issues/37 GATK does not like missing GLs like '.,.,.'. This converts them to the recognized '.' """ out_file = "%s.vcf.gz" % (utils.splitext_plus(bcf_file)[0]) if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: if not utils.file_exists(bcf_file): vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items]) else: cmd = ("bcftools view {bcf_file} | sed 's/\.,\.,\././' | bgzip -c > {tx_out_file}") do.run(cmd.format(**locals()), "Convert and clean delly output") return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with FreeBayes for paired tumor/normal samples. Sources of options for FreeBayes: mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916 sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/ sga_generate_varcall_makefile.pl#L299 """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: paired = get_paired_bams(align_bams, items) assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering" freebayes = config_utils.get_program("freebayes", config) opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf(tx_out_file, config, samples=[x for x in [paired.tumor_name, paired.normal_name] if x]) else: opts = " ".join(opts) opts += " --min-repeat-entropy 1" opts += " --no-partial-observations" opts = _add_somatic_opts(opts, paired) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" # For multi-sample outputs, ensure consistent order samples = ("-s " + ",".join([dd.get_sample_name(d) for d in items])) if len(items) > 1 else "" fix_ambig = vcfutils.fix_ambiguous_cl() bcbio_py = sys.executable py_cl = os.path.join(os.path.dirname(sys.executable), "py") cl = ("{freebayes} -f {ref_file} {opts} " "{paired.tumor_bam} {paired.normal_bam} " """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """ """| {bcbio_py} -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """ "| {fix_ambig} | bcftools view {samples} -a - | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null " "{compress_cmd} > {tx_out_file}") do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {}) return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm. """ if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not file_exists(out_file): base_config = items[0]["config"] broad_runner = broad.runner_from_config(base_config, "mutect") if "appistry" in broad_runner.get_mutect_version(): out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf" in out_file else out_file + "-mutect.vcf") else: out_file_mutect = out_file broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_mutect) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return with file_transaction(out_file_mutect) as tx_out_file: # Rationale: MuTect writes another table to stdout, which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] broad_runner.run_mutect(params) if "appistry" in broad_runner.get_mutect_version(): # SomaticIndelDetector modifications out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_indels) with file_transaction(out_file_indels) as tx_out_file: params_indels += ["-o", tx_out_file] broad_runner.run_mutect(params_indels) out_file = vcfutils.combine_variant_files( orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) return out_file
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base, config): """Run cortex on a specified chromosome start/end region. """ kmers = [31, 51, 71] min_reads = 1750 cortex_dir = config_utils.get_program("cortex", config, "dir") stampy_dir = config_utils.get_program("stampy", config, "dir") vcftools_dir = config_utils.get_program("vcftools", config, "dir") if cortex_dir is None or stampy_dir is None: raise ValueError( "cortex_var requires path to pre-built cortex and stampy") region_str = "{0}-{1}-{2}".format(*region) base_dir = safe_makedir(os.path.join(work_dir, region_str)) try: out_vcf_base = os.path.join( base_dir, "{0}-{1}".format( os.path.splitext(os.path.basename(out_file_base))[0], region_str)) out_file = os.path.join( work_dir, os.path.basename("{0}.vcf".format(out_vcf_base))) if not file_exists(out_file): fastq = _get_fastq_in_region(region, align_bam, out_vcf_base) if _count_fastq_reads(fastq, min_reads) < min_reads: vcfutils.write_empty_vcf(out_file) else: local_ref, genome_size = _get_local_ref( region, ref_file, out_vcf_base) indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers) cortex_out = _run_cortex( fastq, indexes, { "kmers": kmers, "genome_size": genome_size, "sample": get_sample_name(align_bam) }, out_vcf_base, { "cortex": cortex_dir, "stampy": stampy_dir, "vcftools": vcftools_dir }, config) if cortex_out: _remap_cortex_out(cortex_out, region, out_file) else: vcfutils.write_empty_vcf(out_file) finally: if os.path.exists(base_dir): shutil.rmtree(base_dir) return [region[0], int(region[1]), int(region[2]), out_file]
def _run_delly(bam_files, sv_type, ref_file, work_dir): """Run delly, calling structural variations for the specified type. """ out_file = os.path.join(work_dir, "%s-svs%s.vcf" % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type)) if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"): with file_transaction(out_file) as tx_out_file: cmd = ["delly", "-g", ref_file, "-o", tx_out_file] + bam_files try: do.run(cmd, "delly structural variant") except subprocess.CalledProcessError as msg: # delly returns an error exit code if there are no variants if "No structural variants found" in str(msg): vcfutils.write_empty_vcf(out_file) else: raise
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items): """Run lumpy-sv using smoove. """ batch = sshared.get_cur_batch(items) ext = "-%s-svs" % batch if batch else "-svs" name = "%s%s" % (dd.get_sample_name(items[0]), ext) out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name) sv_exclude_bed = sshared.prepare_exclude_file(items, out_file) old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz" % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext)) if utils.file_exists(old_out_file): return old_out_file, sv_exclude_bed if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: cores = dd.get_num_cores(items[0]) out_dir = os.path.dirname(tx_out_file) ref_file = dd.get_ref_file(items[0]) full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, os.path.dirname(tx_out_file))) std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"] def _is_std_exclude(n): clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes] return any([n.startswith(x) or n.endswith(x) for x in clean_excludes]) exclude_chrs = [c.name for c in ref.file_contigs(ref_file) if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)] exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs) exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else "" tempdir = os.path.dirname(tx_out_file) cmd = ("export TMPDIR={tempdir} && " "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} " "--name {name} --outdir {out_dir} " "{exclude_bed} {exclude_chrs} {full_bams}") with utils.chdir(tempdir): try: do.run(cmd.format(**locals()), "smoove lumpy calling", items[0]) except subprocess.CalledProcessError as msg: if _allowed_errors(str(msg)): vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"], samples=[dd.get_sample_name(d) for d in items]) else: logger.exception() raise vcfutils.bgzip_and_index(out_file, items[0]["config"]) return out_file, sv_exclude_bed
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items): """Run lumpy-sv using smoove. """ batch = sshared.get_cur_batch(items) ext = "-%s-svs" % batch if batch else "-svs" name = "%s%s" % (dd.get_sample_name(items[0]), ext) out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name) sv_exclude_bed = sshared.prepare_exclude_file(items, out_file) old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz" % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext)) if utils.file_exists(old_out_file): return old_out_file, sv_exclude_bed if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: cores = dd.get_num_cores(items[0]) out_dir = os.path.dirname(tx_out_file) ref_file = dd.get_ref_file(items[0]) full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, os.path.dirname(tx_out_file))) std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"] def _is_std_exclude(n): clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes] return any([n.startswith(x) or n.endswith(x) for x in clean_excludes]) exclude_chrs = [c.name for c in ref.file_contigs(ref_file) if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)] exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs) exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else "" tempdir = os.path.dirname(tx_out_file) cmd = ("export TMPDIR={tempdir} && " "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} " "--name {name} --outdir {out_dir} " "{exclude_bed} {exclude_chrs} {full_bams}") with utils.chdir(tempdir): try: do.run(cmd.format(**locals()), "smoove lumpy calling", items[0]) except subprocess.CalledProcessError as msg: if _allowed_errors(msg): vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"], samples=[dd.get_sample_name(d) for d in items]) else: logger.exception() raise vcfutils.bgzip_and_index(out_file, items[0]["config"]) return out_file, sv_exclude_bed
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None, somatic=None): """Detect SNPs and indels with FreeBayes. Performs post-filtering to remove very low quality variants which can cause issues feeding into GATK. Breaks variants into individual allelic primitives for analysis and evaluation. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: freebayes = config_utils.get_program("freebayes", config) input_bams = " ".join("-b %s" % x for x in align_bams) opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region) if no_target_regions: vcfutils.write_empty_vcf(tx_out_file, config, samples=[dd.get_sample_name(d) for d in items]) else: opts = " ".join(opts) # Recommended options from 1000 genomes low-complexity evaluation # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ opts += " --min-repeat-entropy 1" # Remove partial observations, which cause a preference for heterozygote calls # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765 opts += " --no-partial-observations" if somatic: opts = _add_somatic_opts(opts, somatic) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" fix_ambig = vcfutils.fix_ambiguous_cl() clean_fmt_cmd = _clean_freebayes_fmt_cl() py_cl = os.path.join(os.path.dirname(sys.executable), "py") cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} " """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """ "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | " "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | " "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | " "vt normalize -n -r {ref_file} -q - | vcfuniqalleles " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {}) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _bgzip_and_clean(bcf_file, items): """Create a clean bgzipped VCF output file from bcf for downstream processing. Also corrects problems with missing likelihoods: https://github.com/dellytools/delly/issues/37 GATK does not like missing GLs like '.,.,.'. This converts them to the recognized '.' """ out_file = "%s.vcf.gz" % (utils.splitext_plus(bcf_file)[0]) if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: if not utils.file_exists(bcf_file): vcfutils.write_empty_vcf( tx_out_file, samples=[dd.get_sample_name(d) for d in items]) else: cmd = (f"bcftools view {bcf_file} | " fr"sed 's/\.,\.,\././' | " f"bgzip -c > {tx_out_file}") do.run(cmd, "Convert and clean delly output") return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm.""" if out_file is None: out_file = "%s-paired-variants.vcf" % os.path.splitext( align_bams[0])[0] if not file_exists(out_file): broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return with file_transaction(out_file) as tx_out_file: # Rationale: MuTect writes another table to stdout, # which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] try: broad_runner.run_mutect(params) except CalledProcessError as error: java_exception = _parse_gatk_java_error_string(error.cmd) #HACK: Currently MuTect bails out on certain small BAM files # Until the issue is fixed by Broad, this specific exception # will be ignored. All the other exceptions will be raised # correctly. if java_exception in _PASS_EXCEPTIONS: vcfutils.write_empty_vcf(tx_out_file) return else: raise return out_file
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] orig_out_file = out_file out_file = orig_out_file.replace(".vcf.gz", ".vcf") max_read_depth = "1000" sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = r"{ ifne grep -v -P '\t0\t\t$' || true; }" # we use ifne from moreutils to ensure we process only on files with input, skipping otherwise # http://manpages.ubuntu.com/manpages/natty/man1/ifne.1.html with tx_tmpdir(items[0]) as tmp_dir: jvm_opts = _get_jvm_opts(config, tmp_dir) opts = " ".join(_varscan_options_from_config(config)) min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) py_cl = os.path.join(os.path.dirname(sys.executable), "py") export = utils.local_path_export() cmd = ("{export} {mpileup} | {remove_zerocoverage} | " "ifne varscan {jvm_opts} mpileup2cns {opts} " "--vcf-sample-list {sample_list} --min-var-freq {min_af} --output-vcf --variants | " """{py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' | """ "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x)' | " "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles > {out_file}") do.run(cmd.format(**locals()), "Varscan", None, [do.file_exists(out_file)]) os.remove(sample_list) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) if orig_out_file.endswith(".gz"): vcfutils.bgzip_and_index(out_file, config)
def _run_germline(bam_file, data, ref_file, region, out_file): """Single sample germline variant calling. """ work_dir = utils.safe_makedir("%s-work" % utils.splitext_plus(out_file)[0]) region_bed = strelka2.get_region_bed(region, [data], out_file, want_gzip=False) example_dir = _make_examples(bam_file, data, ref_file, region_bed, out_file, work_dir) if _has_candidate_variants(example_dir): tfrecord_file = _call_variants(example_dir, region_bed, data, out_file) return _postprocess_variants(tfrecord_file, data, ref_file, out_file) else: return vcfutils.write_empty_vcf(out_file, data["config"], [dd.get_sample_name(data)])
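# Hypothetical sketch of the _has_candidate_variants guard referenced above;
# the real helper may inspect the make_examples output differently. The intent:
# only run the expensive call_variants step when make_examples actually wrote
# non-empty candidate TFRecord shards for the region, otherwise fall back to an
# empty VCF.
import glob
import os

def _example_has_candidate_variants(example_dir):
    """Return True if any examples TFRecord shard in example_dir is non-empty."""
    shards = glob.glob(os.path.join(example_dir, "*.tfrecord*"))
    return any(os.path.getsize(f) > 0 for f in shards)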
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(out_file) as tx_out_file:
            exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
            cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
            multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
            try:
                do.run(multi_cmd + " ".join(cmd), "delly structural variant")
            except subprocess.CalledProcessError as msg:
                # delly returns an error exit code if there are no variants
                if "No structural variants found" in str(msg):
                    vcfutils.write_empty_vcf(out_file)
                else:
                    raise
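# Generic sketch (stdlib only, not bcbio code) of the recovery pattern above:
# some callers, like older delly, exit non-zero when a region simply has no
# variants, so the wrapper inspects the failure output and emits a minimal
# valid VCF instead of failing the whole pipeline.
import subprocess

MINIMAL_VCF_HEADER = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"

def run_or_write_empty(cmd, out_file, no_variant_msg="No structural variants found"):
    """Run a shell command; if it fails with no_variant_msg, write a header-only VCF."""
    try:
        subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        if no_variant_msg in (exc.stderr or "") + (exc.stdout or ""):
            with open(out_file, "w") as handle:
                handle.write(MINIMAL_VCF_HEADER)
        else:
            raise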
def unified_genotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Perform SNP genotyping on the given alignment file. """ if out_file is None: out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): broad_runner, params = \ _shared_gatk_call_prep(align_bams, ref_file, items[0]["config"], assoc_files["dbsnp"], region, out_file) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) else: with file_transaction(out_file) as tx_out_file: params += ["-T", "UnifiedGenotyper", "-o", tx_out_file, "--genotype_likelihoods_model", "BOTH"] broad_runner.run_gatk(params) return out_file
def combine_calls(batch_id, samples, data): """Combine multiple callsets into a final set of merged calls. """ logger.info("Ensemble consensus calls for {0}: {1}".format( batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"]))) edata = copy.deepcopy(data) base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id)) caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id) exist_variants = False for tmp_vrn_file in vrn_files: if vcfutils.vcf_has_variants(tmp_vrn_file): exist_variants = True break if exist_variants: # Decompose multiallelic variants and normalize passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False) vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True) for f in vrn_files] if "classifiers" not in edata["config"]["algorithm"]["ensemble"]: callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata) else: config_file = _write_config_file(batch_id, caller_names, base_dir, edata) callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir, edata["sam_ref"], edata) callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"]) # After decomposing multiallelic variants and normalizing, re-evaluate effects ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data) if ann_ma_file: callinfo["vrn_file"] = ann_ma_file edata["config"]["algorithm"]["variantcaller"] = "ensemble" edata["vrn_file"] = callinfo["vrn_file"] edata["ensemble_bed"] = callinfo["bed_file"] callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate") else: out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id)) vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples]) callinfo = {"variantcaller": "ensemble", "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]), "bed_file": None} return [[batch_id, callinfo]]
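# Illustrative sketch (not the bcbio implementation) of the vcf_has_variants
# check driving the ensemble branch above: the consensus step only runs when at
# least one input callset contains a non-header record; otherwise an empty
# ensemble VCF is written directly.
import gzip

def _example_vcf_has_variants(vcf_file):
    """Return True if a plain or gzipped VCF contains any non-header line."""
    opener = gzip.open if vcf_file.endswith(".gz") else open
    with opener(vcf_file, "rt") as handle:
        return any(line.strip() and not line.startswith("#") for line in handle)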
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Use a distinct name for the vcfutils.pl script path so it does not shadow
    # the vcfutils module needed for write_empty_vcf below
    vcfutils_cl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_cl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})
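# Small worked example of the LooseVersion gate above. distutils is assumed, as
# in the original code; it is deprecated and removed in Python 3.12+, but the
# comparisons behave as shown on the versions this code targets.
from distutils.version import LooseVersion

assert LooseVersion("0.2.0") > LooseVersion("0.1.19")        # new bcftools: use 'call -v -c'
assert LooseVersion("0.1.19") <= LooseVersion("0.1.19")      # old samtools with new bcftools: rejected
assert not LooseVersion("0.1.19") > LooseVersion("0.1.19")   # old bcftools: use 'view -v -c -g'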
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm. """ if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): base_config = items[0]["config"] broad_runner = broad.runner_from_config(base_config, "mutect") if "appistry" in broad_runner.get_mutect_version(): out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf" in out_file else out_file + "-mutect.vcf") else: out_file_mutect = out_file broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_mutect) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return with file_transaction(out_file_mutect) as tx_out_file: # Rationale: MuTect writes another table to stdout, which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] broad_runner.run_mutect(params) if "appistry" in broad_runner.get_mutect_version(): # SomaticIndelDetector modifications out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_indels) with file_transaction(out_file_indels) as tx_out_file: params_indels += ["-o", tx_out_file] broad_runner.run_mutect(params_indels) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) return out_file
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(work_dir, "%s%s-%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], ext, chrom, sv_type))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            names = [tz.get_in(["rgnames", "sample"], x) for x in items]
            if not sshared.has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file, samples=names)
            else:
                exclude = ["-x", _delly_exclude_file(items, out_file, chrom)]
                map_qual = ["--map-qual", "1"]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && export LC_ALL=C && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file, samples=names)
                except subprocess.CalledProcessError as msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file, samples=names)
                    else:
                        raise
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base, config):
    """Run cortex on a specified chromosome start/end region.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config_utils.get_program("cortex", config, "dir")
    stampy_dir = config_utils.get_program("stampy", config, "dir")
    vcftools_dir = config_utils.get_program("vcftools", config, "dir")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError("cortex_var requires path to pre-built cortex and stampy")
    region_str = "{0}-{1}-{2}".format(*region)
    base_dir = safe_makedir(os.path.join(work_dir, region_str))
    try:
        out_vcf_base = os.path.join(base_dir, "{0}-{1}".format(
            os.path.splitext(os.path.basename(out_file_base))[0], region_str))
        out_file = os.path.join(work_dir, os.path.basename("{0}.vcf".format(out_vcf_base)))
        if not file_exists(out_file):
            fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
            if _count_fastq_reads(fastq, min_reads) < min_reads:
                vcfutils.write_empty_vcf(out_file)
            else:
                local_ref, genome_size = _get_local_ref(region, ref_file, out_vcf_base)
                indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers)
                cortex_out = _run_cortex(fastq, indexes,
                                         {"kmers": kmers, "genome_size": genome_size,
                                          "sample": get_sample_name(align_bam)},
                                         out_vcf_base,
                                         {"cortex": cortex_dir, "stampy": stampy_dir,
                                          "vcftools": vcftools_dir},
                                         config)
                if cortex_out:
                    _remap_cortex_out(cortex_out, region, out_file)
                else:
                    vcfutils.write_empty_vcf(out_file)
    finally:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    return [region[0], int(region[1]), int(region[2]), out_file]
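# Hypothetical sketch of the _count_fastq_reads helper referenced above (the
# real implementation may differ): count records only up to the cap, since the
# caller only needs to know whether min_reads was reached before running the
# expensive cortex assembly.
def _example_count_fastq_reads(fastq_file, cap):
    """Count FASTQ records in fastq_file, stopping once cap is reached."""
    count = 0
    with open(fastq_file) as handle:
        for i, _ in enumerate(handle):
            if (i + 1) % 4 == 0:   # each FASTQ record spans four lines
                count += 1
                if count >= cap:
                    break
    return count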
def _varscan_work(align_bams, ref_file, items, target_regions, out_file): """Perform SNP and indel genotyping with VarScan. """ config = items[0]["config"] max_read_depth = "1000" version = programs.jar_versioner("varscan", "VarScan")(config) if version < "v2.3.6": raise IOError("Please install version 2.3.6 or better of VarScan" " with support for multisample calling and indels" " in VCF format.") varscan_jar = config_utils.get_jar( "VarScan", config_utils.get_program("varscan", config, "dir")) jvm_opts = _get_varscan_opts(config) sample_list = _create_sample_list(align_bams, out_file) mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config, target_regions=target_regions, want_bcf=False) # VarScan fails to generate a header on files that start with # zerocoverage calls; strip these with grep, we're not going to # call on them remove_zerocoverage = "grep -v -P '\t0\t\t$'" cmd = ( "{mpileup} | {remove_zerocoverage} " "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 " " --vcf-sample-list {sample_list} --output-vcf --variants " "> {out_file}") cmd = cmd.format(**locals()) do.run(cmd, "Varscan".format(**locals()), None, [do.file_exists(out_file)]) os.remove(sample_list) # VarScan can create completely empty files in regions without # variants, so we create a correctly formatted empty file if os.path.getsize(out_file) == 0: write_empty_vcf(out_file)
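# Side note on the version guard above: comparing version strings
# lexicographically works for these particular values ("v2.3.5" < "v2.3.6") but
# breaks once a component reaches two digits. A tuple-based comparison is safer;
# this is an illustrative alternative, not what the code above does.
def _version_tuple(version):
    """Turn 'v2.3.6' into (2, 3, 6) for numeric comparison."""
    return tuple(int(p) for p in version.lstrip("v").split("."))

assert "v2.3.10" < "v2.3.6"                                    # lexicographic: wrong ordering
assert _version_tuple("v2.3.10") > _version_tuple("v2.3.6")    # numeric: correct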
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): config = items[0]["config"] broad_runner, params = \ _shared_gatk_call_prep(align_bams, items, ref_file, assoc_files["dbsnp"], region, out_file) assert broad_runner.gatk_type() == "restricted", \ "Require full version of GATK 2.4+ for haplotype calling" if not all(has_aligned_reads(x, region) for x in align_bams): vcfutils.write_empty_vcf(out_file, config) else: with file_transaction(out_file) as tx_out_file: params += [ "-T", "HaplotypeCaller", "-o", tx_out_file, "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] # Enable hardware based optimizations in GATK 3.1+ if LooseVersion(broad_runner.gatk_major_version() ) >= LooseVersion("3.1"): params += [ "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING" ] broad_runner.new_resources("gatk-haplotype") broad_runner.run_gatk(params) return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's HaplotypeCaller. This requires the full non open-source version of GATK. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): num_cores = dd.get_num_cores(items[0]) broad_runner, params = \ _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores) gatk_type = broad_runner.gatk_type() assert gatk_type in ["restricted", "gatk4"], \ "Require full version of GATK 2.4+, or GATK4 for haplotype calling" with file_transaction(items[0], out_file) as tx_out_file: resources = config_utils.get_resources("gatk-spark", items[0]["config"]) spark_opts = [str(x) for x in resources.get("options", [])] if _use_spark(num_cores, gatk_type, items, spark_opts): params += ["-T", "HaplotypeCallerSpark"] if spark_opts: params += spark_opts else: params += ["--spark-master", "local[%s]" % num_cores, "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file), "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800", "--conf", "spark.executor.heartbeatInterval=100"] else: params += ["-T", "HaplotypeCaller"] params += ["--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC"] # Enable hardware based optimizations in GATK 3.1+ if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"): if _supports_avx(): # Scale down HMM thread default to avoid overuse of cores # https://github.com/bcbio/bcbio-nextgen/issues/2442 if gatk_type == "gatk4": params += ["--native-pair-hmm-threads", "1"] # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE # GATK3 needs to be explicitly set else: params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"] resources = config_utils.get_resources("gatk-haplotype", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] # Prepare gVCFs if doing joint calling is_joint = False if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items): is_joint = True # If joint calling parameters not set in user options if not any([x in ["--emit-ref-confidence", "-ERC", "--emitRefConfidence"] for x in params]): if gatk_type == "gatk4": params += ["--emit-ref-confidence", "GVCF"] else: params += ["--emitRefConfidence", "GVCF"] params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"] # Set GQ banding to not be single GQ resolution # No recommended default but try to balance resolution and size # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands if not any([x in ["-GQB"] for x in params]): for boundary in [10, 20, 30, 40, 60, 80]: params += ["-GQB", str(boundary)] # Enable non-diploid calling in GATK 3.3+ if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"): params += ["-ploidy", str(ploidy.get_ploidy(items, region))] if gatk_type == "gatk4": # GATK4 Spark calling does not support bgzipped output, use plain VCFs if is_joint and _use_spark(num_cores, gatk_type, items, spark_opts): tx_out_file = tx_out_file.replace(".vcf.gz", ".vcf") params += ["--output", tx_out_file] else: params += ["-o", tx_out_file] broad_runner.new_resources("gatk-haplotype") memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None try: broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, 
parallel_gc=_use_spark(num_cores, gatk_type, items, spark_opts)) except subprocess.CalledProcessError as msg: # Spark failing on regions without any reads, write an empty VCF instead # https://github.com/broadinstitute/gatk/issues/4234 if (_use_spark(num_cores, gatk_type, items, spark_opts) and str(msg).find("java.lang.UnsupportedOperationException: empty collection") >= 0 and str(msg).find("at org.apache.spark.rdd.RDD") >= 0): vcfutils.write_empty_vcf(tx_out_file, samples=[dd.get_sample_name(d) for d in items]) else: raise if tx_out_file.endswith(".vcf"): vcfutils.bgzip_and_index(tx_out_file, items[0]["config"]) # avoid bug in GATK where files can get output as non-compressed if out_file.endswith(".gz") and not os.path.exists(out_file + ".tbi"): with open(out_file, "r") as in_handle: is_plain_text = in_handle.readline().startswith("##fileformat") if is_plain_text: text_out_file = out_file out_file = out_file.replace(".vcf.gz", ".vcf") shutil.move(text_out_file, out_file) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
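# Illustrative sketch (not bcbio code) of the final safeguard above: a file
# named *.vcf.gz that actually begins with plain-text '##fileformat' was never
# compressed, so it has to be renamed and re-run through bgzip/tabix. Checking
# the gzip magic bytes is an equivalent, slightly more direct test.
def _example_is_gzipped(path):
    """Return True if path starts with the gzip magic bytes (bgzip output included)."""
    with open(path, "rb") as handle:
        return handle.read(2) == b"\x1f\x8b"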
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region, out_file, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [] # for individual sample names, given batch calling may be required for bamfile, item in itertools.izip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = (" ".join(_vardict_options_from_config(items, config, out_file, target)) if _is_bed_file(target) else "") vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else "" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd()) cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd}") if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
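# Sketch of the coordinate conversion the merge comment above refers to; the
# real bamprep.region_to_gatk may differ in details. bcbio passes regions around
# as 0-based, end-exclusive (chrom, start, end) tuples, while GATK-style region
# strings are 1-based and end-inclusive.
def _example_region_to_gatk(region):
    """Convert ('chr1', 999, 2000) to 'chr1:1000-2000'."""
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return "%s:%s-%s" % (chrom, start + 1, end)
    return region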
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Run the MuTect paired analysis algorithm. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not file_exists(out_file): base_config = items[0]["config"] broad_runner = broad.runner_from_config(base_config, "mutect") out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf" in out_file else out_file + "-mutect.vcf") broad_runner, params = \ _mutect_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_mutect) if (not isinstance(region, (list, tuple)) and not all(has_aligned_reads(x, region) for x in align_bams)): vcfutils.write_empty_vcf(out_file) return out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect) with file_transaction(config, out_file_orig) as tx_out_file: # Rationale: MuTect writes another table to stdout, which we don't need params += ["--vcf", tx_out_file, "-o", os.devnull] broad_runner.run_mutect(params) is_paired = "-I:normal" in params out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired) indelcaller = vcfutils.get_indelcaller(base_config) if "scalpel" in indelcaller.lower(): # Scalpel InDels out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") if scalpel.is_installed(items[0]["config"]): with file_transaction(config, out_file_indels) as tx_out_file2: if not is_paired: vcfutils.check_paired_problems(items) scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files, region=region, out_file=tx_out_file2) else: scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=region, out_file=tx_out_file2) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) else: utils.symlink_plus(out_file_mutect, out_file) elif "pindel" in indelcaller.lower(): out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") if pindel.is_installed(items[0]["config"]): pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files, region=region, out_file=out_file_indels) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=ref_file, config=items[0]["config"], region=region) else: utils.symlink_plus(out_file_mutect, out_file) elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower()) and "appistry" in broad_runner.get_mutect_version()): # SomaticIndelDetector InDels out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf") if "vcf" in out_file else out_file + "-somaticIndels.vcf") params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files, region, out_file_indels) with file_transaction(config, out_file_indels) as tx_out_file: params_indels += ["-o", tx_out_file] broad_runner.run_mutect(params_indels) out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels], out_file=out_file, ref_file=items[0]["sam_ref"], config=items[0]["config"], region=region) else: utils.symlink_plus(out_file_mutect, out_file) return out_file
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions( items[0]), region, out_file, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf( tx_out_file, config, samples=[ x for x in [paired.tumor_name, paired.normal_name] if x ]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) strandbias = "testsomatic.R" var2vcf = "var2vcf_paired.pl" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts = " ".join( _vardict_options_from_config(items, config, out_file, target)) coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if highdepth.get_median_coverage( items[0]) > 5000 else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(( "config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ( "| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % os.path.join(os.path.dirname(sys.executable), "py")) freq_filter = ( "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), 0, dd.get_aligner(paired.tumor_data))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname( utils.Rscript_cmd()) cmd = ( "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| {strandbias} " "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "{freq_filter} " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
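# Illustrative sketch (not bcbio code) of the "soft filter" applied above: the
# bcftools expression marks records whose STATUS INFO field is not somatic by
# adding REJECT to FILTER, while keeping the record in the file for later
# inspection. Parsing here is deliberately minimal.
def _example_soft_filter_non_somatic(vcf_lines):
    """Yield VCF lines, adding a REJECT filter to non-somatic VarDict calls."""
    for line in vcf_lines:
        if line.startswith("#"):
            yield line
            continue
        fields = line.rstrip("\n").split("\t")
        info = dict(kv.split("=", 1) for kv in fields[7].split(";") if "=" in kv)
        if not info.get("STATUS", "").endswith("Somatic"):
            fields[6] = "REJECT" if fields[6] in (".", "PASS", "") else fields[6] + ";REJECT"
        yield "\t".join(fields) + "\n"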