def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Collapse overlapping intervals of a BED file using ``bedtools merge``.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    The result is written as ``<name>-merged.bed`` into ``out_dir`` when
    given, else into ``<work dir>/bedprep`` when ``data`` has a work
    directory, else next to ``in_file``. It is only regenerated when out of
    date relative to ``in_file``. ``vcfutils.bgzip_and_index`` is then run on
    it with ``remove_orig=False``.

    Returns the merged BED path, or None when ``in_file`` is empty.
    """
    config = data["config"]
    if not in_file:
        return None
    bedtools = config_utils.get_program("bedtools", config, default="bedtools")
    work_dir = tz.get_in(["dirs", "work"], data)
    if out_dir:
        bedprep_dir = out_dir
    elif work_dir:
        bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
    else:
        bedprep_dir = os.path.dirname(in_file)
    merged_name = "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0])
    out_file = os.path.join(bedprep_dir, merged_name)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # A falsy distance (None, 0) leaves bedtools on its default.
            distance_opt = "-d %s" % distance if distance else ""
            cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}".format(
                bedtools=bedtools, distance=distance_opt,
                in_file=in_file, tx_out_file=tx_out_file)
            do.run(cmd, "Prepare merged BED file", data)
    vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
    return out_file
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB workspace for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since GenomicsDB
    databases cannot be moved to new locations: the import writes into the
    final ``<out_file base>_genomicsdb`` directory. Half-finished databases
    (as reported by ``_incomplete_genomicsdb``) are removed and rebuilt:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*)). The command is run from the output
    directory using a relative workspace path.

    Returns the workspace directory.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if os.path.exists(out_dir) and not _incomplete_genomicsdb(out_dir):
        return out_dir
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    # The transactional path is deliberately unused, see the docstring.
    with utils.chdir(os.path.dirname(out_file)), file_transaction(data, out_dir):
        runner = broad.runner_from_config(data["config"])
        cores = dd.get_cores(data)
        workspace = os.path.relpath(out_dir, os.getcwd())
        params = ["-T", "GenomicsDBImport",
                  "--reader-threads", str(cores),
                  "--genomicsdb-workspace-path", workspace,
                  "-L", bamprep.region_to_gatk(region)]
        for vrn_file in vrn_files:
            vcfutils.bgzip_and_index(vrn_file, data["config"])
            params.extend(["--variant", vrn_file])
        if cores > 1:
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
        else:
            memscale = None
        runner.run_gatk(params, memscale=memscale)
    return out_dir
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.

    When the pairing has no normal BAM the work is handed to
    ``_run_scalpel_caller`` and its result is returned directly. Otherwise
    ``scalpel-discovery --somatic`` is run (skipped when its database is
    already present in the work directory), the somatic and common indel VCFs
    are passed through bcftools filter expressions, concatenated, sorted and
    written to ``out_file``. The result of
    ``annotation.annotate_nongatk_vcf`` on ``out_file`` is returned.

    ``out_file`` defaults to ``<first BAM base>-paired-variants.vcf.gz``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            # Without a normal sample fall back to the non-paired caller.
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Only re-run discovery when the database is missing; a leftover
            # work directory without a database is cleared first.
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # Hard-coded switch: the scalpel-export branch below is currently
            # never taken.
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Process substitution "<(...)" in this command needs a bash-like
            # shell -- NOTE(review): confirm do.run executes with bash.
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.

    Reads the non-PASS filter values from the last column of ``bedpe_file``,
    keyed by its 7th column (the call name), and copies each into the FILTER
    column of the VCF records whose ID (the text before the first ``_``)
    matches. Header lines are copied unchanged.

    The output is ``<vcf base>-filter<ext>``. For a ``.vcf.gz`` input the
    plain VCF is written first and then handed to
    ``vcfutils.bgzip_and_index``.

    Returns the output file path.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                # Blank lines (e.g. a trailing newline) carry no call and
                # would otherwise fail on the column lookup below.
                if not line.strip():
                    continue
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        # Pass data so the transaction uses the run's configured temporary
        # directory, as the other transactions given data do.
        with file_transaction(data, nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                                line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Results go to ``<base_dir>/rtg``; the run is skipped when that directory
    already contains a ``done`` file, otherwise any previous partial output
    is removed first. Inputs that are not ``.vcf.gz`` with a ``.tbi`` index
    are bgzipped and indexed into ``base_dir``. A multi-sample ``vrn_file``
    is first subset to the current sample.

    Returns a dictionary with the expected ``tp``, ``fp`` and ``fn`` VCF
    paths inside the rtg output directory.

    Raises AssertionError when the rtg SDF reference is not found alongside
    the genome reference.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use only the file name: joining base_dir with a full (absolute)
            # path would discard base_dir and write the per-sample file next
            # to the original input instead.
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_name = dd.get_sample_name(data)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, sample_name, ext))
            vrn_file = vcfutils.select_sample(vrn_file, sample_name, sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # The rtg SDF is expected in a sibling "rtg" directory of the one
        # holding the reference fasta.
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def clean_file(in_file, data, prefix="", bedprep_dir=None, simple=None):
    """Prepare a clean sorted input BED file without headers.

    Strips ``track``, ``browser``, ``@`` and ``#`` lines, pipes the rest
    through ``bedutils.remove_bad()`` and sorts by chromosome and start. The
    output is named ``<prefix><input name>`` (``.interval_list`` replaced by
    ``.bed``, a trailing ``.gz`` dropped) inside ``bedprep_dir``, which
    defaults to ``<work dir>/bedprep``.

    Returns None for an empty ``in_file``, ``in_file`` itself when its name
    already starts with ``prefix``, and the cleaned BED path otherwise.
    """
    # Remove non-ascii characters. Used in coverage analysis, to support JSON code in one column
    # and be happy with sambamba:
    ascii_filter = "iconv -c -f utf-8 -t ascii | sed 's/ //g' |" if simple else ""
    if not in_file:
        return None
    if not bedprep_dir:
        bedprep_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "bedprep"))
    in_name = os.path.basename(in_file)
    # Avoid running multiple times with same prefix
    if prefix and in_name.startswith(prefix):
        return in_file
    out_file = os.path.join(bedprep_dir, "%s%s" % (prefix, in_name))
    out_file = out_file.replace(".interval_list", ".bed")
    if out_file.endswith(".gz"):
        out_file = out_file[:-3]
    if not utils.file_uptodate(out_file, in_file):
        check_bed_contigs(in_file, data)
        check_bed_coords(in_file, data)
        with file_transaction(data, out_file) as tx_out_file:
            template = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^@ | "
                        "grep -v ^# | {simple} "
                        "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                        "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}")
            cmd = template.format(
                cat_cmd="zcat" if in_file.endswith(".gz") else "cat",
                in_file=in_file,
                simple=ascii_filter,
                bcbio_py=sys.executable,
                sort_cmd=get_sort_cmd(os.path.dirname(tx_out_file)),
                tx_out_file=tx_out_file)
            do.run(cmd, "Prepare cleaned BED file", data)
    vcfutils.bgzip_and_index(out_file, data.get("config", {}), remove_orig=False)
    return out_file
def gatk_rnaseq_calling(data):
    """Use GATK HaplotypeCaller to call variants (gVCF) on RNA-seq data.

    Runs on the split BAM of ``data`` and writes ``<split bam base>.vcf``,
    which is then bgzipped and indexed. The calling step is skipped when the
    bgzipped output already exists.

    Returns ``data`` with its variant file set to the bgzipped output.
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".vcf"
    bgzipped_file = out_file + ".gz"
    num_cores = dd.get_num_cores(data)
    if not file_exists(bgzipped_file):
        with file_transaction(data, out_file) as tx_out_file:
            gatk_args = [
                "-T", "HaplotypeCaller",
                "-R", ref_file,
                "-I", split_bam,
                "-o", tx_out_file,
                "-nct", str(num_cores),
                "--emitRefConfidence", "GVCF",
                "--variant_index_type", "LINEAR",
                "--variant_index_parameter", "128000",
                "-dontUseSoftClippedBases",
            ]
            broad_runner.run_gatk(gatk_args)
        bgzip_and_index(out_file, dd.get_config(data))
    return dd.set_vrn_file(data, bgzipped_file)
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Steps, each skipped when its output is already present:

    1. Build ``<sample>-<caller>-prioritize.vcf.gz``: a symlink to
       ``vcf_file`` when ``_find_gene_list_from_bed`` returns a gene list,
       otherwise the output of ``bcbio-prioritize known`` restricted by
       ``prioritize_by``.
    2. Run ``simple_sv_annotation.py`` on it to produce
       ``<sample>-<caller>-simple.vcf.gz``, then sort, bgzip and index.
    3. Optionally transform the simplified VCF with ``post_prior_fn``.
    4. Convert PASS (or ".") non-reference calls to a tab delimited file
       with vawk.

    Returns a tuple of (tab delimited file, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                # Uncompressed input: link under the plain .vcf name, then
                # bgzip and index while keeping the link in place.
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    # Scale the JVM memory settings by the available cores.
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        # Supporting data files are looked up next to the annotation script.
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Fall back to the bundled panel when no gene list was derived
            # from the prioritization BED.
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            # Doubled braces are literal awk braces after str.format.
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Produce the prioritized tab delimited summary for one caller.

    Runs ``bcbio-prioritize known`` restricted by ``prioritize_by``,
    optionally post-processes the result with ``post_prior_fn``, annotates
    it with ``simple_sv_annotation.py`` and finally converts PASS (or ".")
    non-reference calls to ``<sample>-<caller>-prioritize.tsv`` with vawk.
    Nothing is done when that file already exists.

    Returns the path to the tab delimited file.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_out_file:
            prioritize_cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}".format(
                vcf_file=vcf_file, tx_out_file=tx_out_file, prioritize_by=prioritize_by)
            do.run(prioritize_cmd, "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_out_file:
            # Add gene annotation only when a transcript BED is available.
            ann_opt = ""
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            if transcript_file:
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            annotate_cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}".format(
                ann_opt=ann_opt, priority_vcf=priority_vcf, tx_out_file=tx_out_file)
            do.run(annotate_cmd, "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        # Doubled braces are literal awk braces after str.format.
        tsv_template = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                        """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                        "print CALLER,SNAME,$1,$2,I$END,"
                        """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                        "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                        "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        tsv_cmd = tsv_template.format(simple_vcf=simple_vcf, sample=sample, caller=caller,
                                      tx_out_file=tx_out_file)
        do.run(tsv_cmd, "Prioritize: convert to tab delimited")
    return out_file
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call tumor/normal variants with Sentieon's TNhaplotyper (MuTect2 like).

    The raw Sentieon output is rewritten so the ECNT, HCNT, NLOD, TLOD and
    PON header definitions are typed as String, then bgzipped and indexed
    inside the transaction.

    ``out_file`` defaults to ``<first BAM base>-variants.vcf.gz``; nothing is
    run when it already exists. Returns the output file path.

    Raises AssertionError when the pairing has no normal BAM.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
    interval = _get_interval(variant_regions, region, out_file, items)
    with file_transaction(items[0], out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
        dbsnp_opt = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
        cosmic_opt = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
        license_prefix = license_export(items[0])
        tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
        cores = dd.get_num_cores(items[0])
        call_template = ("{license}sentieon driver -t {cores} -r {ref_file} "
                         "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                         "--algo TNhaplotyper "
                         "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                         "{dbsnp} {cosmic} {tx_orig_file}")
        call_cmd = call_template.format(license=license_prefix, cores=cores, ref_file=ref_file,
                                        paired=paired, interval=interval, dbsnp=dbsnp_opt,
                                        cosmic=cosmic_opt, tx_orig_file=tx_orig_file)
        do.run(call_cmd, "Sentieon TNhaplotyper")
        header_template = ("gunzip -c {tx_orig_file} | "
                           "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                           "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                           "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                           "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                           "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                           "bgzip -c > {tx_out_file}")
        header_cmd = header_template.format(tx_orig_file=tx_orig_file, tx_out_file=tx_out_file)
        do.run(header_cmd, "Sentieon TNhaplotyper: make headers GATK compatible")
        vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis:
    the BAM is the aligned BAM for a single item, or the tumor BAM of the
    pairing for multiple items.

    Returns ``<in_file base>-duphold.vcf.gz`` when a BAM is available,
    otherwise ``in_file`` unchanged.
    """
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        bam_file = paired.tumor_bam if paired else None
    if not bam_file:
        return in_file

    out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            if not in_file.endswith(".gz"):
                # Compressed copy goes into the transaction directory and the
                # original input is kept.
                in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False,
                                                   out_dir=os.path.dirname(tx_out_file))
            ref_file = dd.get_ref_file(items[0])
            # cores for BAM reader thread, so max out at 4 based on recommendations
            cores = min([dd.get_num_cores(items[0]), 4])
            duphold_cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                           "-o {tx_out_file}").format(cores=cores, in_file=in_file, bam_file=bam_file,
                                                      ref_file=ref_file, tx_out_file=tx_out_file)
            do.run(duphold_cmd, "Annotate SV depth with duphold")
    vcfutils.bgzip_and_index(out_file)
    return out_file
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, config):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.

    The input is bgzipped and indexed first. When the configured runner has
    no GATK available that prepared input is returned as is. Otherwise GATK
    VariantAnnotator is run, restricted to the positions in the input file,
    and the result is written to ``<orig base>-gatkann<ext>``.

    ``dbsnp_file`` may be None or empty, in which case dbSNP annotation is
    skipped and only the GATK annotations are added.

    Returns the path to the annotated (or merely prepared) VCF.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    broad_runner = broad.runner_from_config(config)
    if not broad_runner.has_gatk():
        return orig_file
    out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            idx_file = orig_file + ".idx"
            if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                os.remove(idx_file)
            params = ["-T", "VariantAnnotator",
                      "-R", ref_file,
                      "--variant", orig_file]
            # Callers may pass a missing dbSNP resource through as None;
            # handing that to GATK would fail the whole annotation.
            if dbsnp_file:
                params += ["--dbsnp", dbsnp_file]
            params += ["--out", tx_out_file,
                       "-L", orig_file]
            for bam_file in bam_files:
                params += ["-I", bam_file]
            for annotation_name in get_gatk_annotations(config):
                params += ["-A", annotation_name]
            broad_runner.run_gatk(params, memory_retry=True)
    vcfutils.bgzip_and_index(out_file, config)
    return out_file
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each sample's variant files into
    ``<work dir>/variants/<calls|gvcf>/<name>-<caller>.vcf.gz`` and collects
    the resulting paths. When a sample has a joint-called file, its
    ``vrn_file`` is reported as a gVCF under the sample name and
    ``vrn_file_joint`` as the calls under the batch name; otherwise
    ``vrn_file`` is reported as the calls under the batch name (the sample
    name when there is no batch).

    Returns a single-item list with a dictionary holding ``validate`` (taken
    from the first item) and ``variants`` (``calls`` and ``gvcf`` lists).
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    # Track outputs per category: a gVCF and a joint call set can legitimately
    # share "<name>-<caller>" when the batch name equals the sample name, and
    # they land in different directories, so neither should hide the other.
    added = set()
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is not None:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            if (out_key, cur_name) in added:
                continue
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            added.add((out_key, cur_name))
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    return [out]
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.

    Records carrying any filter other than PASS, "." or REJECT are dropped
    unless ``bubbletree.is_info_germline`` flags them. REJECT or germline
    records are kept only when ``bubbletree._is_possible_loh`` returns
    statistics, in which case they are reset to PASS and tagged with the DB
    INFO flag. All remaining records are written unchanged.

    Returns ``<in_file base>-prfilter.vcf.gz``, placed in ``out_dir`` when
    given.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            reader = _add_db_to_header(reader)
            with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    filters = rec.FILTER.split(";") if rec.FILTER else []
                    other_filters = [x for x in filters if x not in ["PASS", ".", "REJECT"]]
                    if len(other_filters) != 0 and not bubbletree.is_info_germline(rec):
                        continue
                    if "REJECT" in filters or bubbletree.is_info_germline(rec):
                        # Germline, check if we should include based on frequencies
                        stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
                                                            use_status=True, max_normal_depth=max_depth)
                        if stats:
                            rec.FILTER = "PASS"
                            rec.INFO["DB"] = True
                            writer.write_record(rec)
                    else:
                        # Somatic, always include
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
def run_vep(data):
    """Annotate the input VCF with Ensembl variant effect predictor.

    Reads ``data["vrn_file"]`` (must be gzipped) and writes
    ``<stem>-vepeffects``. VEP only runs when ``prep_vep_cache`` provides a
    cache directory.

    Returns the indexed output path, or None when no output was produced.
    """
    vrn_file = data["vrn_file"]
    out_file = utils.append_stem(vrn_file, "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                vep_args = [vep, "--vcf", "-o", "stdout"]
                vep_args += fork_args
                vep_args += extra_args
                vep_args += ["--species", ensembl_name,
                             "--no_stats",
                             "--cache", "--offline", "--dir", vep_dir,
                             "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype",
                             "--total_length", "--canonical", "--ccds",
                             "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)]
                vep_args += dbnsfp_args
                vep_args += loftee_args
                pipeline = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(vep_args),
                                                                 tx_out_file)
                do.run(pipeline, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                        extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.

    Records matching ``expression`` get the soft filter ``name`` added.
    With ``limit_regions`` set to "variant_regions" and variant regions
    configured in ``data``, filtering is restricted to those regions.
    ``extra_cmd`` is appended to the bcftools command before any
    compression. An input without variants is copied through unchanged.

    The output is ``<base>-filter<filterext><ext>``; ``.vcf.gz`` outputs are
    bgzipped and indexed.

    Returns the output file path.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                # Copy through the transaction file: writing straight to
                # out_file would leave a partial file behind on interruption
                # that later runs treat as finished output.
                shutil.copy(vcf_file, tx_out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Apply the filters from a pre-filtered bedpe file to a VCF.

    Non-PASS values from the last bedpe column, keyed by the 7th column, are
    written into the FILTER column of VCF records whose ID (text before the
    first ``_``) matches. Also repairs problem calls with a missing
    reference allele by setting it to ``N``.

    Returns ``<vcf base>-filter<ext>``; for ``.vcf.gz`` inputs the plain VCF
    is written and then bgzipped and indexed.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if utils.file_exists(out_file):
        return out_file

    filters = {}
    with open(bedpe_file) as bedpe_handle:
        for bedpe_line in bedpe_handle:
            fields = bedpe_line.split("\t")
            name = fields[6]
            status = fields[-1].strip()
            if status != "PASS":
                filters[name] = status

    with file_transaction(data, nogzip_out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            with utils.open_gzipsafe(vcf_file) as vcf_handle:
                for vcf_line in vcf_handle:
                    if vcf_line.startswith("#"):
                        out_handle.write(vcf_line)
                        continue
                    fields = vcf_line.split("\t")
                    # Problem breakends can have empty alleles when at contig ends
                    if not fields[3].strip():
                        fields[3] = "N"
                    call_id = fields[2].split("_")[0]
                    status = filters.get(call_id, "PASS")
                    if status != "PASS":
                        fields[6] = status
                    out_handle.write("\t".join(fields))
    if out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
def combine_calls(batch_id, samples, data):
    """Merge the callsets of a batch into a final ensemble call set.

    When at least one input has variants, runs either the intersection
    approach (no ``classifiers`` in the ensemble configuration) or the
    classifier-based ensemble, bgzips and indexes the result and attaches
    validation output from ``validate.compare_to_rm``. Otherwise an empty
    ensemble VCF is written.

    Returns ``[[batch_id, callinfo]]``.
    """
    caller_list = ",".join(x["variantcaller"] for x in samples[0]["variants"])
    logger.info("Ensemble consensus calls for {0}: {1}".format(batch_id, caller_list))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)

    if not any(vcfutils.vcf_has_variants(f) for f in vrn_files):
        empty_vcf = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(empty_vcf)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(empty_vcf, data["config"]),
                    "bed_file": None}
        return [[batch_id, callinfo]]

    if "classifiers" in edata["config"]["algorithm"]["ensemble"]:
        config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
        callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                 edata["sam_ref"], edata)
    else:
        callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
    callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
    # Validate the ensemble result as if it came from a caller named "ensemble".
    edata["config"]["algorithm"]["variantcaller"] = "ensemble"
    edata["vrn_file"] = callinfo["vrn_file"]
    edata["ensemble_bed"] = callinfo["bed_file"]
    callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    return [[batch_id, callinfo]]
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Returns None immediately when ``in_file`` has no variants. Otherwise
    writes ``<in_file stem>-vepeffects`` (both must be gzipped). VEP only
    runs when ``prep_vep_cache`` provides a cache directory. Human genomes
    get the loftee plugin, SIFT/PolyPhen predictions and -- when a compressed
    reference is available -- HGVS output; splice site plugins are added when
    ``vep_splicesite_annotations`` is enabled in tools_on.

    Returns the indexed output path, or None when no output was produced.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    # Each plugin function returns the command line arguments
                    # to append for that plugin.
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                # Report a single consequence per allele for canonical
                # transcript or clinical reporting setups.
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                # Merged caches are flagged via --merged, with the suffix
                # stripped from the species name.
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                       "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    # No output exists when the VEP cache was unavailable; None is then
    # returned implicitly.
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def _filter_nonref(in_file, data):
    """Drop lines mentioning NON_REF from GATK VCF output.

    These gVCF items occasionally sneak through in joint calling. Every line
    containing ``NON_REF`` is removed with ``grep -v``.

    Returns ``<in_file base>-gatkclean<ext>``, bgzipped and indexed.
    """
    out_file = "%s-gatkclean%s" % utils.splitext_plus(in_file)
    needs_run = not utils.file_exists(out_file)
    if needs_run:
        with file_transaction(data, out_file) as tx_out_file:
            clean_cmd = "gunzip -c {in_file} | grep -v NON_REF | bgzip -c > {tx_out_file}".format(
                in_file=in_file, tx_out_file=tx_out_file)
            do.run(clean_cmd, "Remove stray NON_REF gVCF information from VCF output", data)
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def add_dbsnp(orig_file, dbsnp_file, config):
    """Fill the VCF ID column from dbSNP using ``bcftools annotate``.

    The input is bgzipped and indexed first; the annotated output is
    ``<orig base>-wdbsnp.vcf.gz`` and is regenerated only when out of date
    relative to the prepared input.

    Returns the bgzipped and indexed output path.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if utils.file_uptodate(out_file, orig_file):
        return vcfutils.bgzip_and_index(out_file, config)
    with file_transaction(config, out_file) as tx_out_file:
        annotate_cmd = "bcftools annotate -c ID -a {dbsnp_file} -o {tx_out_file} -O z {orig_file}".format(
            dbsnp_file=dbsnp_file, tx_out_file=tx_out_file, orig_file=orig_file)
        do.run(annotate_cmd, "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, config)
def gatk_joint_calling(data, vrn_files, ref_file):
    """Jointly genotype gVCFs and filter the result for RNA-seq.

    Works on fixed paths relative to the current directory:
    ``variation/joint.vcf(.gz)`` for the genotyped calls and
    ``variation/combined.vcf(.gz)`` for the filtered output. Nothing is run
    when the bgzipped combined file already exists.

    Returns the path to the bgzipped combined VCF.
    """
    variation_dir = "variation"
    joint_file = os.path.join(variation_dir, "joint.vcf")
    out_file = os.path.join(variation_dir, "combined.vcf")
    bgzjoint_file = os.path.join(variation_dir, "joint.vcf.gz")
    bgzout_file = os.path.join(variation_dir, "combined.vcf.gz")
    if file_exists(bgzout_file):
        return bgzout_file
    joint_file = _run_genotype_gvcfs(data, vrn_files, ref_file, joint_file)
    bgzip_and_index(joint_file, dd.get_config(data))
    out_file = gatk_filter_rnaseq(data, bgzjoint_file, out_file)
    bgzip_and_index(out_file, dd.get_config(data))
    return bgzout_file
def clean_titration():
    """Subset to interval regions and bgzip/tabix.

    For every ``NA1287*.vcf`` in the module-level ``in_vcf_dir``, restricts
    the calls to ``Intervals_TSAVP_Titr.bed`` (from ``in_region_dir``), drops
    ``##contig`` header lines, strips a leading ``chr`` from each line and
    writes ``<name>.vcf.gz`` into ``out_vcf_dir``. Existing outputs are only
    re-indexed.
    """
    region_bed = os.path.join(in_region_dir, "Intervals_TSAVP_Titr.bed")
    for in_vcf in glob.glob(os.path.join(in_vcf_dir, "NA1287*.vcf")):
        out_vcf = os.path.join(out_vcf_dir, "%s.gz" % os.path.basename(in_vcf))
        if not os.path.exists(out_vcf):
            # Write to a temporary name and rename on success, so a failed or
            # interrupted pipeline cannot leave a partial file that the
            # existence check above would accept on the next run.
            tmp_vcf = "%s.tmp" % out_vcf
            cmd = ("bcftools view {in_vcf} -T {region_bed} | grep -v '##contig' | "
                   "sed 's/^chr//g' | bgzip -c > {tmp_vcf}")
            subprocess.check_call(cmd.format(in_vcf=in_vcf, region_bed=region_bed, tmp_vcf=tmp_vcf),
                                  shell=True)
            os.rename(tmp_vcf, out_vcf)
        vcfutils.bgzip_and_index(out_vcf)
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.

    Accepts two calling conventions:

    - three positional arguments ``(batch_id, samples, data)``; callers and
      variant files are looked up with ``_organize_variants``.
    - any other number of arguments, treated as per-sample inputs (the
      ``is_cwl`` path); batch id, caller names and call files are read from
      the first sample.

    Returns ``[[batch_id, callinfo]]`` for the three-argument form and
    ``[{"ensemble": callinfo}]`` for the CWL form.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    # Work on a deep copy so the config edits below do not leak into ``data``
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True,
                                         nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        # Without configured classifiers use the intersection approach,
        # otherwise run the config-file driven ensemble method
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
        callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        # Present the ensemble output as a regular callset for validation
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        # No caller produced variants: emit an empty VCF for the batch
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
def clean_file(in_file, data, prefix=""):
    """Prepare a clean sorted input BED file without headers.

    Removes ``track`` and ``browser`` lines and sorts by chromosome and start
    position. The result is written to ``<work>/bedprep/<prefix><basename>``
    and additionally bgzipped/indexed, keeping the uncompressed original.

    Returns the cleaned BED path, or None when ``in_file`` is empty.
    """
    if in_file:
        bedprep_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "bedprep"))
        out_file = os.path.join(bedprep_dir, "%s%s" % (prefix, os.path.basename(in_file)))
        if not utils.file_exists(out_file):
            # Pass ``data`` so the transaction can use the configured temporary
            # directory, matching the other transactional helpers in this file
            with file_transaction(data, out_file) as tx_out_file:
                cmd = "grep -v ^track {in_file} | grep -v ^browser | sort -k1,1 -k2,2n > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare cleaned BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
    return None
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl variant effect predictor.

    Returns None when the input has no variants. Otherwise runs VEP (when a
    cache directory is available) into a ``-vepeffects`` output and returns
    that path if the file was produced.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                report_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                                 "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE",
                                 "CANONICAL", "CCDS"] + dbnsfp_fields + loftee_fields
                resources = config_utils.get_resources("vep", data["config"])
                vep_cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                if cores > 1:
                    vep_cmd += ["--fork", str(cores)]
                vep_cmd += [str(x) for x in resources.get("options", [])]
                vep_cmd += ["--species", ensembl_name,
                            "--no_stats",
                            "--cache", "--offline", "--dir", vep_dir,
                            "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype",
                            "--total_length", "--canonical", "--ccds",
                            "--fields", ",".join(report_fields)]
                vep_cmd += dbnsfp_args + loftee_args
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # Clinical reporting needs exactly one consequence per
                    # variant; --pick makes VEP choose a single line of
                    # consequence data per variant.
                    vep_cmd += ["--pick"]
                # TODO investigate hgvs reporting; it requires indexing the reference file
                perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib()
                # Remove empty fields (';;') which can cause parsing errors downstream
                full_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perllib, " ".join(vep_cmd), tx_out_file)
                do.run(full_cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl variant effect predictor.

    Human genomes additionally get dbNSFP, LOFTEE, SIFT and PolyPhen
    annotations; clinical reporting adds ``--pick`` and HGVS output.

    Returns None when the input has no variants, otherwise the
    ``-vepeffects`` output path if it was produced.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                plugin_args, plugin_fields = [], []
                prediction_args, prediction_fields = [], []
                if tz.get_in(["genome_resources", "aliases", "human"], data, False):
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    plugin_args = dbnsfp_args + loftee_args
                    plugin_fields = dbnsfp_fields + loftee_fields
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                clinical_args, clinical_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # Also use hgvs reporting but requires indexing the reference file
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1",
                                     "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                report_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                                  "Feature", "EXON"]
                                 + prediction_fields
                                 + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                                 + plugin_fields + clinical_fields)
                resources = config_utils.get_resources("vep", data["config"])
                vep_cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                if cores > 1:
                    vep_cmd += ["--fork", str(cores)]
                vep_cmd += [str(x) for x in resources.get("options", [])]
                vep_cmd += ["--species", ensembl_name,
                            "--no_stats",
                            "--cache", "--offline", "--dir", vep_dir,
                            "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                            "--gene_phenotype", "--ccds",
                            "--fields", ",".join(report_fields)]
                vep_cmd += prediction_args + plugin_args + clinical_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                full_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cmd), tx_out_file)
                do.run(full_cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data, out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK annotations.

    When no GATK runner is available, or the runner is GATK4, only dbSNP
    annotation is applied (if ``dbsnp_file`` is given) and the input is
    otherwise returned unchanged. With GATK3, VariantAnnotator is run over
    the supplied BAM files.

    Returns the path to the annotated (or original) VCF.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    broad_runner = broad.runner_from_config_safe(data["config"])
    can_annotate = (broad_runner and broad_runner.has_gatk()
                    and broad_runner.gatk_type() != "gatk4")
    if not can_annotate:
        if dbsnp_file:
            return add_dbsnp(orig_file, dbsnp_file, data, out_file)
        return orig_file

    if out_file is None:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            idx_file = orig_file + ".idx"
            if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                os.remove(idx_file)
            annotations = get_gatk_annotations(data["config"], include_depth=False)
            params = ["-T", "VariantAnnotator",
                      "-R", ref_file,
                      "--variant", orig_file,
                      "--out", tx_out_file,
                      "-L", orig_file]
            if dbsnp_file:
                params.extend(["--dbsnp", dbsnp_file])
            for bam_file in bam_files:
                params.extend(["-I", bam_file])
            for ann in annotations:
                params.extend(["-A", ann])
            misencoded_flags = ("--allow_potentially_misencoded_quality_scores",
                                "-allowPotentiallyMisencodedQuals")
            if not any(flag in params for flag in misencoded_flags):
                params.append("--allow_potentially_misencoded_quality_scores")
            # be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq)
            # start by removing existing -U or --unsafe opts
            # (if another option is added to Gatk that starts with -U... this may create a bug)
            for unsafe_opt in [p for p in params if p.startswith(("-U", "--unsafe"))]:
                position = params.index(unsafe_opt)
                # a bare flag carries its value as the following element
                if unsafe_opt.strip() in ("-U", "--unsafe"):
                    params.pop(position + 1)
                params.pop(position)
            params.extend(["-U", "ALL"])
            broad_runner = broad.runner_from_config(data["config"])
            broad_runner.run_gatk(params)
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _varscan_version_tuple(version):
    """Convert a version string such as 'v2.3.6' into a tuple of ints.

    Parsing stops at the first component without leading digits; an empty or
    missing version yields an empty tuple, which sorts below any real version.
    """
    numbers = []
    for piece in str(version or "").lstrip("vV").split("."):
        digits = ""
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes an uncompressed VCF (``out_file`` with ``.vcf.gz`` replaced by
    ``.vcf``) and bgzips/indexes it when the requested output ends in ``.gz``.

    Raises IOError when the installed VarScan is older than 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare numerically: as plain strings "v2.3.10" sorts before "v2.3.6"
    if _varscan_version_tuple(version) < (2, 3, 6):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zerocoverage calls; strip these with grep, we're not going to
    # call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   " --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Returns None when ``in_file`` has no variants. Otherwise writes a
    ``-vepeffects`` output (skipped when it already exists) and returns its
    path if the file was produced.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            # VEP is only run when a cache directory is available
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    # Map configured plugin names onto the functions returning
                    # their (arguments, fields); defaults to dbnsfp and loftee
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee, "dbscsnv": _get_dbscsnv,
                                  "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data, ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                # Restrict to a single consequence per variant for canonical
                # transcript or clinical reporting setups
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Produces ``<sample>-<caller>-simple.vcf.gz`` (simplified annotations) and
    ``<sample>-<caller>-prioritize.tsv`` in ``work_dir``.

    ``post_prior_fn``, when given, is called as
    ``post_prior_fn(priority_vcf, work_dir, data)`` and its return value
    replaces the prioritized VCF.

    Returns a tuple ``(out_file, simple_vcf)``.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        if gene_list:
            priority_vcf = os.path.join(work_dir, os.path.basename(vcf_file))
            utils.symlink_plus(vcf_file, priority_vcf)
        # otherwise prioritize based on BED and proceed
        else:
            priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]))
                    export = utils.local_path_export()
                    # NOTE: the command is filled from local variable names via
                    # locals(); renaming locals here changes the command
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

        # Supporting data files are looked up next to the installed script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            # Fall back to the bundled cancer panel without a custom gene list
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export()
            # Doubled braces are literal braces for str.format; the vawk
            # expression keeps records whose FILTER is PASS or "." and whose
            # sample genotype is not 0/0
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    """Merge multiple gVCF files with GATK CombineGVCFs.

    ``region``, when provided, restricts the run via ``-L``. The GATK call is
    skipped when ``out_file`` already exists.

    Returns the bgzipped, indexed output path.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file]
            if region:
                params.extend(["-L", bamprep.region_to_gatk(region)])
            for vrn_file in vrn_files:
                params.extend(["--variant", vrn_file])
            cores = dd.get_cores(data)
            # Scale memory with the available cores for multicore runs
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _select_sample(data, variant_file, work_dir):
    """Extract the current sample from the original call file.

    For germline phenotypes the sample name in the input is fixed first. The
    subset is written to ``work_dir`` as ``<input base>-<sample>.vcf.gz``.

    Returns the bgzipped, indexed single-sample VCF.
    """
    sample_name = dd.get_sample_name(data)
    if dd.get_phenotype(data) == "germline":
        variant_file = germline.fix_germline_samplename(variant_file, sample_name, data)
    base_name = utils.splitext_plus(os.path.basename(variant_file))[0]
    out_file = os.path.join(work_dir, "%s-%s.vcf.gz" % (base_name, sample_name))
    if not utils.file_uptodate(out_file, variant_file):
        with file_transaction(data, out_file) as tx_out_file:
            template = "bcftools view -s {sample_name} -O z -o {tx_out_file} {variant_file}"
            do.run(template.format(sample_name=sample_name, tx_out_file=tx_out_file,
                                   variant_file=variant_file),
                   "Run manta SV analysis")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def to_single(in_file, data):
    """Split multi-allelic records of a VCF into single-allele records.

    Inputs without variants are symlinked to the ``-nomultiallelic`` name.
    Otherwise the file is decomposed and effects are re-annotated; the
    annotated file is used when annotation produced one.

    Returns the bgzipped, indexed result.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if not vcfutils.vcf_has_variants(in_file):
            utils.symlink_plus(in_file, out_file)
        else:
            decomposed_file = _decompose(in_file, data)
            annotated_file, _ = effects.add_to_vcf(decomposed_file, data)
            out_file = annotated_file if annotated_file else decomposed_file
    return vcfutils.bgzip_and_index(out_file, data["config"])
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.

    in_file -- input BED; a falsy value returns None.
    data -- sample dictionary providing ``config`` and, optionally,
        ``dirs -> work``.
    distance -- optional value passed to ``bedtools merge -d``.
    out_dir -- output directory. Defaults to ``<work>/bedprep`` when a work
        directory is configured, otherwise the directory of ``in_file``.

    Returns the merged BED path (a bgzipped, indexed copy is also created).
    """
    config = data["config"]
    if in_file:
        bedtools = config_utils.get_program("bedtools", config,
                                            default="bedtools")
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        # Rebuild when the input is newer than an existing merged file so a
        # changed BED does not keep using stale merged regions
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
    return None
def finalize_vcf(in_file, variantcaller, items):
    """Clean up and annotate the final VCF.

    Pipes the input through the header/sample annotation command, when one is
    available, into an ``-annotated`` output.

    Returns the bgzipped, indexed annotated file, or ``in_file`` when no
    annotated output exists.
    """
    first_item = items[0]
    out_file = "%s-annotated%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(first_item, out_file) as tx_out_file:
            cl = _add_vcf_header_sample_cl(in_file, items, out_file)
            if cl:
                do.run("{cl} | bgzip -c > {tx_out_file}".format(cl=cl, tx_out_file=tx_out_file),
                       "Annotate")
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, first_item["config"])
    return in_file
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    """Configure and run the germline workflow, collecting its VCF output.

    When no workflow file can be configured an empty VCF is written instead.
    The result (genome VCF when gVCF output is wanted, otherwise the variants
    VCF) is copied from the workflow results directory to ``out_file``.

    Returns the bgzipped, indexed ``out_file``.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region,
                                                out_file, tx_work_dir)
            if not workflow_file:
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(out_file, items[0]["config"], sample_names)
            else:
                _run_workflow(items[0], workflow_file, tx_work_dir)
        if joint.want_gvcf(items):
            raw_name = "genome.vcf.gz"
        else:
            raw_name = "variants.vcf.gz"
        raw_file = os.path.join(work_dir, "results", "variants", raw_name)
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _apply_priority_filter(in_file, priority_file, data):
    """Add external priority annotations (INFO/EPR) and optionally filter.

    The EPR header line is written to a temporary header file and passed to
    ``bcftools annotate``. When ``tumoronly_germline_filter`` is enabled in
    ``tools_on``, records whose EPR is not ``pass`` get a ``LowPriority``
    soft filter.

    Returns the ``-priority`` output path.
    """
    out_file = "%s-priority%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-repeatheader.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.write('##INFO=<ID=EPR,Number=.,Type=String,'
                                 'Description="Somatic prioritization based on external annotations, '
                                 'identify as likely germline">')
            filter_cmd = ""
            if "tumoronly_germline_filter" in dd.get_tools_on(data):
                filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
                              """-e "EPR[*] != 'pass'" |""")
            template = ("bcftools annotate -a {priority_file} -h {header_file} "
                        "-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
                        "{filter_cmd} bgzip -c > {tx_out_file}")
            do.run(template.format(priority_file=priority_file, header_file=header_file,
                                   in_file=in_file, filter_cmd=filter_cmd,
                                   tx_out_file=tx_out_file),
                   "Run external annotation based prioritization filtering")
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _cnn_score_variants(in_file, tensor_type, data):
    """Score variants with GATK4 CNNScoreVariants.

    ``tensor_type`` is passed through as ``--tensor-type``. Asserts that the
    configured GATK runner is GATK4.

    Returns the bgzipped, indexed ``-cnnscore.vcf.gz`` output.
    """
    out_file = "%s-cnnscore.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CNNScoreVariants",
                      "--variant", in_file,
                      "--reference", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "--input", dd.get_align_bam(data),
                      "--tensor-type", tensor_type]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Call germline variants with DeepVariant.

    ``region`` can be a single region or a list of regions for multicore
    calling. Only unpaired, single-sample inputs are supported; both
    conditions are asserted.

    Returns the bgzipped, indexed output VCF.
    """
    def _sample_names():
        # Only evaluated when an assertion fails, to build its message
        return ", ".join([dd.get_sample_name(d) for d in items])

    assert not vcfutils.is_paired_analysis(align_bams, items), \
        ("DeepVariant currently only supports germline calling: %s" % _sample_names())
    assert len(items) == 1, \
        ("DeepVariant currently only supports single sample calling: %s" % _sample_names())
    sample = items[0]
    out_file = _run_germline(align_bams[0], sample, ref_file, region, out_file)
    return vcfutils.bgzip_and_index(out_file, sample["config"])
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Build a GATK4 GenomicsDB workspace from the input variant files.

    The workspace is written directly to ``<out_file base>_genomicsdb``
    rather than a transactional directory, since GenomicsDB databases cannot
    be moved to new locations. Half-finished databases are detected and
    rebuilt:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    The workspace path is passed relative to the output directory because of
    a known issue: GenomicsDB core dumps on longer workspace paths
    (std::string::compare(char const*)).

    Returns the workspace directory.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if os.path.exists(out_dir) and not _incomplete_genomicsdb(out_dir):
        return out_dir
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    with utils.chdir(os.path.dirname(out_file)):
        with file_transaction(data, out_dir):
            broad_runner = broad.runner_from_config(data["config"])
            cores = dd.get_cores(data)
            params = ["-T", "GenomicsDBImport",
                      "--reader-threads", str(cores),
                      "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                vcfutils.bgzip_and_index(vrn_file, data["config"])
            params.extend(["--sample-name-map", _create_samplemap_file(vrn_files)])
            # For large inputs, reduce memory usage by batching
            # https://github.com/bcbio/bcbio-nextgen/issues/2852
            if len(vrn_files) > 200:
                params.extend(["--batch-size", "50"])
            if cores > 1:
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.

    ``out_file`` must end in ``.vcf.gz``. User options from the ``platypus``
    resources are appended to the command; ``key=value`` options are split
    into separate arguments.

    Returns the bgzipped, indexed output VCF.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-", "--logFileName", "/dev/null",
                   "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        # Split on the first "=" only so values that contain
                        # "=" themselves do not break the unpacking
                        key, val = opt.split("=", 1)
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            # Currently not used after doing more cross validation as they increase false positives
            # which seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(),
                                                                   vcfutils.fix_ambiguous_cl(5),
                                                                   vcfutils.add_contig_to_header_cl(items[0]),
                                                                   tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
    out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def variants(data):
    """Summarize GC content and depth at called variants for QC reporting.

    Returns ``data`` unchanged in all cases. Nothing is done without a
    ``vrn_file`` or coverage configuration, or when the QC summary already
    exists. Work happens inside ``<work>/report/variants``.
    """
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() works on both Python 2 and 3, unlike the
                        # Python 2 only ``print >> handle`` statement
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This files will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
    return data
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+. With GATK4
    the raw calls are written to a file and passed through the MuTect2
    filtering command; with GATK3 the raw calls are captured from standard
    output. Raw calls then go through allele-frequency filtering.

    Returns the bgzipped, indexed output VCF.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        is_gatk4 = gatk_type == "gatk4"
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "Mutect2" if is_gatk4 else "MuTect2",
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            params.extend(["--reference" if is_gatk4 else "-R", ref_file])
            for ann in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                params.extend(["--annotation", ann])
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if is_gatk4:
                params.extend(["--read-validation-stringency", "LENIENT"])
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # dbSNP/Cosmic inputs are deliberately not added so they do not get fed
            # to the variant filtering algorithm; not yet clear how this helps or
            # hurts in a general case.
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params.extend(str(x) for x in resources.get("options", []))
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            if is_gatk4:
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file)
                full_cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}".format(
                    gatk_cmd=gatk_cmd, tx_raw_prefilt_file=tx_raw_prefilt_file,
                    filter_cmd=filter_cmd)
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                full_cmd = "{gatk_cmd} > {tx_raw_file}".format(
                    gatk_cmd=gatk_cmd, tx_raw_file=tx_raw_file)
            do.run(full_cmd, "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _rename_fa_in_line(line):
    """Rename the FA FORMAT key to FREQ within a single VCF line.

    Only the ``##FORMAT=<ID=FA,...>`` header definition and the FORMAT column
    of data lines are changed; all other text is returned untouched.
    """
    if line.startswith("##FORMAT=<ID=FA,"):
        return line.replace("<ID=FA,", "<ID=FREQ,", 1)
    if line.startswith("#"):
        return line
    fields = line.split("\t")
    if len(fields) > 8:
        format_col = fields[8]
        # Keep a trailing newline in case FORMAT is the last column
        keys_text = format_col.rstrip("\r\n")
        line_end = format_col[len(keys_text):]
        keys = ["FREQ" if key == "FA" else key for key in keys_text.split(":")]
        fields[8] = ":".join(keys) + line_end
    return "\t".join(fields)


def _rename_allelic_fraction_field(orig_file, config, out_file):
    """Rename allelic fraction field in mutect output
       from FA to FREQ to standardize with other tools.

    The rename is restricted to the FORMAT header definition and the FORMAT
    column key, so "FA" appearing inside other columns (contig names,
    annotations) is left untouched.

    Writes the uncompressed VCF (``out_file`` with ``.vcf.gz`` replaced by
    ``.vcf``) and returns the bgzipped, indexed result.
    """
    out_file_noc = out_file.replace(".vcf.gz", ".vcf")
    with file_transaction(config, out_file_noc) as tx_out_file:
        with open_gzipsafe(orig_file) as in_handle:
            with open(tx_out_file, 'w') as out_handle:
                for line in in_handle:
                    out_handle.write(_rename_fa_in_line(line))
    return bgzip_and_index(out_file_noc, config)
def _get_ploidy(regions, items, base_file):
    """Write a VCF with one <CNV> record per region giving each sample's ploidy.

    The output is named after ``base_file`` with a ``-ploidy.vcf`` suffix and
    is only written when neither it nor its ``.gz`` counterpart exists. Each
    record carries ``END`` in INFO and a ``CN`` FORMAT value per item, taken
    from ``ploidy.get_ploidy``. Returns the result of
    ``vcfutils.bgzip_and_index`` on the output file.
    """
    sample_names = [dd.get_sample_name(d) for d in items]
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        header_lines = [
            "##fileformat=VCFv4.1\n",
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n',
            '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n',
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(sample_names) + "\n",
        ]
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as out_handle:
                for header_line in header_lines:
                    out_handle.write(header_line)
                for region in regions:
                    copy_numbers = [str(ploidy.get_ploidy([d], region)) for d in items]
                    fixed_cols = [region[0], str(region[1]), ".", "N", "<CNV>",
                                  ".", ".", "END=%s" % region[2], "CN"]
                    out_handle.write("\t".join(fixed_cols + copy_numbers) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _filter_vcf(orig_file, ftype="max", name="ColorCustom"):
    """Filter VCF with bcftools, providing count summary of items removed.

    ``ftype`` selects one of the predefined exclusion expressions and
    ``name`` is the soft-filter label applied to matching records. The
    filtered file is written next to ``orig_file`` with a ``-filter<ftype>``
    suffix and is reused if it already exists.

    Returns ``(out_file, stats)`` where ``stats`` has the number of PASS/"."
    records in the original (``orig``) and filtered (``final``) files, and
    ``pct``, the percentage of originally passing records that still pass.
    ``pct`` is 0.0 when the original has no passing records.
    """
    low_support = ('SUM(AD[*]) < 15 || '
                   'PL[0] / SUM(AD[*]) <= 3.0')
    exprs = {
        "max": ('SUM(AD[*]) < 15 || '
                'PL[0] / SUM(AD[*]) <= 3.0 || '
                'GC < 20.0 || GC > 77.0 || '
                'RPT[*] = "rmsk" || '
                'RPT[*] = "lcr"'),
        "min2": ('SUM(AD[*]) < 15 || '
                 'PL[0] / SUM(AD[*]) <= 3.0 || '
                 'RPT[*] = "lcr"'),
        "min1": ('SUM(AD[*]) < 15 || '
                 'PL[0] / SUM(AD[*]) <= 3.0 || '
                 '(RPT[*] = "lcr" && RPT[*] = "rmsk")'),
        "min0": low_support,
        "all": 'GC < 1.0',
    }
    expr = exprs[ftype]
    base, ext = utils.splitext_plus(orig_file)
    out_file = "%s-filter%s%s" % (base, ftype, ext)
    if not utils.file_exists(out_file):
        with file_transaction({}, out_file) as tx_out_file:
            cmd = ("bcftools filter -O z -o {tx_out_file} "
                   "-m '+' -e '{expr}' -s '{name}' {orig_file}")
            do.run(cmd.format(tx_out_file=tx_out_file, expr=expr, name=name,
                              orig_file=orig_file),
                   "Hard filter VCF")
        vcfutils.bgzip_and_index(out_file, {})

    def count(f):
        # gzip.open defaults to binary mode, so compare against bytes; this
        # behaves the same on Python 2 and avoids a TypeError on Python 3.
        with gzip.open(f) as h:
            return sum(1 for line in h
                       if not line.startswith(b"#")
                       and line.split(b"\t")[6] in (b"PASS", b"."))

    orig_count = count(orig_file)
    final_count = count(out_file)
    # Guard against inputs without any passing records.
    pct = float(final_count) * 100.0 / orig_count if orig_count else 0.0
    removed_stats = {"orig": orig_count, "final": final_count, "pct": pct}
    return out_file, removed_stats
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run the germline workflow and annotate its raw calls into ``out_file``.

    The workflow is configured and executed inside a transactional
    ``<out_file base>-work`` directory. The raw result is ``genome.vcf.gz``
    when ``joint.want_gvcf(items)`` is true, otherwise ``variants.vcf.gz``,
    and is passed to ``annotation.annotate_nongatk_vcf`` together with the
    ``dbsnp`` entry of ``assoc_files``. Skipped when ``out_file`` exists.
    """
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        if joint.want_gvcf(items):
            raw_name = "genome.vcf.gz"
        else:
            raw_name = "variants.vcf.gz"
        raw_file = os.path.join(work_dir, "results", "variants", raw_name)
        out_file = annotation.annotate_nongatk_vcf(
            raw_file, align_bams, assoc_files.get("dbsnp"), ref_file,
            items[0], out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def merge_gvcfs(data, region, vrn_files, out_file):
    """Simple merging of gVCFs with gvcftools.

    Each input is first reduced with ``_extract_variants_from_gvcf`` and the
    reduced files are then combined with ``vcfutils.merge_variant_files``.
    NOTE(review): the original note reads "merge_variants does appear to work
    correctly" as the reason for this workaround; presumably "does not" was
    intended -- confirm.

    Longer term the plan is to replace this with agg
    (https://github.com/Illumina/agg) or GLnexus
    (https://github.com/dnanexus-rnd/GLnexus).
    """
    if not utils.file_exists(out_file):
        region = bamprep.region_to_gatk(region)
        variant_only_files = [_extract_variants_from_gvcf(vrn_file, region, out_file, data)
                              for vrn_file in vrn_files]
        vcfutils.merge_variant_files(variant_only_files, out_file,
                                     dd.get_ref_file(data), data["config"], region)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Records whose FILTER contains anything other than PASS, "." or REJECT are
    dropped. Records flagged REJECT, or with an INFO ``STATUS`` of germline,
    are kept only when ``bubbletree._is_possible_loh`` returns a truthy value,
    and are then marked with the INFO ``DB`` flag. All remaining records are
    written unchanged.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file
    allowed = ("PASS", ".", "REJECT")
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            reader = _add_db_to_header(reader)
            with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    filters = rec.FILTER.split(";") if rec.FILTER else []
                    if any(label not in allowed for label in filters):
                        continue
                    germline = ("REJECT" in filters or
                                rec.INFO.get("STATUS", "").lower() == "germline")
                    if not germline:
                        # Somatic, always include
                        writer.write_record(rec)
                    elif bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired):
                        # Germline, included based on frequencies
                        rec.INFO["DB"] = True
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
def add_dbsnp(orig_file, dbsnp_file, data, out_file=None):
    """Annotate a VCF file with dbSNP.

    Writes a vcfanno configuration next to the transactional output that
    pulls ``ID`` from ``dbsnp_file`` (resolved against the work directory)
    into ``rs_ids``, then pipes vcfanno into ``bcftools annotate`` to set the
    record ID. The output defaults to ``<orig_file base>-wdbsnp.vcf.gz``.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            conf_file = os.path.join(os.path.dirname(tx_out_file), "dbsnp.conf")
            with open(conf_file, "w") as conf_handle:
                dbsnp_path = os.path.normpath(
                    os.path.join(dd.get_work_dir(data), dbsnp_file))
                for conf_line in ('[[annotation]]\n',
                                  'file="%s"\n' % dbsnp_path,
                                  'fields=["ID"]\n',
                                  'names=["rs_ids"]\n',
                                  'ops=["concat"]\n'):
                    conf_handle.write(conf_line)
            # Looked up as in the original code; the value is not used by the command.
            ref_file = dd.get_ref_file(data)
            annotate_cmd = (
                "vcfanno {conf_file} {orig_file} | "
                "bcftools annotate --set-id +'%INFO/rs_ids' -o {tx_out_file} -O z"
            ).format(conf_file=conf_file, orig_file=orig_file, tx_out_file=tx_out_file)
            do.run(annotate_cmd, "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _decompose(in_file, data):
    """Convert multi-allelic variants into single allelic.

    Streams the gzipped input through sed (rewriting the AD header Number to
    R), ``vt decompose -s``, an awk substitution of "./-65" with "./.", and
    bgzip. The output is ``in_file`` with ``-decompose`` inserted before the
    extension and must end in ``.vcf.gz``.
    """
    out_file = "%s-decompose%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        assert out_file.endswith(".vcf.gz")
        pipeline = "".join([
            "gunzip -c %s | ",
            "sed 's/ID=AD,Number=./ID=AD,Number=R/' | ",
            "vt decompose -s - ",
            """| awk '{ gsub("./-65", "./."); print $0 }'""",
            "| bgzip -c > %s",
        ])
        with file_transaction(data, out_file) as tx_out_file:
            do.run(pipeline % (in_file, tx_out_file), "Multi-allelic to single allele")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run the germline workflow and copy its raw calls to ``out_file``.

    The workflow is configured and executed inside a transactional
    ``<out_file base>-work`` directory. The raw result is ``genome.vcf.gz``
    when ``joint.want_gvcf(items)`` is true, otherwise ``variants.vcf.gz``.
    After copying, ``genome.vcf.gz`` is removed from the work directory.
    Skipped when ``out_file`` exists.
    """
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        if joint.want_gvcf(items):
            raw_name = "genome.vcf.gz"
        else:
            raw_name = "variants.vcf.gz"
        raw_file = os.path.join(work_dir, "results", "variants", raw_name)
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        genome_vcf = os.path.join(work_dir, "results", "variants", "genome.vcf.gz")
        utils.remove_plus(genome_vcf)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def filter_to_pass_and_reject(in_file, data, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    A record is kept when every entry of its FILTER field is PASS, "." or
    REJECT (an empty FILTER also qualifies); records carrying any other
    filter label are dropped. Output is ``<in_file base>-prfilter.vcf.gz``,
    placed in ``out_dir`` when given.
    """
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file
    allowed = ("PASS", ".", "REJECT")
    with file_transaction(data, out_file) as tx_out_file:
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    labels = rec.FILTER.split(";") if rec.FILTER else []
                    if all(label in allowed for label in labels):
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, data["config"])
    return out_file
def sort_to_ref(fname, ref_file, add_chr):
    """Match reference genome ordering.

    Decompresses ``fname``, optionally rewrites contig names with sed when
    ``add_chr`` is truthy (numeric contigs, MT, X and Y gain a ``chr``-style
    name), sorts with gsort against ``<ref_file>.fai``, pipes through the
    command from ``vcfutils.add_contig_to_header_cl`` and bgzips the result
    to ``<fname base>-prep.vcf.gz``. Skipped when that file exists.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        fix_chrom = (
            r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
            if add_chr else ''
        )
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        sort_cmd = (
            "gunzip -c {fname} {fix_chrom} | "
            "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
            "bgzip -c > {out_file}"
        ).format(fname=fname, fix_chrom=fix_chrom, ref_file=ref_file,
                 contig_cl=contig_cl, out_file=out_file)
        subprocess.check_call(sort_cmd, shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
def clean_file(in_file, data, prefix="", bedprep_dir=None, simple=None):
    """Prepare a clean sorted input BED file without headers.

    Returns None when ``in_file`` is empty, and ``in_file`` itself when its
    basename already starts with ``prefix``. Otherwise the cleaned file is
    written to ``bedprep_dir`` (default: ``bedprep`` under the work
    directory) as ``<prefix><basename>``, with ``.interval_list`` replaced by
    ``.bed`` and a trailing ``.gz`` dropped, and that path is returned.
    """
    # Optional stage removing non-ascii characters. Used in coverage analysis,
    # to support JSON code in one column and be happy with sambamba.
    ascii_filter = "iconv -c -f utf-8 -t ascii | sed 's/ //g' |" if simple else ""
    if not in_file:
        return None
    if not bedprep_dir:
        bedprep_dir = utils.safe_makedir(
            os.path.join(data["dirs"]["work"], "bedprep"))
    in_base = os.path.basename(in_file)
    # Avoid running multiple times with same prefix
    if prefix and in_base.startswith(prefix):
        return in_file
    out_file = os.path.join(bedprep_dir, "%s%s" % (prefix, in_base))
    out_file = out_file.replace(".interval_list", ".bed")
    if out_file.endswith(".gz"):
        out_file = out_file[:-3]
    if not utils.file_uptodate(out_file, in_file):
        check_bed_contigs(in_file, data)
        check_bed_coords(in_file, data)
        with file_transaction(data, out_file) as tx_out_file:
            if in_file.endswith(".gz"):
                cat_cmd = "zcat"
            else:
                cat_cmd = "cat"
            sort_cmd = get_sort_cmd(os.path.dirname(tx_out_file))
            clean_cmd = (
                "{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^@ | "
                "grep -v ^# | {simple} "
                "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}"
            ).format(cat_cmd=cat_cmd, in_file=in_file, simple=ascii_filter,
                     bcbio_py=sys.executable, sort_cmd=sort_cmd,
                     tx_out_file=tx_out_file)
            do.run(clean_cmd, "Prepare cleaned BED file", data)
    vcfutils.bgzip_and_index(out_file, data.get("config", {}), remove_orig=False)
    return out_file
def subset_by_callers(in_file, callers):
    """Subset a VCF to records supported by at least one of ``callers``.

    Each record's INFO ``set`` value is split on "-" and the record is kept
    when any of the resulting names is in ``callers``. The output is written
    next to ``in_file`` as ``<base>-<callers joined by _>.vcf`` unless it, or
    its ``.gz`` counterpart, already exists. The callers and the number of
    kept records are printed, and the result of ``vcfutils.bgzip_and_index``
    on the output is returned.
    """
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(".gz", ""),
                              "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        count = 0
        reader = VariantFile(in_file)
        try:
            writer = VariantFile(out_file, "w", header=reader.header)
            try:
                for rec in reader:
                    cur_callers = set(rec.info["set"].split("-"))
                    if cur_callers & want_callers:
                        count += 1
                        writer.write(rec)
            finally:
                # Close before compression so all records are flushed to disk.
                writer.close()
        finally:
            reader.close()
        # Single-argument call form prints identically on Python 2 and 3.
        print("{0} {1}".format(callers, count))
    return vcfutils.bgzip_and_index(out_file, {})
def hard_w_expression(vcf_file, expression, data, name="+", filterext="",
                      extra_cmd="", limit_regions="variant_regions"):
    """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4.

    Records matching ``expression`` are soft-filtered with the label ``name``.
    When ``limit_regions`` is "variant_regions" and the configuration defines
    variant regions, filtering is restricted to them with ``-T``.
    ``extra_cmd`` is appended to the bcftools command before optional bgzip
    compression. Inputs without variants are copied to the output unchanged.
    The output is ``<base>-filter<filterext><ext>``.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(
        base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(vcf_file):
                shutil.copy(vcf_file, out_file)
            else:
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                if limit_regions == "variant_regions":
                    variant_regions = utils.get_in(
                        data, ("config", "algorithm", "variant_regions"))
                else:
                    variant_regions = None
                if variant_regions:
                    intervals = "-T %s" % vcfutils.bgzip_and_index(
                        variant_regions, data["config"])
                else:
                    intervals = ""
                filter_cmd = (
                    "{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                    "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}"
                ).format(bcftools=bcftools, intervals=intervals, name=name,
                         expression=expression, vcf_file=vcf_file,
                         extra_cmd=extra_cmd, bgzip_cmd=bgzip_cmd,
                         tx_out_file=tx_out_file)
                do.run(filter_cmd,
                       "Hard filtering %s with %s" % (vcf_file, expression),
                       data)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _setup_call_false(vrn_file, rm_bed, base_dir, data, call_type):
    """Create set of false positives or negatives for inputs with empty truth sets.

    Uses ``bcftools view`` to keep the PASS/"." records of ``vrn_file`` that
    fall within ``rm_bed``, writing ``<base_dir>/<call_type>.vcf.gz``. An
    uncompressed ``vrn_file`` is bgzipped and indexed into the transaction
    directory first. Returns ``{call_type: out_file}``.
    """
    out_file = os.path.join(base_dir, "%s.vcf.gz" % call_type)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vrn_file.endswith(".gz"):
                tx_dir = os.path.dirname(tx_out_file)
                vrn_file = vcfutils.bgzip_and_index(vrn_file, out_dir=tx_dir)
            view_cmd = (
                "bcftools view -R {rm_bed} -f 'PASS,.' {vrn_file} -O z -o {tx_out_file}"
            ).format(rm_bed=rm_bed, vrn_file=vrn_file, tx_out_file=tx_out_file)
            do.run(view_cmd, "Prepare %s with empty reference" % call_type, data)
    return {call_type: out_file}
def fix_germline_samplename(in_file, sample_name, data):
    """Replace germline sample names, originally from normal BAM file.

    Writes ``sample_name`` to a one-line samples file beside the
    transactional output and runs ``bcftools reheader -s`` with it. The
    output is ``in_file`` with ``-fixnames`` inserted before the extension.
    """
    out_file = "%s-fixnames%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            names_path = "%s-samples.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(names_path, "w") as names_handle:
                names_handle.write("%s\n" % sample_name)
            reheader_cmd = (
                "bcftools reheader -s {sample_file} {in_file} -o {tx_out_file}"
            ).format(sample_file=names_path, in_file=in_file, tx_out_file=tx_out_file)
            do.run(reheader_cmd, "Fix germline samplename: %s" % sample_name)
    return vcfutils.bgzip_and_index(out_file, data["config"])