def run(items):
    """Combine structural variant calls with MetaSV when enough callers are available.

    Expects exactly one sample in ``items``. Every distinct supported caller
    listed under the sample's "sv" entries contributes a ``--<caller>_vcf``
    argument. If at least MIN_CALLERS distinct callers are found, MetaSV is run
    (skipped when its output file already exists), the output is hard filtered
    with the "ReassemblyStats" expression, passed through ``effects.add_to_vcf``
    and appended to the sample's "sv" list as a "metasv" call.

    Returns a one-element list holding the sample.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + [
        "--sample", dd.get_sample_name(data),
        "--reference", dd.get_ref_file(data),
        "--bam", dd.get_align_bam(data),
        "--outdir", work_dir,
    ]

    # Collect one input VCF per distinct supported caller.
    seen_callers = []
    for sv_call in data.get("sv", []):
        caller = sv_call["variantcaller"]
        if caller not in SUPPORTED or caller in seen_callers:
            continue
        seen_callers.append(caller)
        cmd.extend(["--%s_vcf" % caller,
                    sv_call.get("vcf_file", sv_call["vrn_file"])])

    # Not enough independent callers: hand the sample back untouched.
    if len(seen_callers) < MIN_CALLERS:
        return [data]

    if not utils.file_exists(out_file):
        raw_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
        insert_stats = shared.calc_paired_insert_stats_save(
            dd.get_align_bam(data), os.path.join(raw_dir, "insert-stats.yaml"))
        cmd.extend([
            "--workdir", raw_dir,
            "--num_threads", str(dd.get_num_cores(data)),
            "--spades", utils.which("spades.py"),
            "--age", utils.which("age_align"),
            "--assembly_max_tools=1", "--assembly_pad=500",
            "--boost_sc",
            "--isize_mean", insert_stats["mean"],
            "--isize_sd", insert_stats["std"],
        ])
        do.run(cmd, "Combine variant calls with MetaSV")

    # Any matching clause marks the record with the ReassemblyStats filter.
    filters = " || ".join([
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000)",
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80)",
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0)",
        "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)",
    ])
    filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                            name="ReassemblyStats", limit_regions=None)
    effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
    # Fall back to the unannotated file when no effects VCF is returned.
    metasv_call = {"variantcaller": "metasv", "vrn_file": effects_vcf or filter_file}
    data["sv"].append(metasv_call)
    return [data]
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.

    Args:
        calls: list of call dictionaries; each needs "variantcaller" and
            "vrn_file" keys and may carry a "vcf_file" that takes precedence.
            The list is extended in place when MetaSV output is produced.
        data: sample dictionary used for work directory, sample name,
            reference, BAM and core lookups.

    Returns:
        The same ``calls`` list, with a "metasv" entry appended when at least
        MIN_CALLERS distinct supported callers were available.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                        "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data),
                        "--outdir", work_dir]
    # Track distinct callers: a caller listed more than once must not count
    # several times toward MIN_CALLERS, nor add a repeated --<caller>_vcf flag.
    methods = []
    for call in calls:
        caller = call["variantcaller"]
        if caller in SUPPORTED and caller not in methods:
            methods.append(caller)
            cmd += ["--%s_vcf" % caller, call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        # Skip the MetaSV run when its output is already present.
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"],
                    "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        # Any matching clause marks the record with the ReassemblyStats filter.
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>10000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>20) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>1)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                                name="ReassemblyStats", limit_regions=None)
        calls.append({"variantcaller": "metasv", "vrn_file": filter_file})
    return calls
def _variant_filtration_indel(snp_file, ref_file, vrn_files, data):
    """Filter indel variant calls using GATK best practice recommendations.

    When ``config_utils.use_vqsr`` reports that VQSR should not be used, a hard
    filter expression is applied. Otherwise GATK recalibration is run, falling
    back to the hard filter when recalibration fails or when a hard-filtered
    file from an earlier fallback already exists.

    Args:
        snp_file: variant file to filter.
        ref_file: passed through to the recalibration helpers.
        vrn_files: training resources; must contain "train_indels" for VQSR.
        data: sample dictionary; ``data["config"]["algorithm"]`` is read and, on
            fallback, gets "coverage_interval" set to "regional".

    Returns:
        The result of ``vfilter.hard_w_expression`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        filterexp = " || ".join(["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
        return vfilter.hard_w_expression(snp_file, filterexp, data, filter_type)

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                  ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr returning False for "regional"
        # coverage so the recursive call takes the hard filter branch -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
    assert "train_indels" in vrn_files, "Need indel training file specified"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("2.7"):
                params.extend(["--numBadVariants", "3000"])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt/SystemExit are not mistaken for a VQSR failure.
            except Exception:
                logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)
def run(items):
    """Run MetaSV ensemble calling for a single sample when enough callers exist.

    Builds the MetaSV command from the sample's existing "sv" calls, using one
    ``--<caller>_vcf`` input per distinct supported caller. With fewer than
    MIN_CALLERS such callers nothing is run. Otherwise MetaSV is executed
    unless its output is already on disk, the output is hard filtered under the
    name "ReassemblyStats", handed to ``effects.add_to_vcf`` and recorded as a
    "metasv" entry in the sample's "sv" list.

    Returns a list containing the single input sample.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                        "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data),
                        "--outdir", work_dir]

    used = []
    for entry in data.get("sv", []):
        name = entry["variantcaller"]
        is_new_supported = name in SUPPORTED and name not in used
        if is_new_supported:
            used.append(name)
            vcf_flag = "--%s_vcf" % name
            cmd.extend([vcf_flag, entry.get("vcf_file", entry["vrn_file"])])

    have_enough = len(used) >= MIN_CALLERS
    if not have_enough:
        return [data]

    if not utils.file_exists(out_file):
        tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
        stats_file = os.path.join(tx_work_dir, "insert-stats.yaml")
        ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data), stats_file)
        cmd.extend(["--workdir", tx_work_dir,
                    "--num_threads", str(dd.get_num_cores(data))])
        cmd.extend(["--spades", utils.which("spades.py"),
                    "--age", utils.which("age_align")])
        cmd.extend(["--assembly_max_tools=1", "--assembly_pad=500"])
        cmd.extend(["--boost_sc",
                    "--isize_mean", ins_stats["mean"],
                    "--isize_sd", ins_stats["std"]])
        do.run(cmd, "Combine variant calls with MetaSV")

    # Clauses are OR-ed together into a single filter expression.
    filter_clauses = (
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000)",
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80)",
        "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0)",
        "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)",
    )
    filters = " || ".join(filter_clauses)
    filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                            name="ReassemblyStats", limit_regions=None)
    effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
    final_vcf = effects_vcf or filter_file
    data["sv"].append({"variantcaller": "metasv", "vrn_file": final_vcf})
    return [data]
def _filter_by_support(in_file, data):
    """Add FILTER annotations to a call file based on supporting evidence.

    A record is marked "ReadCountSupport" when either holds:
      - total support (FORMAT/SU) is below 4
      - it has no split read evidence (FORMAT/SR == 0) and ABS(SVLEN) exceeds 20000
    """
    clauses = [
        "FORMAT/SU < 4",
        "(FORMAT/SR == 0 && ABS(SVLEN)>20000)",
    ]
    support_expr = " || ".join(clauses)
    return vfilter.hard_w_expression(in_file, support_expr, data,
                                     name="ReadCountSupport", limit_regions=None)
def _filter_by_support(in_file, data):
    """Add FILTER annotations to a call file based on supporting evidence.

    Only a minimum read support check (FORMAT/SU < 4) is applied, under the
    filter name "ReadCountSupport".

    Requiring multiple forms of evidence in a sample (split and paired end,
    i.e. "FORMAT/SR == 0 || FORMAT/PE == 0") is deliberately not applied
    because it is too restrictive.
    """
    min_support_expr = "FORMAT/SU < 4"
    filtered_file = vfilter.hard_w_expression(in_file, min_support_expr, data,
                                              name="ReadCountSupport")
    return filtered_file
def run(items):
    """Detect structural variations with delly across a batch of samples.

    Runs delly per chromosome and SV type over all sample BAMs, combines the
    resulting VCFs, then for each sample selects its calls from the combined
    file and applies the "DVSupport" hard filter. That filter was tuned on
    NA12878 Moleculo and PacBio data, using calls prepared by @ryanlayer and
    @cc2qe, and compares high quality variant pairs (DV) with high quality
    reference pairs (DR).

    Returns the input samples, each with a "delly" entry appended to "sv".
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [sample["align_bam"] for sample in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))

    # Add core request for delly, on a private copy of the configuration.
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]

    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        # One job per (chromosome, SV type), chromosomes varying slowest.
        delly_jobs = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                      for chrom in pysam_work_bam.references
                      for sv_type in sv_types]
        bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)

    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               first["config"])

    dv_filter = "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2"
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_vcf_name = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_vcf_name,
                                                  data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf, dv_filter, data,
                                              name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out
def _filter_by_support(in_file, data):
    """Add FILTER annotations to a call file based on supporting evidence.

    Only a minimum read support check (FORMAT/SU < 4) is applied, under the
    filter name "ReadCountSupport" and without restricting to regions.

    Requiring multiple forms of evidence in a sample (split and paired end,
    i.e. "FORMAT/SR == 0 || FORMAT/PE == 0") is deliberately not applied
    because it is too restrictive.
    """
    min_support_expr = "FORMAT/SU < 4"
    filtered_file = vfilter.hard_w_expression(in_file, min_support_expr, data,
                                              name="ReadCountSupport",
                                              limit_regions=None)
    return filtered_file
def _filter_by_support(in_file, data):
    """Add FILTER annotations to a call file based on supporting evidence.

    A record is marked "ReadCountSupport" when any of these holds:
      - total support (FORMAT/SU) is below 4
      - no split reads, SU below 15 and ABS(SVLEN) above 50000
      - no split reads, SU below 5 and ABS(SVLEN) below 2000
      - no split reads, SU below 15 and ABS(SVLEN) below 300
    """
    clauses = [
        "FORMAT/SU < 4",
        "(FORMAT/SR == 0 && FORMAT/SU < 15 && ABS(SVLEN)>50000)",
        "(FORMAT/SR == 0 && FORMAT/SU < 5 && ABS(SVLEN)<2000)",
        "(FORMAT/SR == 0 && FORMAT/SU < 15 && ABS(SVLEN)<300)",
    ]
    support_expr = " || ".join(clauses)
    return vfilter.hard_w_expression(in_file, support_expr, data,
                                     name="ReadCountSupport", limit_regions=None)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.

    When ``config_utils.use_vqsr`` reports that VQSR should not be used, a hard
    filter expression is applied. Otherwise GATK recalibration is run, falling
    back to the hard filter when recalibration fails or when a hard-filtered
    file from an earlier fallback already exists.

    Args:
        snp_file: variant file to filter.
        ref_file: passed through to the recalibration helpers.
        vrn_files: training resources; must contain "train_hapmap" and
            "train_1000g_omni" for VQSR.
        data: sample dictionary; ``data["config"]["algorithm"]`` is read and, on
            fallback, gets "coverage_interval" set to "regional".

    Returns:
        The result of ``vfilter.hard_w_expression`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data, filter_type)

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                  ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr returning False for "regional"
        # coverage so the recursive call takes the hard filter branch -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
    assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
        "Need HapMap and 1000 genomes training files"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Can fail to run if not enough values are present to train. Rerun with
            # regional filtration approach instead. Catch Exception rather than using
            # a bare except, so KeyboardInterrupt/SystemExit still propagate.
            except Exception:
                logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)
def run(items):
    """Detect structural variations with delly across a batch of samples.

    Runs delly per chromosome and SV type over all sample BAMs, using the
    chromosomes returned by ``get_sv_chroms`` for the prepared exclude file.
    The resulting VCFs are combined, then for each sample its calls are
    selected from the combined file and filtered with the "DVSupport" hard
    filter. That filter was tuned on NA12878 Moleculo and PacBio data, using
    calls prepared by @ryanlayer and @cc2qe, and compares high quality variant
    pairs (DV) with high quality reference pairs (DR).

    Returns the input samples, each with a "delly" entry (including the
    exclude file) appended to "sv".
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [sample["align_bam"] for sample in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))

    # Add core request for delly, on a private copy of the configuration.
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    exclude_file = _get_full_exclude_file(items, work_dir)

    # One job per (chromosome, SV type), chromosomes varying slowest.
    delly_jobs = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for chrom in get_sv_chroms(items, exclude_file)
                  for sv_type in sv_types]
    bytype_vcfs = run_multicore(_run_delly, delly_jobs, config, parallel)

    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               first["config"])

    dv_filter = "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2"
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_vcf_name = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_vcf_name,
                                                  data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf, dv_filter, data,
                                              name="DVSupport")
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def _variant_filtration_indel(snp_file, ref_file, vrn_files, data):
    """Filter indel variant calls using GATK best practice recommendations.

    When ``config_utils.use_vqsr`` reports that VQSR should not be used, a hard
    filter expression is applied. Otherwise GATK recalibration is run, falling
    back to the hard filter when recalibration fails or when a hard-filtered
    file from an earlier fallback already exists.

    Args:
        snp_file: variant file to filter.
        ref_file: passed through to the recalibration helpers.
        vrn_files: training resources; must contain "train_indels" for VQSR.
        data: sample dictionary; ``data["config"]["algorithm"]`` is read and, on
            fallback, gets "coverage_interval" set to "regional".

    Returns:
        The result of ``vfilter.hard_w_expression`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        filterexp = " || ".join(
            ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
        return vfilter.hard_w_expression(snp_file, filterexp, data, filter_type)

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(
        base=os.path.splitext(snp_file)[0], ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr returning False for "regional"
        # coverage so the recursive call takes the hard filter branch -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
    assert "train_indels" in vrn_files, "Need indel training file specified"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(
                ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("2.7"):
                params.extend(["--numBadVariants", "3000"])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt/SystemExit are not mistaken for a VQSR failure.
            except Exception:
                logger.info(
                    "VQSR failed due to lack of training data. Using hard filtering."
                )
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.

    When ``config_utils.use_vqsr`` reports that VQSR should not be used, a hard
    filter expression is applied. Otherwise GATK recalibration is run, falling
    back to the hard filter when recalibration fails or when a hard-filtered
    file from an earlier fallback already exists.

    Args:
        snp_file: variant file to filter.
        ref_file: passed through to the recalibration helpers.
        vrn_files: training resources; must contain "train_hapmap" and
            "train_1000g_omni" for VQSR.
        data: sample dictionary; ``data["config"]["algorithm"]`` is read and, on
            fallback, gets "coverage_interval" set to "regional".

    Returns:
        The result of ``vfilter.hard_w_expression`` (hard filtering) or of
        ``_apply_variant_recal`` (VQSR).
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data, filter_type)

    # also check if we've failed recal and needed to do strict filtering
    filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                  ext=filter_type)
    if file_exists(filter_file):
        # NOTE(review): relies on use_vqsr returning False for "regional"
        # coverage so the recursive call takes the hard filter branch -- confirm.
        config["algorithm"]["coverage_interval"] = "regional"
        return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
    assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
        "Need HapMap and 1000 genomes training files"
    params, recal_file, tranches_file = _shared_variant_filtration(
        filter_type, snp_file, ref_file, vrn_files, variantcaller)
    if not file_exists(recal_file):
        with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
            params.extend(["--recal_file", tx_recal, "--tranches_file", tx_tranches])
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False)
            # Can fail to run if not enough values are present to train. Rerun with
            # regional filtration approach instead. Catch Exception rather than using
            # a bare except, so KeyboardInterrupt/SystemExit still propagate.
            except Exception:
                logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                config["algorithm"]["coverage_interval"] = "regional"
                return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
    return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                tranches_file, filter_type)