def _sample_variant_file_in_population(x):
    """Check if a sample file is the same as the population file.

    This is true for batches where we don't extract into samples and do not
    run decomposition for gemini.
    """
    if "population" in x:
        a = _get_project_vcf(x)
        b = _get_variant_file(x, ("vrn_file",))
        decomposed = tz.get_in(("population", "decomposed"), x)
        # Same file when both exist, no decomposition ran, and the sample
        # lists of the first entry on each side match exactly.
        if (a and b and not decomposed and len(a) > 0 and len(b) > 0 and
                vcfutils.get_samples(a[0]["path"]) == vcfutils.get_samples(b[0]["path"])):
            return True
    return False
def _sample_variant_file_in_population(x):
    """Check if a sample file is the same as the population file.

    This is true for batches where we don't extract into samples and do not
    run decomposition for gemini.
    """
    if "population" in x:
        a = _get_variant_file(x, ("population", "vcf"))
        b = _get_variant_file(x, ("vrn_file",))
        decomposed = tz.get_in(("population", "decomposed"), x)
        # Same file when both exist, no decomposition ran, and the sample
        # lists of the first entry on each side match exactly.
        if (a and b and not decomposed and len(a) > 0 and len(b) > 0 and
                vcfutils.get_samples(a[0]["path"]) == vcfutils.get_samples(b[0]["path"])):
            return True
    return False
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.

    Returns a list of single-element lists, one per original grouped sample.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(data["group_orig"]):
            # only the tumor input carries the combined variant file
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                if "combine" in data:
                    sub_data["combine"] = data["combine"]
                sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
        return out
    # population or single sample
    else:
        out = []
        for sub_data in data["group_orig"]:
            # Coerce names to str before building the per-sample file name:
            # sample names can be non-string (e.g. numeric) and would raise a
            # TypeError on concatenation otherwise.
            sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-",
                                                    str(sub_data["name"][-1]) + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]),
                                       sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                # single-sample file: just link it into place
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            if "combine" in data:
                sub_data["combine"] = data["combine"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on pooled samples, we still need to implement.
    """
    # pooled/multi-sample VCFs are returned untouched
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        # write uncompressed, then re-bgzip below if the target is .gz
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        # NOTE(review): file_transaction is called with only the output file
        # here; other versions in this file pass (data, out_file) — confirm
        # which API this version of the project expects.
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            # _fix_line_ploidy may drop the record (returns falsy)
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _get_vcf_samples(calls, items):
    """Identify sample names associated with a set of called VCF files.

    Matches VCF file names against sample names and batch names from items,
    preferring single-sample (gVCF) matches over joint batch matches.
    """
    # NOTE(review): have_full_file is never set to True in this version, so
    # the intersection branch below is unreachable as written — confirm intent.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                # keep only samples present in every full VCF seen
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # Nothing was grouped together: pass the input straight through.
    if "group_orig" not in data:
        return [[data]]
    # Tumor/normal pair: attach the combined calls only to the tumor item.
    if vcfutils.get_paired_phenotype(data):
        split_items = []
        for i, item in enumerate(data["group_orig"]):
            if vcfutils.get_paired_phenotype(item) == "tumor":
                item["vrn_file"] = data["vrn_file"]
            split_items.append([item])
        return split_items
    # Population or single sample: extract per-sample VCFs from the batch file.
    split_items = []
    batch_vcf = data["vrn_file"]
    batch_prefix = str(data["group"][0]) + "-"
    multi_sample = len(vcfutils.get_samples(batch_vcf)) > 1
    for item in data["group_orig"]:
        sample_name = str(item["name"][-1])
        item_vcf = batch_vcf.replace(batch_prefix, sample_name + "-")
        if multi_sample:
            # pull just this sample's calls out of the batch file
            vcfutils.select_sample(batch_vcf, sample_name, item_vcf, data["config"])
        elif not os.path.exists(item_vcf):
            # already single-sample: link instead of re-extracting
            utils.symlink_plus(batch_vcf, item_vcf)
        item["vrn_file"] = item_vcf
        split_items.append([item])
    return split_items
def _comparison_stats_from_merge(in_file, stats, svcaller, data):
    """Extract true/false positive/negatives from a merged SURVIVOR VCF.
    """
    truth_stats = {"tp": [], "fn": [], "fp": []}
    # map VCF sample columns to truth/eval roles by naming convention
    samples = [
        "truth" if x.endswith("-truth") else "eval"
        for x in vcfutils.get_samples(in_file)
    ]
    with open(in_file) as in_handle:
        for call in (l.rstrip().split("\t") for l in in_handle if not l.startswith("#")):
            # SUPP_VEC in the INFO column encodes per-sample support as 0/1 digits
            supp_vec_str = [
                x for x in call[7].split(";") if x.startswith("SUPP_VEC=")
            ][0]
            _, supp_vec = supp_vec_str.split("=")
            calls = dict(zip(samples, [int(x) for x in supp_vec]))
            # supported in both -> tp; truth only -> fn; eval only -> fp
            if calls["truth"] and calls["eval"]:
                metric = "tp"
            elif calls["truth"]:
                metric = "fn"
            else:
                metric = "fp"
            truth_stats[metric].append(_summarize_call(call))
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
def _get_vcf_samples(calls, items):
    """Identify sample names associated with a set of called VCF files.

    Matches VCF file names against sample names and batch names from items,
    preferring single-sample (gVCF) matches over joint batch matches.
    """
    # NOTE(review): have_full_file is never set to True in this version, so
    # the intersection branch below is unreachable as written — confirm intent.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                # keep only samples present in every full VCF seen
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    if os.path.basename(f).startswith(
                            ("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def _get_vcf_samples(calls, items):
    """Identify sample names associated with a set of called VCF files.

    Matches VCF file names against sample names and batch names from items,
    routing germline VCFs to normal samples in tumor/normal batches and
    preferring single-sample (gVCF) matches over joint batch matches.
    """
    # NOTE(review): have_full_file is never set to True in this version, so
    # the intersection branch below is unreachable as written — confirm intent.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                # keep only samples present in every full VCF seen
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(
                            ("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns paths to the tp/fp/fn VCFs in the rtg output directory.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" marker gates re-runs; remove partial output before retrying
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # reduce a multi-sample input to just the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # locate the rtg SDF index next to the genome reference
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def split_variants_by_sample(data): """Split a multi-sample call file into inputs for individual samples. For tumor/normal paired analyses, do not split the final file and attach it to the tumor input. """ # not split, do nothing if "group_orig" not in data: return [[data]] # cancer tumor/normal elif vcfutils.get_paired_phenotype(data): out = [] for i, sub_data in enumerate(get_orig_items(data)): if vcfutils.get_paired_phenotype(sub_data) == "tumor": sub_data["vrn_file"] = data["vrn_file"] else: sub_data.pop("vrn_file", None) out.append([sub_data]) return out # joint calling, do not split back up due to potentially large sample sizes elif tz.get_in(("config", "algorithm", "jointcaller"), data): return [[data]] # population or single sample else: out = [] for sub_data in get_orig_items(data): sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", str(sub_data["name"][-1]) + "-") if len(vcfutils.get_samples(data["vrn_file"])) > 1: vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]), sub_vrn_file, data["config"]) elif not os.path.exists(sub_vrn_file): utils.symlink_plus(data["vrn_file"], sub_vrn_file) sub_data["vrn_file_batch"] = data["vrn_file"] sub_data["vrn_file"] = sub_vrn_file out.append([sub_data]) return out
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on pooled samples, we still need to implement.
    """
    # pooled/multi-sample VCFs are returned untouched
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        # write uncompressed, then re-bgzip below if the target is .gz
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            # _fix_line_ploidy may drop the record (returns falsy)
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _create_samplemap_file(vrn_files):
    """Write a tab-separated (sample name, VCF path) mapping file.

    Uses the first sample from each VCF and returns the path to a
    persistent (delete=False) temporary TSV file.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".tsv", delete=False)
    samplemap = handle.name
    with open(samplemap, "w") as out_handle:
        # one line per input VCF: "<first sample name>\t<file path>"
        for vrn_file in vrn_files:
            samplename = vcfutils.get_samples(vrn_file)[0]
            print(f"{samplename}\t{vrn_file}", file=out_handle)
    return samplemap
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns paths to fp/fn VCFs plus tp (baseline representation when
    available, with the call representation under "tp-calls").
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" marker gates re-runs; remove partial output before retrying
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # reduce a multi-sample input to just the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # scale RTG memory with thread count (1g per thread, capped at 6)
        threads = min(dd.get_num_cores(data), 6)
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # prefer the baseline representation of true positives when rtg wrote one
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def get_bams(vcf_file, bam_dir):
    """Locate one BAM file per sample named in the input VCF.

    Searches bam_dir/*/final/<sample>/ for alignment files; when multiple
    candidates exist, restricts to plain ``.bam`` files.
    """
    found = []
    for sample in vcfutils.get_samples(vcf_file):
        pattern = os.path.join(bam_dir, "*", "final", sample, "%s-*am" % sample)
        candidates = glob.glob(pattern)
        assert len(candidates) > 0, "Did not find BAM files for %s: %s" % (sample, candidates)
        if len(candidates) > 1:
            candidates = [c for c in candidates if c.endswith(".bam")]
        found.append(candidates[0])
    return found
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns paths to the tp/fp/fn VCFs in the rtg output directory.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" marker gates re-runs; remove partial output before retrying
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # reduce a multi-sample input to just the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # locate the rtg SDF index next to the genome reference
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Fix: join the two exports with '&&'. Previously the second `export`
        # was passed as an argument to the first, so RTG_MEM was never set.
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {
        "tp": os.path.join(out_dir, "tp.vcf.gz"),
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
def _get_vcf_samples(calls):
    """Return sample names shared by every non-empty VCF in calls.

    Intersects the sample lists of all flattened input files; files with
    no samples are ignored.
    """
    shared = set([])
    for vcf_path in utils.flatten(calls):
        names = set(vcfutils.get_samples(vcf_path))
        if not names:
            continue
        # first non-empty file seeds the set; later files intersect it
        shared = names if not shared else shared & names
    return list(shared)
def run_peddy(samples, out_dir=None):
    """Run peddy on a batch VCF to check sample/pedigree correspondence.

    Locates a usable VCF containing one of the samples, builds a PED file
    from metadata, runs peddy and attaches its output files to the sample
    QC summaries. Skips (returning samples unchanged) when peddy is not
    installed, no VCF matches, the organism is not human, or a previous
    run was marked failed.
    """
    vcf_file = None
    # find the first sample whose active VCF exists and actually contains it
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    # a previous run determined peddy cannot process this batch
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        # Fix: narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; real failures are still inspected and re-raised.
        except Exception:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            # cyvcf2 IndexError means no variants overlapped peddy's check sites:
            # mark the batch as skipped rather than failing the pipeline
            if any(l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0
                   for l in to_show):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns paths to fp/fn VCFs plus tp (baseline representation when
    available, with the call representation under "tp-calls").
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" marker gates re-runs; remove partial output before retrying
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # reduce a multi-sample input to just the current sample
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(
                base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                              sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # locate the rtg SDF index next to the genome reference
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = [
            "rtg", "vcfeval", "--threads", "6", "-b", rm_file, "--bed-regions",
            interval_bed, "-c", vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        # pick the best available quality annotation for ROC scoring
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(
            cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # prefer the baseline representation of true positives when rtg wrote one
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def get_bams(vcf_file, bam_dir):
    """Return one alignment file per sample listed in the VCF.

    Globs bam_dir/*/final/<sample>/<sample>-*am; if more than one file
    matches, only ``.bam`` files are considered.
    """
    paths = []
    for name in vcfutils.get_samples(vcf_file):
        matches = glob.glob(
            os.path.join(bam_dir, "*", "final", name, "%s-*am" % name))
        assert len(matches) > 0, "Did not find BAM files for %s: %s" % (
            name, matches)
        if len(matches) > 1:
            matches = [m for m in matches if m.endswith(".bam")]
        paths.append(matches[0])
    return paths
def _validate_caller_vcf(call_vcf, truth_vcf, callable_bed, svcaller, work_dir, data):
    """Validate a caller VCF against truth within callable regions using SURVIVOR.

    Combines files with SURVIVOR merge and counts
    (https://github.com/fritzsedlazeck/SURVIVOR/)
    """
    stats = _calculate_comparison_stats(truth_vcf)
    # normalize both inputs: restrict to callable regions and rename samples,
    # tagging the truth set with a "-truth" suffix for later identification
    call_vcf = _prep_vcf(call_vcf, callable_bed, dd.get_sample_name(data),
                         dd.get_sample_name(data), stats, work_dir, data)
    truth_vcf = _prep_vcf(truth_vcf, callable_bed, vcfutils.get_samples(truth_vcf)[0],
                          "%s-truth" % dd.get_sample_name(data), stats, work_dir, data)
    cmp_vcf = _survivor_merge(call_vcf, truth_vcf, stats, work_dir, data)
    return _comparison_stats_from_merge(cmp_vcf, stats, svcaller, data)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data, validate_method):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    validate_method "rtg-squash-ploidy" adds --squash-ploidy for genotype-
    insensitive comparison. Returns paths to fp/fn VCFs plus tp (baseline
    representation when available, with calls under "tp-calls").
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" marker gates re-runs; remove partial output before retrying
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        # CWL can pass the reference as a {"base": path} record
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                                           "direction": "increase"}}})
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        if validate_method == "rtg-squash-ploidy":
            cmd += ["--squash-ploidy"]
        # multi-sample truth set: restrict comparison to the current sample
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # prefer the baseline representation of true positives when rtg wrote one
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Ensures truth and call VCFs are bgzipped/indexed, reduces multi-sample
    calls to the current sample, and merges validation intervals.
    """
    if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    # reduce a multi-sample input to just the current sample
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                          data["config"])
    if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def create_ped_file(samples, base_vcf, out_dir=None):
    """Create a GEMINI-compatible PED file, including gender, family and phenotype information.

    Checks for a specified `ped` file in metadata, and will use sample information from this
    file before reconstituting from metadata information.
    """
    out_file = "%s.ped" % utils.splitext_plus(base_vcf)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    sample_ped_lines = {}
    header = [
        "#Family_ID", "Individual_ID", "Paternal_ID", "Maternal_ID", "Sex",
        "Phenotype", "Ethnicity"
    ]
    # read any user-supplied PED files (deduplicated) referenced in metadata;
    # their rows take precedence over rows reconstructed from metadata below
    for md_ped in list(
            set([
                x for x in [tz.get_in(["metadata", "ped"], data) for data in samples]
                if x is not None
            ])):
        with open(md_ped) as in_handle:
            reader = csv.reader(in_handle, dialect="excel-tab")
            for parts in reader:
                # extend the header with any extra columns the user PED defines
                if parts[0].startswith("#") and len(parts) > len(header):
                    header = header + parts[len(header):]
                else:
                    # keyed by Individual_ID (second PED column)
                    sample_ped_lines[parts[1]] = parts
    if not utils.file_exists(out_file):
        with file_transaction(samples[0], out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                # only emit rows for samples actually present in the VCF
                want_samples = set(vcfutils.get_samples(base_vcf))
                writer = csv.writer(out_handle, dialect="excel-tab")
                writer.writerow(header)
                for data in samples:
                    ped_info = get_ped_info(data, samples)
                    sname = ped_info["individual_id"]
                    if sname in want_samples:
                        # guard against writing the same sample twice
                        want_samples.remove(sname)
                        if sname in sample_ped_lines:
                            writer.writerow(sample_ped_lines[sname])
                        else:
                            writer.writerow([
                                ped_info["family_id"], ped_info["individual_id"],
                                ped_info["paternal_id"], ped_info["maternal_id"],
                                ped_info["gender"], ped_info["affected"],
                                ped_info["ethnicity"]
                            ])
    return out_file
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Ensures the truth VCF is bgzipped/indexed, reduces multi-sample calls to
    the current sample (re-compressing otherwise), and merges intervals.
    """
    if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    # reduce a multi-sample input to just the current sample
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (base, dd.get_sample_name(data)))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                          data["config"])
    # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
    else:
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def _get_vcf_samples(calls, data):
    """Collect sample names matched by the call file names.

    Adds the current sample whenever a call file name starts with the
    sample name or one of its batch names followed by a dash.
    """
    # NOTE(review): have_full_file is never flipped to True in this version,
    # so only the filename-matching branch runs — confirm intent.
    have_full_file = False
    matched = set([])
    names_to_try = [dd.get_sample_name(data)] + dd.get_batches(data)
    for call_file in utils.flatten(calls):
        if have_full_file:
            file_samples = set(vcfutils.get_samples(call_file))
            if file_samples:
                # intersect across files once the first non-empty set is seen
                matched = file_samples if not matched else matched & set(file_samples)
        else:
            fname = os.path.basename(call_file)
            for candidate in names_to_try:
                if fname.startswith("%s-" % candidate):
                    matched.add(dd.get_sample_name(data))
    return list(matched)
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    """
    # tumor-only / paired input: attach a germline-cleaned copy of the calls
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # rename the in-VCF sample to carry the "-germline" suffix when the
        # single VCF sample matches the base sample name
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
                and sample_name.replace("-germline", "") == vcf_samples[0]):
            # NOTE(review): this calls fix_germline_samplename while a sibling
            # version calls _fix_germline_samplename — confirm which helper
            # actually exists in this module.
            data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    """
    # tumor-only / paired input: attach a germline-cleaned copy of the calls
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            germline_vcf = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # rename the in-VCF sample to carry the "-germline" suffix when the
        # single VCF sample matches the base sample name
        if (sample_name.endswith("-germline") and len(vcf_samples) == 1
                and sample_name.replace("-germline", "") == vcf_samples[0]):
            data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, assign the combined file to the tumor
    sample instead of splitting, and remove variant files from the normal.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []
    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        # handle trailing normals, which we don't need to process
        if len(data["group_orig"]) == 1 and vcfutils.get_paired_phenotype(data["group_orig"][0][0]) == "normal":
            sub_data, sub_vrn_file = data["group_orig"][0]
            sub_data.pop("vrn_file", None)
            sub_data["vrn_file-shared"] = sub_vrn_file
            out.append(sub_data)
        else:
            has_tumor = False
            for sub_data, sub_vrn_file in data["group_orig"]:
                paired_phenotype = vcfutils.get_paired_phenotype(sub_data)
                if paired_phenotype == "tumor":
                    has_tumor = True
                    # the tumor carries the combined calls; link into place
                    if not os.path.exists(sub_vrn_file):
                        utils.symlink_plus(vrn_file, sub_vrn_file)
                    sub_data["vrn_file"] = sub_vrn_file
                    out.append(sub_data)
                else:
                    # normals keep only a shared reference to the file
                    sub_data.pop("vrn_file", None)
                    sub_data["vrn_file-shared"] = sub_vrn_file
                    out.append(sub_data)
            if not has_tumor:
                raise ValueError("Did not find tumor sample in paired analysis")
    # population or single sample
    else:
        for sub_data, sub_vrn_file in data["group_orig"]:
            if len(vcfutils.get_samples(vrn_file)) > 1:
                # extract this sample's calls from the batch file
                vcfutils.select_sample(vrn_file, sub_data["name"][-1], sub_vrn_file, config)
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(vrn_file, sub_vrn_file)
            if sub_vrn_file:
                sub_data["vrn_file"] = sub_vrn_file
            out.append(sub_data)
    return out
def _comparison_stats_from_merge(in_file, stats, svcaller, data):
    """Classify calls in a merged SURVIVOR VCF as tp/fn/fp.

    Uses the SUPP_VEC INFO annotation: support in both truth and eval is a
    true positive, truth-only a false negative, eval-only a false positive.
    """
    truth_stats = {"tp": [], "fn": [], "fp": []}
    # label each VCF sample column as the truth or evaluation side
    labels = ["truth" if name.endswith("-truth") else "eval"
              for name in vcfutils.get_samples(in_file)]
    with open(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("#"):
                continue
            fields = line.rstrip().split("\t")
            supp_entries = [entry for entry in fields[7].split(";")
                            if entry.startswith("SUPP_VEC=")]
            _, vec = supp_entries[0].split("=")
            support = dict(zip(labels, [int(flag) for flag in vec]))
            if support["truth"]:
                metric = "tp" if support["eval"] else "fn"
            else:
                metric = "fp"
            truth_stats[metric].append(_summarize_call(fields))
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Prepares bgzipped/indexed inputs, restricts multi-sample calls to the
    current sample, then runs `rtg vcfeval` and returns paths to the
    tp/fp/fn output VCFs.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" sentinel marks a previously completed run; otherwise start fresh
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # rtg requires bgzipped, tabix-indexed VCF inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # multi-sample input: extract only the current sample for comparison
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # cap threads at 6; RTG memory scales 1g per thread
        threads = min(dd.get_num_cores(data), 6)
        mem = "%sg" % threads
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        # score field chosen per-caller for building ROC curves
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=%s" % mem
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer rtg versions emit tp-baseline (truth representation) separately
    # from tp (call representation); prefer the baseline when available
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Prepares bgzipped/indexed inputs, restricts multi-sample calls to the
    current sample, then runs `rtg vcfeval` and returns paths to the
    tp/fp/fn output VCFs.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" sentinel marks a previously completed run; otherwise start fresh
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # rtg requires bgzipped, tabix-indexed VCF inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # multi-sample input: extract only the current sample for comparison
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # derive the rtg SDF reference location from the genome FASTA path
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        caller = _get_caller(data)
        # flexible quality scores for building ROC curves, handle multiple cases
        # MuTect has no quality scores
        # not clear how to get t_lod_fstar into VCF cleanly
        if caller == "mutect":
            cmd += ["--vcf-score-field=BQ"]
        # otherwise use quality score as a standard
        # Discussion point: is it worth using caller specific annotations or settling
        # on a single metric for comparison
        else:
            cmd += ["--vcf-score-field=QUAL"]
        # Fix: separate the two export statements with '&&'; the original
        # "export A='x' export RTG_MEM=5g" passed the literal word 'export' and
        # RTG_MEM=5g as extra arguments to the first export command
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Prepares bgzipped/indexed inputs, restricts multi-sample calls to the
    current sample, then runs `rtg vcfeval` and returns paths to the
    tp/fp/fn output VCFs.
    """
    out_dir = os.path.join(base_dir, "rtg")
    # "done" sentinel marks a previously completed run; otherwise start fresh
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        # rtg requires bgzipped, tabix-indexed VCF inputs
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        # multi-sample input: extract only the current sample for comparison
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        # derive the rtg SDF reference location from the genome FASTA path
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "--threads", "6",
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        # score field chosen per-caller for building ROC curves
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        cmd = "export RTG_JAVA_OPTS='-Xms1g' && export RTG_MEM=5g && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # newer rtg versions emit tp-baseline (truth representation) separately
    # from tp (call representation); prefer the baseline when available
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def run_peddy(samples, out_dir=None):
    """Run peddy pedigree/relatedness QC on a batch of samples.

    Selects a suitable germline VCF containing the sample with non-filtered
    variants, runs peddy against the generated PED file, and attaches the
    output files to sample summary QC. Skips (writing a ``-failed.log``
    marker) when peddy is unavailable, turned off, the genome is non-human,
    or no usable VCF is found.

    :param samples: list of bcbio sample dictionaries for one batch.
    :param out_dir: optional output directory override.
    :returns: the input samples, with peddy QC files attached on success.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    # find a VCF that contains this sample and has non-filtered variants,
    # preferring germline calls; tumor samples get germline variants extracted
    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        # record the skip so later runs short-circuit on the -failed.log marker
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            # Fix: narrowed from a bare `except:` which also intercepted
            # KeyboardInterrupt/SystemExit into the allowed-errors handling
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except Exception:
                # inspect peddy's stderr: some failures (no overlapping sites /
                # degenerate PCA inputs) are expected and treated as skips
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)
                def allowed_errors(l):
                    # known-benign failure signatures from peddy/sklearn
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))
                def all_line_errors(l):
                    # cyvcf2 "no intervals found" noise -- benign only if it is all there is
                    return (l.find("no intervals found for") >= 0)
                if any(allowed_errors(l) for l in to_show) or all(all_line_errors(l) for l in to_show):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            # move outputs from the transactional directory to the final location
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)