def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    For a tumor/normal pair the tumor sample is the input and the normal,
    when it has a BAM, is the background. Otherwise the batch is split
    into inputs and backgrounds with ``shared.find_case_control``.

    items -- sample data dictionaries for the batch; each needs ``align_bam``.
    background -- optional pre-defined background; must be empty when the
      batch is not a tumor/normal pair.

    Returns the items, each with a ``wham`` entry appended to its ``sv`` list.
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # A pair without a normal BAM must give an empty background: passing
        # [None] on would hand a missing BAM to WHAM and turn on use_lrt below.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_vcf_file = _run_wham(inputs, background_bams)
    wclass_vcf_file = _add_wham_classification(orig_vcf_file, inputs)
    vcf_file = _fix_vcf(wclass_vcf_file, inputs, background_names)
    bed_file = _convert_to_bed(vcf_file, inputs, use_lrt=len(background_bams) > 0)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": _subset_to_sample(bed_file, data),
                           "vcf_file": vcf_file})
        out.append(data)
    return out
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    A supplied background is used directly; without one the batch is split
    into cases and controls. If that leaves several inputs and no
    background, every sample is run with the other batch members as its
    background.
    """
    if background:
        inputs = items
    else:
        inputs, background = shared.find_case_control(items)
    # Per-sample fallback: multiple inputs and nothing to use as background
    if not (len(inputs) == 1 or len(background) > 0):
        results = []
        for sample in items:
            others = [d for d in items
                      if dd.get_sample_name(d) != dd.get_sample_name(sample)]
            sample_ckouts = _run_cnvkit_shared([sample], others)
            results.extend(_associate_cnvkit_out(sample_ckouts, [sample]))
        return results
    # Case/control organized background or a single sample
    ckouts = _run_cnvkit_shared(inputs, background)
    return _associate_cnvkit_out(ckouts, inputs) + background
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    For a tumor/normal pair the tumor sample is the input and the normal,
    when it has a BAM, is the background. Otherwise the batch is split
    into inputs and backgrounds with ``shared.find_case_control``.

    items -- sample data dictionaries for the batch; each needs ``align_bam``.
    background -- optional pre-defined background; must be empty when the
      batch is not a tumor/normal pair.

    Returns only the input samples, each with a ``wham`` entry (per-sample
    BED plus the shared BEDPE) appended to its ``sv`` list.
    """
    if not background:
        background = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        # A pair without a normal BAM must give an empty background rather
        # than [None] being passed to WHAM and the per-sample BED step.
        if paired.normal_bam:
            background_bams = [paired.normal_bam]
            background_names = [paired.normal_name]
        else:
            background_bams = []
            background_names = []
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
        background_names = [dd.get_sample_name(x) for x in background]
    orig_bedpe = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({
            "variantcaller": "wham",
            "vrn_file": _get_sample_bed(orig_bedpe, data, background_names),
            "vrn_bedpe": orig_bedpe
        })
        out.append(data)
    return out
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    Calls WHAM once for the inputs (with any background BAMs), then
    extracts a per-sample VCF for every input and, when a background is
    available, filters it with ``filter_by_background``.

    Returns the input samples with a ``wham`` entry appended to ``sv``.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([d["align_bam"] for d in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [b["align_bam"] for b in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        data.setdefault("sv", [])
        # Per-sample output is named after the combined VCF plus sample name
        vcf_base = utils.splitext_plus(orig_vcf)[0]
        sample_vcf = "%s-%s.vcf.gz" % (vcf_base, dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        call = {"variantcaller": "wham", "vrn_file": sample_vcf}
        data["sv"].append(call)
        out.append(data)
    return out
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    Calls WHAM once for the inputs (with any background BAMs), extracts a
    per-sample VCF for every input, optionally filters it against the
    background and then annotates it via ``effects.add_to_vcf``.

    Returns the input samples with a ``wham`` entry appended to ``sv``; the
    entry points at the annotated VCF when one was produced, otherwise at
    the un-annotated per-sample VCF.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([d["align_bam"] for d in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [b["align_bam"] for b in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        data.setdefault("sv", [])
        # Per-sample output is named after the combined VCF plus sample name
        vcf_base = utils.splitext_plus(orig_vcf)[0]
        sample_vcf = "%s-%s.vcf.gz" % (vcf_base, dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
        final_vcf = effects_vcf or sample_vcf
        data["sv"].append({"variantcaller": "wham", "vrn_file": final_vcf})
        out.append(data)
    return out
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are passed through untouched unless cnvkit or gatk-cnv is among
    the configured callers (don't normalize when running purecn alone).
    Returns one single-item list per sample.
    """
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(items[0])
    if "gatk-cnv" not in sv_callers and "cnvkit" not in sv_callers:
        return [[d] for d in items]

    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit,
               "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]

    def _bin_group(data):
        return tz.get_in(["regions", "bins", "group"], data)

    out_files = {}
    back_files = {}
    for group_id, group in itertools.groupby(items, _bin_group):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(group))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        bins_dir = os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                dd.get_sample_name(inputs[0]), "bins")
        work_dir = utils.safe_makedir(bins_dir)
        # The normalization method follows the binning approach of the first input
        normalize = calcfns[cnvkit.bin_approach(inputs[0])]
        back_files, out_files = normalize(group_id, inputs, backgrounds, work_dir,
                                          back_files, out_files)

    out = []
    for data in items:
        name = dd.get_sample_name(data)
        if name in out_files:
            bins = data["depth"]["bins"]
            bins["background"] = back_files[name]
            bins["normalized"] = out_files[name]
        out.append([data])
    return out
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Dispatches each bin group to the cnvkit or gatk-cnv normalization
    function and records the resulting background and normalized files on
    each sample. Returns one single-item list per sample.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit,
               "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]

    def _bin_group(data):
        return tz.get_in(["regions", "bins", "group"], data)

    out_files = {}
    back_files = {}
    for group_id, group in itertools.groupby(items, _bin_group):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(group))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        bins_dir = os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                dd.get_sample_name(inputs[0]), "bins")
        work_dir = utils.safe_makedir(bins_dir)
        # The normalization method follows the binning approach of the first input
        normalize = calcfns[cnvkit.bin_approach(inputs[0])]
        back_files, out_files = normalize(group_id, inputs, backgrounds, work_dir,
                                          back_files, out_files)

    out = []
    for data in items:
        name = dd.get_sample_name(data)
        if name in out_files:
            bins = data["depth"]["bins"]
            bins["background"] = back_files[name]
            bins["normalized"] = out_files[name]
        out.append([data])
    return out
def _run_cnvkit_population(items, background, access_file):
    """Run CNVkit on a population of samples.

    Splits the batch into cases and controls, then calls every case
    independently via ``_run_cnvkit_single`` with the controls as
    background. A pre-supplied background is not supported here.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    out = []
    for data in inputs:
        called = _run_cnvkit_single(data, access_file, background)
        out.append(called[0])
    return out
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Splits the batch into cases and controls. Cases are called with the
    controls as background, and controls are called with the cases as
    background. A pre-supplied background is not supported here.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    case_calls = []
    for data in inputs:
        case_calls.append(_run_cnvkit_single(data, background)[0])
    control_calls = []
    for data in background:
        control_calls.append(_run_cnvkit_single(data, inputs)[0])
    return case_calls + control_calls
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Splits the batch into cases and controls and builds one access file
    from the first case. Cases are called with the controls as background,
    and controls are called with the cases as background.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    first = inputs[0]
    access_file = _create_access_file(dd.get_ref_file(first), _sv_workdir(first), first)
    case_calls = []
    for data in inputs:
        case_calls.append(_run_cnvkit_single(data, access_file, background)[0])
    control_calls = []
    for data in background:
        control_calls.append(_run_cnvkit_single(data, access_file, inputs)[0])
    return case_calls + control_calls
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Splits the batch into cases and controls and runs all cases in one
    shared CNVkit call, naming the background after the first control when
    there is one. Returns the associated case outputs followed by the
    controls.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    input_bams = [d["align_bam"] for d in inputs]
    background_bams = [d["align_bam"] for d in background]
    background_name = dd.get_sample_name(background[0]) if len(background) > 0 else None
    ckouts = _run_cnvkit_shared(inputs, input_bams, background_bams, work_dir,
                                background_name=background_name)
    return _associate_cnvkit_out(ckouts, inputs) + background
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Cases and controls come from ``shared.find_case_control``. All cases go
    through a single shared CNVkit run; the background takes its name from
    the first control, or has no name when there are no controls. Returns
    the associated case outputs followed by the controls.
    """
    assert not background
    inputs, background = shared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    case_bams = [sample["align_bam"] for sample in inputs]
    control_bams = [sample["align_bam"] for sample in background]
    if len(background) > 0:
        control_name = dd.get_sample_name(background[0])
    else:
        control_name = None
    ckouts = _run_cnvkit_shared(inputs, case_bams, control_bams, work_dir,
                                background_name=control_name)
    associated = _associate_cnvkit_out(ckouts, inputs)
    return associated + background
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a point
    for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Returns the original inputs untouched when no sample uses general SV
    bins; otherwise one single-item list per sample, with
    ``depth -> bins -> normalized`` set for every sample that was fixed.
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items
    out_files = {}
    # NOTE(review): groupby only merges adjacent items; this assumes samples
    # arrive ordered by bin group -- confirm against the caller.
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        # Target and antitarget coverage of every background sample, flattened
        cnns = [cnn for x in backgrounds
                for cnn in (tz.get_in(["depth", "bins", "target"], x),
                            tz.get_in(["depth", "bins", "antitarget"], x))]
        binned_inputs = [d for d in inputs if tz.get_in(["depth", "bins", "target"], d)]
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        # Only build a background when some input has bins. Otherwise the bin
        # files would be undefined or left over from a previous group, and
        # nothing below would use the result.
        back_file = None
        if binned_inputs:
            # The last input with bins supplies the target/antitarget regions
            target_bed = tz.get_in(["depth", "bins", "target"], binned_inputs[-1])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], binned_inputs[-1])
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = cnvkit.run_fix(tz.get_in(["depth", "bins", "target"], data),
                                          tz.get_in(["depth", "bins", "antitarget"], data),
                                          back_file,
                                          os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))),
                                          data)
                out_files[dd.get_sample_name(data)] = fix_file
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
def run(items, background=None):
    """Perform detection of structural variations with GRIDSS.

    For a tumor/normal pair the tumor is the input and the normal (when it
    has a BAM) the background; otherwise the batch is split into cases and
    controls. The single joint call file is annotated per sample and
    recorded as a ``gridss`` entry in the ``sv`` list of every item.
    """
    paired = vcfutils.get_paired(items)
    if not paired:
        assert not background
        inputs, background = sshared.find_case_control(items)
    else:
        inputs = [paired.tumor_data]
        background = [paired.normal_data] if paired.normal_bam else []
    work_dir = _sv_workdir(inputs[0])
    variant_file = _run_gridss(inputs, background, work_dir)
    out = []
    for data in items:
        data.setdefault("sv", [])
        effects_vcf, _ = effects.add_to_vcf(variant_file, data, "snpeff")
        # Fall back to the un-annotated calls when no annotated file results
        call = {"variantcaller": "gridss", "vrn_file": effects_vcf or variant_file}
        data["sv"].append(call)
        out.append(data)
    return out
def _run_cnvkit_population(items, background):
    """Run CNVkit on a population of samples.

    Prefers a supplied background, then a case/control split of the batch.
    When several inputs remain without any background, each sample is
    called separately against the remaining samples of the batch.
    """
    if not background:
        inputs, background = shared.find_case_control(items)
    else:
        inputs = items
    if len(inputs) == 1 or len(background) > 0:
        # Case/control organized background or a single sample
        shared_ckouts = _run_cnvkit_shared(inputs, background)
        return _associate_cnvkit_out(shared_ckouts, inputs) + background
    # Otherwise run each sample with the others in the batch as background
    out = []
    for current in items:
        batch_mates = [
            d for d in items
            if dd.get_sample_name(d) != dd.get_sample_name(current)
        ]
        ckouts = _run_cnvkit_shared([current], batch_mates)
        out.extend(_associate_cnvkit_out(ckouts, [current]))
    return out
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.

    Calls WHAM once for the inputs (with any background BAMs) and
    finalizes the combined VCF for each input via ``shared.finalize_sv``.

    Returns the input samples with a ``wham`` entry appended to ``sv``.
    """
    background = background or []
    background_bams = []
    paired = vcfutils.get_paired_bams([d["align_bam"] for d in items], items)
    if not paired:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [b["align_bam"] for b in background]
    else:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        data.setdefault("sv", [])
        call = {"variantcaller": "wham",
                "vrn_file": shared.finalize_sv(orig_vcf, data, items)}
        data["sv"].append(call)
        out.append(data)
    return out
def run(items, background=None):
    """Perform detection of structural variations with GRIDSS.

    Inputs and background come from the tumor/normal pairing when the batch
    is paired, otherwise from a case/control split. Every item in the batch
    receives a ``gridss`` entry in ``sv`` pointing at the annotated joint
    calls, or at the raw joint calls when annotation yields no file.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
        else:
            background = []
    else:
        assert not background
        inputs, background = sshared.find_case_control(items)
    variant_file = _run_gridss(inputs, background, _sv_workdir(inputs[0]))
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        annotated, _ = effects.add_to_vcf(variant_file, data, "snpeff")
        vrn_file = annotated or variant_file
        data["sv"].append({"variantcaller": "gridss", "vrn_file": vrn_file})
        out.append(data)
    return out
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a point
    for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix

    Returns one single-item list per sample. Samples that were fixed get
    ``depth -> bins -> background`` and ``depth -> bins -> normalized`` set.
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    # NOTE(review): groupby only merges adjacent items; this assumes samples
    # arrive ordered by bin group -- confirm against the caller.
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        # Target and antitarget coverage of every background sample, flattened
        cnns = [cnn for x in backgrounds
                for cnn in (tz.get_in(["depth", "bins", "target"], x),
                            tz.get_in(["depth", "bins", "antitarget"], x))]
        binned_inputs = [d for d in inputs if tz.get_in(["depth", "bins", "target"], d)]
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        # A user-supplied background reference takes priority over building one
        input_backs = {b for b in (dd.get_background_cnv_reference(d) for d in inputs)
                       if b is not None}
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        elif binned_inputs:
            # The last input with bins supplies the target/antitarget regions
            target_bed = tz.get_in(["depth", "bins", "target"], binned_inputs[-1])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], binned_inputs[-1])
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        else:
            # No input has bins: the bin files would be undefined or left over
            # from a previous group, and nothing below uses a background.
            back_file = None
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                       tz.get_in(["depth", "bins", "antitarget"], data),
                                       back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        # Run the collected fix commands for this group via run_multicore
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out