def use_general_sv_bins(data):
    """Check if we should use a general binning approach for a sample.

    Checks if CNVkit is enabled and we haven't already run CNVkit.
    """
    enabled_callers = dd.get_svcaller(data)
    wants_general_bins = any(c in enabled_callers for c in ("cnvkit", "titancna"))
    # Only bin when no CNVkit coverage output already exists for this sample
    return bool(wants_general_bins and not _get_original_coverage(data))
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    out_target_file = out_anti_file = None
    if cnvkit.use_general_sv_bins(data):
        calc_dispatch = {"cnvkit": _calculate_sv_coverage_cnvkit,
                         "gatk-cnv": _calculate_sv_coverage_gatk}
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calc_dispatch[cnvkit.bin_approach(data)](data, work_dir)
        # Treat a missing output file the same as not having run binning at all
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = None, None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None
    # Ensure the depth/bins container exists before storing results
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file,
                             "seq2c": seq2c_target}
    return [[data]]
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    target_file, anti_file = None, None
    if cnvkit.use_general_sv_bins(data):
        bin_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                  dd.get_sample_name(data), "bins"))
        approach_fns = {"cnvkit": _calculate_sv_coverage_cnvkit,
                        "gatk-cnv": _calculate_sv_coverage_gatk}
        target_file, anti_file = approach_fns[cnvkit.bin_approach(data)](data, bin_dir)
        # A missing target file means binning produced nothing usable
        if not os.path.exists(target_file):
            target_file, anti_file = None, None
    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    # Create the depth/bins container if it is not already present
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": target_file, "antitarget": anti_file,
                             "seq2c": seq2c_target}
    return [[data]]
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    data is one sample
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import get_svcallers
    data = utils.to_single_data(data)
    sv_callers = get_svcallers(data)
    # Binning only applies when a CNV caller that consumes bins is requested
    cnv_requested = bool({"cnvkit", "gatk-cnv"} & set(sv_callers))
    out_target_file = out_anti_file = None
    if cnvkit.use_general_sv_bins(data) and cnv_requested:
        approach_fns = {"cnvkit": _calculate_sv_coverage_cnvkit,
                        "gatk-cnv": _calculate_sv_coverage_gatk}
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = approach_fns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = None, None
    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    purecn_target = None
    if "purecn" in dd.get_svcaller(data):
        # set purecn_pon_build flag
        batches = dd.get_batch(data)
        if batches and "pon_build" in batches:
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # still calculate coverage even when not building pon - for t-only analysis
        purecn_target = purecn.get_coverage(data)
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file,
                             "seq2c": seq2c_target, "purecn": purecn_target}
    return [[data]]
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis."""
    # Each flag is True when any sample in the run requests that step
    return {"vc": any(dd.get_variantcaller(d) for d in samples),
            "sv": any(dd.get_svcaller(d) for d in samples),
            "jointvc": any(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))
                           for d in samples),
            "hla": any(dd.get_hlacaller(d) for d in samples)}
def use_general_sv_bins(data):
    """Check if we should use a general binning approach for a sample.

    Checks if CNVkit is enabled and we haven't already run CNVkit.
    """
    enabled = dd.get_svcaller(data)
    for caller in ("cnvkit", "titancna", "purecn", "gatk-cnv"):
        if caller in enabled:
            # Skip binning when CNVkit coverage output already exists
            return not _get_original_coverage(data)
    return False
def bin_approach(data):
    """Check for binning approach from configuration or normalized file.

    Returns "cnvkit" or "gatk-cnv" based first on the configured SV callers,
    then on the file extension of the normalized coverage file; returns None
    when neither source identifies an approach.
    """
    for approach in ["cnvkit", "gatk-cnv"]:
        if approach in dd.get_svcaller(data):
            return approach
    norm_file = tz.get_in(["depth", "bins", "normalized"], data)
    # Fix: tz.get_in returns None when no normalized file has been recorded;
    # the previous code called .endswith on None, raising AttributeError.
    if norm_file:
        if norm_file.endswith(("-crstandardized.tsv", "-crdenoised.tsv")):
            return "gatk-cnv"
        if norm_file.endswith(".cnr"):
            return "cnvkit"
    return None
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis."""
    checkpoints = {}
    checkpoints["vc"] = any(dd.get_variantcaller(d) for d in samples)
    checkpoints["sv"] = any(dd.get_svcaller(d) for d in samples)
    # Joint calling needs both a joint caller (or gvcf tool) and a batch
    checkpoints["jointvc"] = any((dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
                                 and dd.get_batch(d) for d in samples)
    checkpoints["hla"] = any(dd.get_hlacaller(d) for d in samples)
    checkpoints["align"] = any(dd.get_aligner(d) or dd.get_bam_clean(d) for d in samples)
    # Split alignment unless every sample disables splitting or skips alignment
    checkpoints["align_split"] = any(not (dd.get_align_split_size(d) is False
                                          or not dd.get_aligner(d))
                                     for d in samples)
    return checkpoints
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch
    """
    singles = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(singles, singles[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    # no SV calling when just creating a PON for PureCN
    if dd.get_batch(items[0]) == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return []
    if not (svcaller and caller_fn):
        # No caller configured for this pass: pass samples through unchanged
        out = [[data] for data in items]
    else:
        needs_background = (all_items and svcaller in _NEEDS_BACKGROUND
                            and not vcfutils.is_paired_analysis(
                                [x.get("align_bam") for x in items], items))
        if needs_background:
            batch_names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in batch_names]
            results = caller_fn(items, background)
        else:
            results = caller_fn(items)
        out = [[called] for called in results]
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for packed in out:
            data = utils.to_single_data(packed)
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            out_cwl.append([_add_supplemental(data)])
        return out_cwl
    return out
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis."""
    # Each flag is True when any sample in the run requests that step
    return {
        "vc": any(dd.get_variantcaller(d) or d.get("vrn_file") for d in samples),
        "sv": any(dd.get_svcaller(d) for d in samples),
        "jointvc": any(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)
                       for d in samples),
        "hla": any(dd.get_hlacaller(d) for d in samples),
        "align": any(dd.get_aligner(d) or dd.get_bam_clean(d) for d in samples),
        # Split alignment unless all samples disable splitting or skip alignment
        "align_split": not all(dd.get_align_split_size(d) is False or not dd.get_aligner(d)
                               for d in samples),
        "umi": any(dd.get_umi_consensus(d) for d in samples),
        "ensemble": any(dd.get_ensemble(d) for d in samples),
        "cancer": any(dd.get_phenotype(d) in ["tumor"] for d in samples),
    }
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis."""
    def _any(pred):
        # True when the predicate holds for any sample in the run
        return any(pred(d) for d in samples)

    checkpoints = {}
    checkpoints["vc"] = _any(lambda d: dd.get_variantcaller(d) or d.get("vrn_file"))
    checkpoints["sv"] = _any(dd.get_svcaller)
    checkpoints["jointvc"] = _any(lambda d: dd.get_jointcaller(d)
                                  or "gvcf" in dd.get_tools_on(d))
    checkpoints["hla"] = _any(dd.get_hlacaller)
    checkpoints["align"] = _any(lambda d: dd.get_aligner(d) or dd.get_bam_clean(d))
    # Split alignment unless every sample disables splitting or skips alignment
    checkpoints["align_split"] = _any(lambda d: not (dd.get_align_split_size(d) is False
                                                     or not dd.get_aligner(d)))
    checkpoints["archive"] = _any(dd.get_archive)
    checkpoints["umi"] = _any(dd.get_umi_consensus)
    checkpoints["ensemble"] = _any(dd.get_ensemble)
    checkpoints["cancer"] = _any(lambda d: dd.get_phenotype(d) in ["tumor"])
    return checkpoints
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair

    Two distinct code paths: a PureCN-specific path (when "purecn" is among the
    configured SV callers) that keeps germline sites and uses the gnomAD
    germline resource, and a regular MuTect2 path with AF filtering.
    Returns the bgzipped, indexed output VCF path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PURECN requirement alters Mutect2 variants calling!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"],
                                              items[0])
                # germline resource path is resolved relative to the reference directory
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file),
                                                              germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                # NOTE(review): tx_vcf appears unused below — confirm before removing
                tx_vcf = os.path.splitext(tx_out_file)[0]
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                # T/N pair
                if len(items) == 2:
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("You are running mutect2 in PureCN analysis in T/N mode, T-only + PON is recommended")
                else:
                    #T only
                    params += ["-I", input_bam]
                    # adding SNV PON from background/variant
                    snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                    # Don't apply a PON while building one
                    if snv_pon and dd.get_batch(items[0]) != "pon_build":
                        params += ["-pon", snv_pon]
                        params += ["--genotype-pon-sites"]
                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50, sometimes 100 or 200 is recommended for better sensitivity in detection
                # hom del CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    # options come as a flat [flag, value, flag, value, ...] list
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]
                params += ["--max-mnp-distance", "0",
                           "--interval-padding", interval_padding,
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]
                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf,
                                             out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
                # no AF filter for PureCN variants
                out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"],
                                                         include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                # Read-orientation model requires paired reads and explicit opt-in
                if all(is_paired(bam) for bam in align_bams) and (
                        "mutect2_readmodel" in utils.get_in(items[0], "config", "tools_on")):
                    orientation_filter = True
                else:
                    orientation_filter = False
                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]
                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                #params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                if gatk_type == "gatk4":
                    # gatk4: raw call -> (optional read-orientation model) -> filter
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz"
                        tx_f1r2_file = tx_f1r2_file.format(utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner, f1r2_file,
                                                                  tx_f1r2_file)
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file, tx_f1r2_file)
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                    if orientation_filter:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                    else:
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    # pre-gatk4 writes the VCF to stdout
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])