def _titan_cn_file(cnr_file, work_dir, data):
    """Convert CNVkit or GATK4 normalized input into TitanCNA ready format.

    Reads the tab-separated ``cnr_file`` (lines starting with ``@`` are
    treated as comments), keeps the four coordinate/log-ratio columns that
    match the active bin approach, renames them to ``chrom``, ``start``,
    ``end`` and ``logR`` and writes them tab-separated to
    ``<work_dir>/<cnr base name>.cn``.

    Args:
        cnr_file: Path to the normalized copy number input file.
        work_dir: Directory that receives the ``.cn`` output file.
        data: Sample dictionary; handed to ``cnvkit.bin_approach`` to select
            the input column names and to ``file_transaction``.

    Returns:
        Path of the output file. The file is only (re)written when
        ``utils.file_uptodate(out_file, cnr_file)`` is false.

    Raises:
        KeyError: If the bin approach is neither ``cnvkit`` nor ``gatk-cnv``,
            or if the expected columns are missing from ``cnr_file``.
    """
    support_cols = {"cnvkit": ["chromosome", "start", "end", "log2"],
                    "gatk-cnv": ["CONTIG", "START", "END", "LOG2_COPY_RATIO"]}
    out_cols = ["chrom", "start", "end", "logR"]
    base_name = utils.splitext_plus(os.path.basename(cnr_file))[0]
    out_file = os.path.join(work_dir, "%s.cn" % base_name)
    # The approach does not change while converting: look it up once.
    approach = cnvkit.bin_approach(data)
    cols = support_cols[approach]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            iterator = pd.read_table(cnr_file, sep="\t", iterator=True, header=0, comment="@")
            with open(tx_out_file, "w") as handle:
                for chunk_index, chunk in enumerate(iterator):
                    # Work on an explicit copy so the in-place edits below
                    # never operate on a slice of the parsed frame.
                    chunk = chunk[cols].copy()
                    chunk.columns = out_cols
                    if approach == "cnvkit":
                        # CNVkit starts are shifted by one for TitanCNA.
                        # NOTE(review): presumably 0-based -> 1-based; confirm.
                        chunk["start"] += 1
                    # Emit the column header once, not once per chunk.
                    chunk.to_csv(handle, mode="a", sep="\t", index=False,
                                 header=(chunk_index == 0))
    return out_file
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    The sample is unwrapped with ``utils.to_single_data``. When general SV
    bins are in use, the coverage function matching the bin approach is run
    inside ``<work_dir>/structural/<sample>/bins``; its results are dropped
    (set to None) if the target file does not exist afterwards. A seq2c
    pre-call is added when ``seq2c`` is among the configured SV callers.

    Returns:
        ``[[data]]`` where ``data["depth"]["bins"]`` holds the ``target``,
        ``antitarget`` and ``seq2c`` entries (each may be None).
    """
    coverage_fns = {
        "cnvkit": _calculate_sv_coverage_cnvkit,
        "gatk-cnv": _calculate_sv_coverage_gatk,
    }
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    # Default to "no coverage files" and only fill in when bins are used.
    out_target_file = None
    out_anti_file = None
    if cnvkit.use_general_sv_bins(data):
        bins_dir = os.path.join(dd.get_work_dir(data), "structural",
                                dd.get_sample_name(data), "bins")
        work_dir = utils.safe_makedir(bins_dir)
        coverage_fn = coverage_fns[cnvkit.bin_approach(data)]
        out_target_file, out_anti_file = coverage_fn(data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file = None
            out_anti_file = None

    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)

    # Make sure the nested depth -> bins path exists before assigning to it.
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda _: {})
    data["depth"]["bins"] = {
        "target": out_target_file,
        "antitarget": out_anti_file,
        "seq2c": seq2c_target,
    }
    return [[data]]
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    Nothing is run when the ``rds`` output is already up to date relative to
    the normalized coverage file of the tumor sample. Otherwise the
    normalized coverage is segmented with the function matching the bin
    approach, the tumor variants are filtered via
    ``germline.filter_to_pass_and_reject`` and PureCN.R is executed inside a
    file transaction; the expected output files are then moved from the
    transaction directory next to ``out_base``.

    Args:
        paired: Tumor/normal pairing object; ``paired.tumor_data`` is used.
        work_dir: Directory for intermediate and final outputs.

    Returns:
        The output dictionary provided by ``_get_purecn_files``.
    """
    segment_fns = {"cnvkit": _segment_normalized_cnvkit,
                   "gatk-cnv": _segment_normalized_gatk}
    tumor = paired.tumor_data
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    cnr_file = tz.get_in(["depth", "bins", "normalized"], tumor)
    if utils.file_uptodate(out["rds"], cnr_file):
        return out

    segment_fn = segment_fns[cnvkit.bin_approach(tumor)]
    cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
    from bcbio import heterogeneity
    variants = heterogeneity.get_variants(tumor, include_germline=False)
    vcf_file = variants[0]["vrn_file"]
    vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
    with file_transaction(tumor, out_base) as tx_out_base:
        # Use UCSC style naming for human builds to support BSgenome
        build = dd.get_genome_build(tumor)
        genome = "hg19" if build in ["GRCh37", "hg19"] else build
        cmd = ["PureCN.R", "--seed", "42",
               "--out", tx_out_base,
               "--rds", "%s.rds" % tx_out_base,
               "--sampleid", dd.get_sample_name(tumor),
               "--genome", genome,
               "--vcf", vcf_file,
               "--tumor", cnr_file,
               "--segfile", seg_file,
               "--funsegmentation", "Hclust",
               "--maxnonclonal", "0.3"]
        cores = dd.get_num_cores(tumor)
        if cores > 1:
            cmd.extend(["--cores", str(cores)])
        do.run(cmd, "PureCN copy number calling")
        # Outputs land in the transaction directory; relocate them.
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        for fname in all_files:
            shutil.move(os.path.join(tx_dir, fname), os.path.join(final_dir, fname))
    return out
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Don't normalize when running purecn alone: samples are passed through
    unchanged unless ``gatk-cnv`` or ``cnvkit`` is among the SV callers of
    the first sample. They are also passed through unchanged when no sample
    uses general SV bins.

    Samples are grouped (consecutively) by their ``regions -> bins -> group``
    value; for every group with an identifier the normalization function
    matching the bin approach of the first input sample is run.

    Returns:
        A list of single-item lists, one per sample. Samples with a
        normalization result get ``background`` and ``normalized`` entries
        in ``data["depth"]["bins"]``.
    """
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(items[0])
    if "gatk-cnv" not in sv_callers and "cnvkit" not in sv_callers:
        return [[d] for d in items]

    normalize_fns = {"cnvkit": _normalize_sv_coverage_cnvkit,
                     "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]

    def _bin_group(sample):
        return tz.get_in(["regions", "bins", "group"], sample)

    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, _bin_group):
        # No CNVkit calling for this particular set of samples
        if group_id is not None:
            inputs, backgrounds = sshared.find_case_control(list(gitems))
            assert inputs, "Did not find inputs for sample batch: %s" % (
                " ".join(dd.get_sample_name(x) for x in items))
            first_input = inputs[0]
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(first_input), "structural",
                             dd.get_sample_name(first_input), "bins"))
            normalize_fn = normalize_fns[cnvkit.bin_approach(first_input)]
            back_files, out_files = normalize_fn(group_id, inputs, backgrounds,
                                                 work_dir, back_files, out_files)

    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            bins = data["depth"]["bins"]
            bins["background"] = back_files[sample_name]
            bins["normalized"] = out_files[sample_name]
        out.append([data])
    return out
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    Returns:
        A list of single-item lists, one per sample. Samples that use general
        SV bins get ``data["regions"]["bins"]`` filled with ``target``,
        ``antitarget``, ``group`` and ``gcannotated`` entries.

    Raises:
        AssertionError: If the number of samples produced differs from the
            number of samples passed in.
    """
    bin_fns = {"cnvkit": _calculate_sv_bins_cnvkit,
               "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]

    out = []
    batches = multi.group_by_batch(items, False)
    for group_index, cnv_group in enumerate(_group_by_cnv_method(batches)):
        sizes = MemoizedSizes(cnv_group.region_file, cnv_group.items)
        size_calc_fn = sizes.get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                bin_fn = bin_fns[cnvkit.bin_approach(data)]
                target_bed, anti_bed, gcannotated_tsv = bin_fn(data, cnv_group, size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                # The group index is stored as a string identifier.
                data["regions"]["bins"] = {"target": target_bed,
                                           "antitarget": anti_bed,
                                           "group": str(group_index),
                                           "gcannotated": gcannotated_tsv}
            out.append([data])

    if len(out) != len(items):
        out_names = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out])
        in_names = sorted([dd.get_sample_name(x) for x in items])
        raise AssertionError(
            "Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s"
            % (out_names, in_names))
    return out
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are passed through unchanged when none of them uses general SV
    bins. Otherwise they are grouped (consecutively) by their
    ``regions -> bins -> group`` value and, for every group with an
    identifier, the normalization function matching the bin approach of the
    first input sample is run.

    Returns:
        A list of single-item lists, one per sample. Samples with a
        normalization result get ``background`` and ``normalized`` entries
        in ``data["depth"]["bins"]``.
    """
    normalize_fns = {"cnvkit": _normalize_sv_coverage_cnvkit,
                     "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if not any(cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]

    def _bin_group(sample):
        return tz.get_in(["regions", "bins", "group"], sample)

    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, _bin_group):
        # No CNVkit calling for this particular set of samples
        if group_id is not None:
            inputs, backgrounds = sshared.find_case_control(list(gitems))
            assert inputs, "Did not find inputs for sample batch: %s" % (
                " ".join(dd.get_sample_name(x) for x in items))
            lead = inputs[0]
            bins_dir = os.path.join(dd.get_work_dir(lead), "structural",
                                    dd.get_sample_name(lead), "bins")
            work_dir = utils.safe_makedir(bins_dir)
            normalize_fn = normalize_fns[cnvkit.bin_approach(lead)]
            back_files, out_files = normalize_fn(group_id, inputs, backgrounds,
                                                 work_dir, back_files, out_files)

    out = []
    for data in items:
        name = dd.get_sample_name(data)
        if name in out_files:
            data["depth"]["bins"]["background"] = back_files[name]
            data["depth"]["bins"]["normalized"] = out_files[name]
        out.append([data])
    return out
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    Coverage is only computed when the sample uses general SV bins; the
    result is discarded when the target file is missing afterwards. A seq2c
    pre-call is performed when ``seq2c`` is among the configured SV callers.

    Returns:
        ``[[data]]`` with ``data["depth"]["bins"]`` set to a dictionary with
        ``target``, ``antitarget`` and ``seq2c`` keys (values may be None).
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit,
               "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    targets = (None, None)
    if cnvkit.use_general_sv_bins(data):
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural",
                         dd.get_sample_name(data), "bins"))
        targets = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        # A missing target file means there is no usable coverage output.
        if not os.path.exists(targets[0]):
            targets = (None, None)
    out_target_file, out_anti_file = targets

    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    has_bins = tz.get_in(["depth", "bins"], data)
    if not has_bins:
        # Create the nested depth -> bins container before assignment.
        data = tz.update_in(data, ["depth", "bins"], lambda existing: {})
    data["depth"]["bins"] = {"target": out_target_file,
                             "antitarget": out_anti_file,
                             "seq2c": seq2c_target}
    return [[data]]
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    Returns:
        One single-item list per input sample; samples using general SV bins
        carry the computed bins under ``data["regions"]["bins"]``.

    Raises:
        AssertionError: If the output does not contain as many samples as
            the input.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit,
               "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    uses_bins = [cnvkit.use_general_sv_bins(x) for x in items]
    if not any(uses_bins):
        return [[d] for d in items]

    out = []
    cnv_groups = _group_by_cnv_method(multi.group_by_batch(items, False))
    for idx, cnv_group in enumerate(cnv_groups):
        size_calc_fn = MemoizedSizes(cnv_group.region_file,
                                     cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                approach = cnvkit.bin_approach(data)
                target_bed, anti_bed, gcannotated_tsv = calcfns[approach](
                    data, cnv_group, size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                bins = {"target": target_bed, "antitarget": anti_bed,
                        "group": str(idx), "gcannotated": gcannotated_tsv}
                data["regions"]["bins"] = bins
            # Every sample is returned, with or without bins.
            out.append([data])

    if len(out) != len(items):
        names_out = sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out])
        names_in = sorted([dd.get_sample_name(x) for x in items])
        message = "Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s"
        raise AssertionError(message % (names_out, names_in))
    return out
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    PureCN is only run when the ``rds`` output is not up to date relative to
    the normalized coverage file and no ``<out_base>-failed.log`` marker from
    a previous run exists. The normalized coverage is segmented with the
    function matching the bin approach, the tumor variants are filtered via
    ``germline.filter_to_pass_and_reject`` and PureCN.R is run through the
    shell inside a file transaction. Output files that exist in the
    transaction directory are moved next to ``out_base``.

    A failure accepted by ``_allowed_errors`` is logged and its message is
    written to the failed-log marker; the function then continues.

    Args:
        paired: Tumor/normal pairing object; ``paired.tumor_data`` is used.
        work_dir: Directory for intermediate and final outputs.

    Returns:
        The output dictionary from ``_get_purecn_files(...,
        require_exist=True)`` when its ``rds`` file exists, otherwise None.

    Raises:
        subprocess.CalledProcessError: When PureCN fails with an error that
            ``_allowed_errors`` does not accept.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit,
              "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](
            cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(
            paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(paired.tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base,
                   "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(paired.tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust",
                   "--maxnonclonal", "0.3"]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                # Run through the shell so the R environment exports apply.
                cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(),
                    " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(paired.tumor_data))
                    # Leave a marker so this sample is not retried.
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    # Logger.exception requires a message; calling it with no
                    # arguments raises TypeError and hides the real failure.
                    logger.exception("PureCN copy number calling failed for %s",
                                     dd.get_sample_name(paired.tumor_data))
                    raise
            # Only move outputs that were actually produced.
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    data is one sample

    Bin coverage is only computed when the sample uses general SV bins and
    ``cnvkit`` or ``gatk-cnv`` is among its SV callers; the result is
    discarded when the target file is missing afterwards. seq2c and PureCN
    inputs are prepared when those callers are configured. For PureCN the
    ``purecn_pon_build`` algorithm flag is set when the sample's batch
    information contains ``pon_build``.

    Returns:
        ``[[data]]`` with ``data["depth"]["bins"]`` set to a dictionary with
        ``target``, ``antitarget``, ``seq2c`` and ``purecn`` keys (values may
        be None).
    """
    coverage_fns = {"cnvkit": _calculate_sv_coverage_cnvkit,
                    "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(data)
    has_cnvkit_or_gatkcnv = bool({"cnvkit", "gatk-cnv"}.intersection(sv_callers))

    out_target_file = None
    out_anti_file = None
    if cnvkit.use_general_sv_bins(data) and has_cnvkit_or_gatkcnv:
        bins_dir = os.path.join(dd.get_work_dir(data), "structural",
                                dd.get_sample_name(data), "bins")
        work_dir = utils.safe_makedir(bins_dir)
        coverage_fn = coverage_fns[cnvkit.bin_approach(data)]
        out_target_file, out_anti_file = coverage_fn(data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file = None
            out_anti_file = None

    seq2c_target = None
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)

    purecn_target = None
    if "purecn" in dd.get_svcaller(data):
        # set purecn_pon_build flag
        batches = dd.get_batch(data)
        if batches and "pon_build" in batches:
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # still calculate coverage even when not building pon - for t-only analysis
        purecn_target = purecn.get_coverage(data)

    # Make sure the nested depth -> bins path exists before assigning to it.
    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda _: {})
    data["depth"]["bins"] = {"target": out_target_file,
                             "antitarget": out_anti_file,
                             "seq2c": seq2c_target,
                             "purecn": purecn_target}
    return [[data]]
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.

    The run is skipped when the ``rds`` output is up to date relative to the
    normalized coverage file, or when a ``<out_base>-failed.log`` marker
    exists. Otherwise the normalized coverage is segmented, the tumor
    variants are filtered and PureCN.R is run through the shell inside a
    file transaction. Produced output files are moved from the transaction
    directory next to ``out_base``.

    Failures accepted by ``_allowed_errors`` are logged and recorded in the
    failed-log marker instead of being raised.

    Args:
        paired: Tumor/normal pairing object; ``paired.tumor_data`` is used.
        work_dir: Directory for intermediate and final outputs.

    Returns:
        The output dictionary from ``_get_purecn_files(...,
        require_exist=True)`` when its ``rds`` file exists, otherwise None.

    Raises:
        subprocess.CalledProcessError: When PureCN fails with an error that
            ``_allowed_errors`` does not accept.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit,
              "gatk-cnv": _segment_normalized_gatk}
    tumor_data = paired.tumor_data
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(tumor_data)](cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base,
                   "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust",
                   "--maxnonclonal", "0.3"]
            if dd.get_num_cores(tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(tumor_data))]
            try:
                # Run through the shell so the R environment exports apply.
                cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(),
                    " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(tumor_data))
                    # Leave a marker so this sample is not retried.
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    # Logger.exception needs a message argument; without one
                    # it raises TypeError and masks the PureCN failure.
                    logger.exception("PureCN copy number calling failed for %s",
                                     dd.get_sample_name(tumor_data))
                    raise
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            # Only move outputs that were actually produced.
            for f in all_files:
                if os.path.exists(os.path.join(tx_dir, f)):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(final_dir, f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    When the sample uses general SV bins, the coverage function matching the
    bin approach is run inside ``<work_dir>/structural/<sample>/bins``. The
    resulting files are recorded only if the target file exists.

    Returns:
        ``[[data]]``; ``data["depth"]["bins"]`` is set to the ``target`` and
        ``antitarget`` files when coverage was produced, and left untouched
        otherwise.
    """
    coverage_fns = {"cnvkit": _calculate_sv_coverage_cnvkit,
                    "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    if cnvkit.use_general_sv_bins(data):
        bins_dir = os.path.join(dd.get_work_dir(data), "structural",
                                dd.get_sample_name(data), "bins")
        work_dir = utils.safe_makedir(bins_dir)
        coverage_fn = coverage_fns[cnvkit.bin_approach(data)]
        out_target_file, out_anti_file = coverage_fn(data, work_dir)
        # Record the outputs only when the target file was really written.
        if os.path.exists(out_target_file):
            data["depth"]["bins"] = {"target": out_target_file,
                                     "antitarget": out_anti_file}
    return [[data]]