def prep_recal(data):
    """Calculate base quality recalibration (BQSR) tables ahead of recalibration.

    The recalibration setting read from ``data`` picks the backend:

    - ``True`` or ``"gatk"``: run the GATK base recalibrator, unless no dbsnp
      file is configured under ``genome_resources -> variation -> dbsnp``, in
      which case the step is skipped and ``data`` is returned unchanged.
    - ``"sentieon"``: build the table with sentieon.
    - any false value: nothing to do.

    The resulting table is stored in ``data["prep_recal"]`` and ``data`` is
    returned. Any other truthy setting raises ``NotImplementedError``.
    """
    method = dd.get_recalibrate(data)

    if method in (True, "gatk"):
        sample = str(dd.get_sample_name(data))
        logger.info("Prepare BQSR tables with GATK: %s " % sample)
        known_sites = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if known_sites:
            runner = broad.runner_from_config(data["config"])
            data["prep_recal"] = _gatk_base_recalibrator(
                runner,
                dd.get_align_bam(data),
                dd.get_ref_file(data),
                dd.get_platform(data),
                known_sites,
                dd.get_variant_regions(data),
                data,
            )
        else:
            # Without known variants there is nothing to recalibrate against.
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
        return data

    if method == "sentieon":
        sample = str(dd.get_sample_name(data))
        logger.info("Prepare BQSR tables with sentieon: %s " % sample)
        data["prep_recal"] = sentieon.bqsr_table(data)
        return data

    if method:
        raise NotImplementedError("Unsupported recalibration type: %s" % (method))
    return data
def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output since
    Battenberg does smart restarts.

    ``paired`` supplies the tumor sample data (``tumor_data``) plus the tumor
    and normal BAM files (``tumor_bam``, ``normal_bam``). Battenberg is only
    run when some of the expected outputs are missing.

    Returns the dictionary of Battenberg outputs, with the ignore file added
    under the ``"ignore"`` key. Raises ``AssertionError`` if outputs are still
    missing after the run.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        # Battenberg resources are looked up in a sibling directory of the
        # directory holding the reference file.
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        # NOTE(review): argument order must match the signature of
        # _make_ignore_file, which is not visible here -- confirm.
        ignore_file = _make_ignore_file(work_dir, ref_file,
                                        os.path.join(bat_datadir, "impute", "impute_info.txt"),
                                        ignore_file)
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
        local_sitelib = os.path.join(tooldir, "lib", "R", "site-library")
        perllib = os.path.join(tooldir, "lib", "perl5")
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        # Each export must be terminated with `&&`; otherwise the shell parses
        # `battenberg.pl ...` as additional arguments to `export` and the
        # caller itself is never executed.
        cmd = ("export R_LIBS_USER={local_sitelib} && "
               "export PERL5LIB={perllib}:$PERL5LIB && "
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} "
               "-assembly {genome_build} -species Human -platform {platform}")
        cmd = cmd.format(local_sitelib=local_sitelib,
                         perllib=perllib,
                         cores=cores,
                         work_dir=work_dir,
                         ref_file=ref_file,
                         tumor_bam=paired.tumor_bam,
                         normal_bam=paired.normal_bam,
                         bat_datadir=bat_datadir,
                         ignore_file=ignore_file,
                         genome_build=paired.tumor_data["genome_build"],
                         platform=dd.get_platform(paired.tumor_data))
        do.run(cmd, "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
def _do_run(paired):
    """Run Battenberg copy number calling on a tumor/normal pair.

    Output is written straight into the structural variant work directory
    rather than a temporary one, because Battenberg does smart restarts.
    The caller is only invoked when some expected outputs are missing.

    Returns the dictionary of Battenberg outputs, extended with the plot
    outputs (``"plot"``) and the ignore file (``"ignore"``).
    """
    tumor_data = paired.tumor_data
    work_dir = _sv_workdir(tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")

    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(tumor_data)
        ref_parent = os.path.join(os.path.dirname(ref_file), os.pardir)
        bat_datadir = os.path.normpath(os.path.join(ref_parent, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, impute_info)

        tooldir = install.get_defaults().get("tooldir", "/usr/local")
        local_sitelib = os.path.join(tooldir, "lib", "R", "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(tumor_data)
        genome_build = tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(tumor_data) * 0.5))

        gender_codes = {"male": "XY", "female": "XX", "unknown": "L"}
        gender = gender_codes.get(population.get_gender(tumor_data))
        if gender == "L":
            # Unknown gender additionally passes the second file returned by
            # _make_ignore_file via -gl.
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)

        rscript_dir = os.path.dirname(utils.Rscript_cmd())
        r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % rscript_dir

        template = "".join([
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd}",
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai ",
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt ",
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt ",
            "-ig {ignore_file} {gender_str} ",
            "-assembly {genome_build} -species Human -platform {platform}",
        ])
        command = template.format(
            local_sitelib=local_sitelib,
            r_export_cmd=r_export_cmd,
            cores=cores,
            work_dir=work_dir,
            ref_file=ref_file,
            tumor_bam=tumor_bam,
            normal_bam=normal_bam,
            bat_datadir=bat_datadir,
            ignore_file=ignore_file,
            gender_str=gender_str,
            genome_build=genome_build,
            platform=platform,
        )
        do.run(command, "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)

    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def prep_recal(data):
    """Prepare recalibration tables for the pre-BQSR step.

    Depending on the recalibration setting of ``data``, the table is built
    with GATK (setting ``True`` or ``"gatk"``) or with sentieon (setting
    ``"sentieon"``) and saved as ``data["prep_recal"]``. The GATK route is
    skipped when no dbsnp file is available in the genome resources. A false
    setting leaves ``data`` as it is; an unrecognized truthy setting raises
    ``NotImplementedError``.

    Returns ``data``.
    """
    recal_type = dd.get_recalibrate(data)
    if recal_type in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_path = ("genome_resources", "variation", "dbsnp")
        dbsnp_file = tz.get_in(dbsnp_path, data)
        if dbsnp_file:
            # Bind the call arguments to names first, in the order they are
            # passed on to the recalibrator.
            gatk_runner = broad.runner_from_config(data["config"])
            align_bam = dd.get_align_bam(data)
            ref_file = dd.get_ref_file(data)
            platform = dd.get_platform(data)
            regions = dd.get_variant_regions(data)
            data["prep_recal"] = _gatk_base_recalibrator(
                gatk_runner, align_bam, ref_file, platform, dbsnp_file, regions, data)
        else:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
    elif recal_type == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif recal_type:
        raise NotImplementedError("Unsupported recalibration type: %s" % (recal_type))
    return data
def _do_run(paired):
    """Call copy number variation with Battenberg for a paired sample.

    Results go directly into the work directory instead of a temporary
    directory: Battenberg does smart restarts, so existing partial output is
    kept. Battenberg is launched only if expected output files are missing.

    Returns the Battenberg output dictionary with ``"plot"`` and ``"ignore"``
    entries added.
    """
    sample = paired.tumor_data
    work_dir = _sv_workdir(sample)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        fmt_args = {"work_dir": work_dir}
        fmt_args["ref_file"] = dd.get_ref_file(sample)
        fmt_args["bat_datadir"] = os.path.normpath(
            os.path.join(os.path.dirname(fmt_args["ref_file"]), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(
            work_dir, fmt_args["ref_file"], ignore_file,
            os.path.join(fmt_args["bat_datadir"], "impute", "impute_info.txt"))
        fmt_args["ignore_file"] = ignore_file
        fmt_args["local_sitelib"] = os.path.join(
            install.get_defaults().get("tooldir", "/usr/local"), "lib", "R", "site-library")
        fmt_args["tumor_bam"] = paired.tumor_bam
        fmt_args["normal_bam"] = paired.normal_bam
        fmt_args["platform"] = dd.get_platform(sample)
        fmt_args["genome_build"] = sample["genome_build"]
        # scale cores to avoid over-using memory during imputation
        fmt_args["cores"] = max(1, int(dd.get_num_cores(sample) * 0.5))
        gender = {"male": "XY", "female": "XX", "unknown": "L"}.get(population.get_gender(sample))
        # For the "L" code the extra file from _make_ignore_file is passed via -gl.
        fmt_args["gender_str"] = ("-ge %s -gl %s" % (gender, gl_file)
                                  if gender == "L" else "-ge %s" % (gender))
        fmt_args["r_export_cmd"] = ("unset R_HOME && export PATH=%s:$PATH && "
                                    % os.path.dirname(utils.Rscript_cmd()))
        cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd}"
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} {gender_str} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**fmt_args), "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out