def apply_recal(data): """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM. """ orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data) had_work_bam = "work_bam" in data if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data))) data["work_bam"] = _gatk_apply_bqsr(data) elif dd.get_recalibrate(data) == "sentieon": logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data))) data["work_bam"] = sentieon.apply_bqsr(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) # CWL does not have work/alignment BAM separation if not had_work_bam and dd.get_work_bam(data): data["align_bam"] = dd.get_work_bam(data) if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam( data): utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"]) return data
def prep_recal(data): """Do pre-BQSR recalibration, calculation of recalibration tables. """ if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data))) dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data) if not dbsnp_file: logger.info( "Skipping GATK BaseRecalibrator because no VCF file of known variants was found." ) return data broad_runner = broad.runner_from_config(data["config"]) data["prep_recal"] = _gatk_base_recalibrator( broad_runner, dd.get_align_bam(data), dd.get_ref_file(data), dd.get_platform(data), dbsnp_file, dd.get_variant_regions(data), data) elif dd.get_recalibrate(data) == "sentieon": logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data))) data["prep_recal"] = sentieon.bqsr_table(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) return data
def prep_recal(data): """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM. """ if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data))) ref_file = data["sam_ref"] config = data["config"] dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data) if not dbsnp_file: logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.") return [[data]] platform = config["algorithm"].get("platform", "illumina") broad_runner = broad.runner_from_path("picard", config) broad_runner.run_fn("picard_index_ref", ref_file) if config["algorithm"].get("mark_duplicates", True): (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"]) else: dup_align_bam = data["work_bam"] bam.index(dup_align_bam, config) intervals = config["algorithm"].get("variant_regions", None) data["work_bam"] = dup_align_bam broad_runner = broad.runner_from_config(config) data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data) return [[data]]
def prep_recal(data): """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM. """ if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data))) ref_file = data["sam_ref"] config = data["config"] dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data) if not dbsnp_file: logger.info( "Skipping GATK BaseRecalibrator because no VCF file of known variants was found." ) return [[data]] platform = config["algorithm"].get("platform", "illumina") broad_runner = broad.runner_from_path("picard", config) broad_runner.run_fn("picard_index_ref", ref_file) if config["algorithm"].get("mark_duplicates", True): (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"]) else: dup_align_bam = data["work_bam"] bam.index(dup_align_bam, config) intervals = config["algorithm"].get("variant_regions", None) data["work_bam"] = dup_align_bam broad_runner = broad.runner_from_config(config) data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data) return [[data]]
def _get_prep_params(data): """Retrieve configuration parameters with defaults for preparing BAM files. """ recal_param = dd.get_recalibrate(data) recal_param = "gatk" if recal_param is True else recal_param realign_param = dd.get_realign(data) realign_param = "gatk" if realign_param is True else realign_param return {"recal": recal_param, "realign": realign_param}
def prep_recal(data): """Do pre-BQSR recalibration, calculation of recalibration tables. """ if dd.get_recalibrate(data) in [True, "gatk"]: logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data))) dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data) if not dbsnp_file: logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.") return data broad_runner = broad.runner_from_config(data["config"]) data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data), dd.get_ref_file(data), dd.get_platform(data), dbsnp_file, dd.get_variant_regions(data), data) elif dd.get_recalibrate(data) == "sentieon": logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data))) data["prep_recal"] = sentieon.bqsr_table(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) return data
def apply_recal(data): """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM. """ orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data) had_work_bam = "work_bam" in data if dd.get_recalibrate(data) in [True, "gatk"]: if data.get("prep_recal"): logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data))) data["work_bam"] = _gatk_apply_bqsr(data) elif dd.get_recalibrate(data) == "sentieon": if data.get("prep_recal"): logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data))) data["work_bam"] = sentieon.apply_bqsr(data) elif dd.get_recalibrate(data): raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data))) # CWL does not have work/alignment BAM separation if not had_work_bam and dd.get_work_bam(data): data["align_bam"] = dd.get_work_bam(data) if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(data): utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"]) return data
def parallel_prep_region(samples, run_parallel): """Perform full pre-variant calling BAM prep work on regions. """ file_key = "work_bam" split_fn = _split_by_regions("bamprep", "-prep.bam", file_key) # identify samples that do not need preparation -- no recalibration or realignment extras = [] torun = [] for data in [x[0] for x in samples]: if data.get("work_bam"): data["align_bam"] = data["work_bam"] if (not dd.get_recalibrate(data) and not dd.get_realign(data) and not dd.get_variantcaller(data)): extras.append([data]) elif not data.get(file_key): extras.append([data]) else: torun.append([data]) return extras + parallel_split_combine(torun, split_fn, run_parallel, "piped_bamprep", _add_combine_info, file_key, ["config"])