Exemplo n.º 1
0
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.

    The recalibration method is read with ``dd.get_recalibrate(data)``:
    ``True`` or ``"gatk"`` uses ``_gatk_apply_bqsr``, ``"sentieon"`` uses
    ``sentieon.apply_bqsr``, any other truthy value raises, and a falsy
    value leaves the BAM as is.

    Recalibration is only applied when ``data["prep_recal"]`` holds a
    recalibration table. Table preparation can return without setting that
    key (the GATK path skips it when no known-variants VCF is configured),
    in which case the sample passes through unrecalibrated instead of
    running the apply step without a table.

    Args:
        data: sample dictionary; ``work_bam`` and possibly ``align_bam``
            are updated in place.

    Returns:
        The same ``data`` dictionary.

    Raises:
        NotImplementedError: for an unrecognised, truthy recalibration type.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    recal_method = dd.get_recalibrate(data)
    if recal_method in [True, "gatk"]:
        # No table means table preparation was skipped; nothing to apply.
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with GATK: %s " %
                        str(dd.get_sample_name(data)))
            data["work_bam"] = _gatk_apply_bqsr(data)
    elif recal_method == "sentieon":
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with sentieon: %s " %
                        str(dd.get_sample_name(data)))
            data["work_bam"] = sentieon.apply_bqsr(data)
    elif recal_method:
        raise NotImplementedError("Unsupported recalibration type: %s" %
                                  (recal_method))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    # Only hand the original BAM to save_diskspace once nothing in `data`
    # refers to it any more.
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(
            data):
        utils.save_diskspace(orig_bam,
                             "BAM recalibrated to %s" % dd.get_work_bam(data),
                             data["config"])
    return data
Exemplo n.º 2
0
def prep_recal(data):
    """Calculate base quality recalibration tables ahead of applying them.

    Depending on ``dd.get_recalibrate(data)``:

    - ``True`` / ``"gatk"``: run ``_gatk_base_recalibrator`` on the aligned
      BAM, provided a dbsnp file is present under
      ``genome_resources -> variation -> dbsnp``; otherwise log and skip.
    - ``"sentieon"``: build the table with ``sentieon.bqsr_table``.
    - any other truthy value: raise ``NotImplementedError``.
    - falsy: do nothing.

    The resulting table is stored in ``data["prep_recal"]`` and ``data`` is
    returned.
    """
    recal_type = dd.get_recalibrate(data)
    if recal_type in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " %
                    str(dd.get_sample_name(data)))
        known_sites = tz.get_in(("genome_resources", "variation", "dbsnp"),
                                data)
        if known_sites:
            runner = broad.runner_from_config(data["config"])
            align_bam = dd.get_align_bam(data)
            ref_file = dd.get_ref_file(data)
            platform = dd.get_platform(data)
            regions = dd.get_variant_regions(data)
            data["prep_recal"] = _gatk_base_recalibrator(
                runner, align_bam, ref_file, platform, known_sites, regions,
                data)
        else:
            # Without known variant sites there is nothing to recalibrate
            # against.
            logger.info(
                "Skipping GATK BaseRecalibrator because no VCF file of known variants was found."
            )
    elif recal_type == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " %
                    str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif recal_type:
        raise NotImplementedError("Unsupported recalibration type: %s" %
                                  (recal_type))
    return data
Exemplo n.º 3
0
def prep_recal(data):
    """Prepare the working BAM and compute its GATK base recalibration table.

    Only acts when ``dd.get_recalibrate(data)`` is ``True`` or ``"gatk"`` and
    a dbsnp file is configured. In that case the reference is indexed with
    Picard, duplicates are optionally marked (``mark_duplicates`` in the
    algorithm config, on by default), the BAM is indexed and set as
    ``data["work_bam"]``, and the output of ``_gatk_base_recalibrator`` is
    stored in ``data["prep_recal"]``.

    Always returns ``[[data]]``.
    """
    if dd.get_recalibrate(data) not in [True, "gatk"]:
        return [[data]]

    logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
    reference = data["sam_ref"]
    cfg = data["config"]
    known_sites = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
    if not known_sites:
        logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
        return [[data]]

    algorithm = cfg["algorithm"]
    seq_platform = algorithm.get("platform", "illumina")

    # Picard steps: index the reference and, unless disabled, mark duplicates.
    picard = broad.runner_from_path("picard", cfg)
    picard.run_fn("picard_index_ref", reference)
    if algorithm.get("mark_duplicates", True):
        prepped_bam, _ = picard.run_fn("picard_mark_duplicates", data["work_bam"])
    else:
        prepped_bam = data["work_bam"]
    bam.index(prepped_bam, cfg)

    regions = algorithm.get("variant_regions", None)
    data["work_bam"] = prepped_bam

    # GATK step: calculate the recalibration table for the prepared BAM.
    gatk = broad.runner_from_config(cfg)
    data["prep_recal"] = _gatk_base_recalibrator(
        gatk, prepped_bam, reference, seq_platform, known_sites, regions, data)
    return [[data]]
Exemplo n.º 4
0
def prep_recal(data):
    """Run duplicate marking and GATK base recalibrator table calculation.

    When recalibration is set to ``True`` or ``"gatk"`` and a dbsnp VCF is
    available, this indexes the reference with Picard, optionally marks
    duplicates in ``data["work_bam"]``, indexes the resulting BAM, stores it
    back as ``data["work_bam"]`` and saves the ``_gatk_base_recalibrator``
    result under ``data["prep_recal"]``. Otherwise ``data`` is left
    untouched.

    Returns ``[[data]]`` in every case.
    """
    wrapped = [[data]]
    if dd.get_recalibrate(data) in [True, "gatk"]:
        sample = str(dd.get_sample_name(data))
        logger.info("Recalibrating %s with GATK" % sample)
        ref_path = data["sam_ref"]
        config = data["config"]
        dbsnp_vcf = tz.get_in(("genome_resources", "variation", "dbsnp"),
                              data)
        if not dbsnp_vcf:
            logger.info(
                "Skipping GATK BaseRecalibrator because no VCF file of known variants was found."
            )
            return wrapped
        algo_config = config["algorithm"]
        platform_name = algo_config.get("platform", "illumina")
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_path)
        # Duplicate marking is on unless explicitly disabled in the config.
        if not algo_config.get("mark_duplicates", True):
            bam_for_recal = data["work_bam"]
        else:
            (bam_for_recal, _) = picard_runner.run_fn(
                "picard_mark_duplicates", data["work_bam"])
        bam.index(bam_for_recal, config)
        target_regions = algo_config.get("variant_regions", None)
        data["work_bam"] = bam_for_recal
        gatk_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(
            gatk_runner, bam_for_recal, ref_path, platform_name, dbsnp_vcf,
            target_regions, data)
    return wrapped
Exemplo n.º 5
0
def _get_prep_params(data):
    """Collect the recalibration and realignment settings for BAM preparation.

    A setting of exactly ``True`` is mapped to ``"gatk"``; every other value
    is passed through unchanged.

    Returns:
        dict with the keys ``"recal"`` and ``"realign"``.
    """
    def _default_to_gatk(setting):
        # A bare True means "use the default tool", which is GATK.
        return "gatk" if setting is True else setting

    return {"recal": _default_to_gatk(dd.get_recalibrate(data)),
            "realign": _default_to_gatk(dd.get_realign(data))}
Exemplo n.º 6
0
def _get_prep_params(data):
    """Return the BAM preparation settings, replacing ``True`` with ``"gatk"``.

    Reads the recalibration and realignment settings from ``data`` and
    returns them as ``{"recal": ..., "realign": ...}``. Only the literal
    ``True`` is rewritten; other values are returned as configured.
    """
    recal = dd.get_recalibrate(data)
    if recal is True:
        recal = "gatk"
    realign = dd.get_realign(data)
    if realign is True:
        realign = "gatk"
    params = {}
    params["recal"] = recal
    params["realign"] = realign
    return params
Exemplo n.º 7
0
def prep_recal(data):
    """Build the BQSR recalibration table for a sample before it is applied.

    The recalibration setting from ``dd.get_recalibrate(data)`` picks the
    tool: ``True``/``"gatk"`` calls ``_gatk_base_recalibrator`` (skipped with
    a log message when no dbsnp file is configured), ``"sentieon"`` calls
    ``sentieon.bqsr_table``, and any other truthy value raises
    ``NotImplementedError``. The table is saved in ``data["prep_recal"]``.

    Returns the ``data`` dictionary.
    """
    method = dd.get_recalibrate(data)
    use_gatk = method in [True, "gatk"]
    if use_gatk:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_path = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_path:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        gatk_runner = broad.runner_from_config(data["config"])
        recal_args = (gatk_runner, dd.get_align_bam(data), dd.get_ref_file(data),
                      dd.get_platform(data), dbsnp_path, dd.get_variant_regions(data), data)
        data["prep_recal"] = _gatk_base_recalibrator(*recal_args)
    elif method == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif method:
        raise NotImplementedError("Unsupported recalibration type: %s" % (method))
    return data
Exemplo n.º 8
0
def apply_recal(data):
    """Recalibrate the aligned BAM using a previously prepared BQSR table.

    Recalibration runs only when ``data["prep_recal"]`` is set. The tool is
    chosen from ``dd.get_recalibrate(data)``: ``True``/``"gatk"`` uses
    ``_gatk_apply_bqsr``, ``"sentieon"`` uses ``sentieon.apply_bqsr``, and
    any other truthy value raises ``NotImplementedError``. The new BAM
    replaces ``data["work_bam"]``.

    If ``data`` had no ``work_bam`` key on entry, the resulting work BAM is
    also stored as ``align_bam``. When the original BAM is no longer
    referenced as either the work or the alignment BAM it is passed to
    ``utils.save_diskspace``.

    Returns the ``data`` dictionary.
    """
    source_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    started_with_work_bam = "work_bam" in data
    recal_type = dd.get_recalibrate(data)
    if recal_type in [True, "gatk"]:
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = _gatk_apply_bqsr(data)
    elif recal_type == "sentieon":
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = sentieon.apply_bqsr(data)
    elif recal_type:
        raise NotImplementedError("Unsupported recalibration type: %s" % (recal_type))
    # CWL does not have work/alignment BAM separation
    if not started_with_work_bam:
        if dd.get_work_bam(data):
            data["align_bam"] = dd.get_work_bam(data)
    still_referenced = (source_bam == dd.get_work_bam(data)
                        or source_bam == dd.get_align_bam(data))
    if not still_referenced:
        message = "BAM recalibrated to %s" % dd.get_work_bam(data)
        utils.save_diskspace(source_bam, message, data["config"])
    return data
Exemplo n.º 9
0
def parallel_prep_region(samples, run_parallel):
    """Run region-based BAM preparation ahead of variant calling.

    The first item of each entry in ``samples`` is taken as the sample
    dictionary. If it has a ``work_bam``, that is copied to ``align_bam``.
    Samples are then divided into two groups:

    - passed through untouched: no recalibration, realignment or variant
      caller is configured, or there is no ``work_bam`` to process;
    - everything else: sent through ``parallel_split_combine`` with the
      ``"piped_bamprep"`` step.

    Returns the pass-through samples followed by the processed ones.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    sample_dicts = [entry[0] for entry in samples]
    passthrough, to_process = [], []
    for data in sample_dicts:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        # Nothing to prepare: either no prep step is requested or there is no
        # BAM to work on.
        needs_no_prep = not (dd.get_recalibrate(data) or dd.get_realign(data)
                             or dd.get_variantcaller(data))
        if needs_no_prep or not data.get(file_key):
            passthrough.append([data])
        else:
            to_process.append([data])
    processed = parallel_split_combine(to_process, split_fn, run_parallel,
                                       "piped_bamprep", _add_combine_info, file_key, ["config"])
    return passthrough + processed