Example #1
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    batch_samples = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d))
                 for d in items):
        batch_samples.append(dd.get_sample_name(data))
        if tz.get_in(["config", "algorithm", "validate"], data):
            with_validate[_checksum(
                tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if len(with_validate) == 0:
        ready_items[0]["batch_samples"] = batch_samples
        return ready_items[0]
    else:
        assert len(with_validate) == 1, len(with_validate)
        assert len(set(vrn_files)) == 1, set(vrn_files)
        data = list(with_validate.values())[0]
        data["batch_samples"] = batch_samples
        data["vrn_file"] = vrn_files[0]
        return data
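
A rough, self-contained sketch of the same collapse logic, for orientation: cwlutils, dd and _checksum above are bcbio-nextgen internals, so plain dicts stand in for the sample objects and hashlib.md5 stands in for _checksum; only toolz (imported as tz, as in bcbio) is a real dependency here.

import hashlib

import toolz as tz

def normalize_batch(items):
    """Collapse batched sample dicts to one lead dict, preferring validation."""
    with_validate = {}
    vrn_files = []
    ready_items = []
    for data in items:
        validate = tz.get_in(["config", "algorithm", "validate"], data)
        if validate:
            # key by checksum so identical validation targets deduplicate
            with_validate[hashlib.md5(str(validate).encode()).hexdigest()] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if not with_validate:
        return ready_items[0]
    assert len(with_validate) == 1 and len(set(vrn_files)) == 1
    data = list(with_validate.values())[0]
    data["vrn_file"] = vrn_files[0]
    return data

batch = [{"config": {"algorithm": {}}},
         {"config": {"algorithm": {"validate": "giab.vcf"}}, "vrn_file": "out.vcf"}]
print(normalize_batch(batch)["vrn_file"])  # -> out.vcf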
Example #2
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    batch_samples = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d)) for d in items):
        batch_samples.append(dd.get_sample_name(data))
        if tz.get_in(["config", "algorithm", "validate"], data):
            with_validate[_checksum(tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if len(with_validate) == 0:
        data = _pick_lead_item(ready_items)
        data["batch_samples"] = batch_samples
        return data
    else:
        assert len(with_validate) == 1, len(with_validate)
        assert len(set(vrn_files)) == 1, set(vrn_files)
        data = _pick_lead_item(with_validate.values())
        data["batch_samples"] = batch_samples
        data["vrn_file"] = vrn_files[0]
        return data
Example #3
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares a list of callable genome regions, allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": callable_bed,
            "highdepth": covinfo.highdepth,
            "sample_callable": covinfo.callable,
            "coverage_depth_bed": covinfo.depth,
            "avg_coverage": covinfo.avg_coverage
        }
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed)
                and not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"][
                "variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
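
utils.symlink_plus, bam.index and the callable helpers above are bcbio internals. A minimal sketch of just the stage-and-index step, assuming pysam is installed and using a bare os.symlink where bcbio's symlink_plus would also link index files:

import os

import pysam

def stage_and_index(bam_file, out_dir):
    """Symlink a finished BAM into the work directory and index it."""
    os.makedirs(out_dir, exist_ok=True)
    bam_ready = os.path.join(out_dir, os.path.basename(bam_file))
    if not os.path.exists(bam_ready):
        os.symlink(os.path.abspath(bam_file), bam_ready)
    pysam.index(bam_ready)  # wraps samtools index, writing bam_ready + ".bai"
    return bam_ready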
Example #4
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares a list of callable genome regions, allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Example #5
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares a list of callable genome regions, allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": covinfo.raw_callable,
            "sample_callable": covinfo.callable,
            "mapped_stats": readstats.get_cache_file(data)
        }
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Example #6
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares an rtg SDF format file with built-in indexes for retrieving
    sections of files.

    Retains backwards compatibility with the bgzip/grabix approach.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"])
             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"]
                or data["files"][0] is None or not aligner):
            return [[data]]
    approach = "grabix" if _has_grabix_indices(
        data) else dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if approach == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        if approach == "rtg":
            splits = rtg.calculate_splits(
                data["files"][0],
                data["config"]["algorithm"]["align_split_size"])
        else:
            splits = _find_read_splits(
                data["files"][0],
                data["config"]["algorithm"]["align_split_size"])
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
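
The fan-out at the end is what enables parallel alignment: each read split gets its own deep copy of the sample dictionary, so downstream steps can mutate their copies independently. A toy illustration with made-up split values:

import copy

def fan_out(data, splits):
    out = []
    for split in splits:
        cur_data = copy.deepcopy(data)
        cur_data["align_split"] = split
        out.append([cur_data])
    return out

sample = {"description": "S1", "files": ["S1.fq.gz"]}
jobs = fan_out(sample, ["1-5000000", "5000001-10000000"])
assert jobs[0][0] is not jobs[1][0]  # independent copies, safe to mutate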
Example #7
def prep_samples(*items):
    """Handle any global preparatory steps for samples with potentially shared data.

    Avoids race conditions in postprocess alignment when performing prep tasks
    on shared files between multiple similar samples.

    Cleans input BED files to avoid issues with overlapping input segments.
    """
    out = []
    for data in (utils.to_single_data(x) for x in items):
        data = cwlutils.normalize_missing(data)
        data = clean_inputs(data)
        out.append([data])
    return out
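
The (utils.to_single_data(x) for x in items) idiom reflects bcbio's convention of passing each sample around as a single-element list. A self-contained sketch of that unwrap/re-wrap pattern, with a plain dict copy standing in for the normalize and clean steps:

def to_single_data(x):
    """Unwrap [[data]] or [data] nesting down to the bare sample dict."""
    while isinstance(x, (list, tuple)):
        assert len(x) == 1, x
        x = x[0]
    return x

def prep_samples_sketch(*items):
    out = []
    for data in (to_single_data(x) for x in items):
        data = dict(data)  # stand-in for normalize_missing/clean_inputs
        out.append([data])  # re-wrap: one sample per parallel slot
    return out

print(prep_samples_sketch([{"description": "S1"}], [{"description": "S2"}]))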
Example #8
def _finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys, runtime):
    """Finalize data object with inputs from CWL.
    """
    data["dirs"] = {"work": work_dir}
    if not tz.get_in(["config", "algorithm"], data):
        if "config" not in data:
            data["config"] = {}
        data["config"]["algorithm"] = {}
    if "rgnames" not in data and "description" in data:
        data["rgnames"] = {"sample": data["description"]}
    data["cwl_keys"] = passed_keys
    data["output_cwl_keys"] = output_cwl_keys
    data = _add_resources(data, runtime)
    data = run_info.normalize_world(data)
    data = cwlutils.normalize_missing(data)
    return data
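
The guard at the top, which ensures data["config"]["algorithm"] exists before later code writes into it, is a small pattern worth isolating. A minimal equivalent using dict.setdefault (toolz supplies get_in, as in the example):

import toolz as tz

def ensure_algorithm(data):
    """Guarantee a config -> algorithm dict without clobbering existing values."""
    if not tz.get_in(["config", "algorithm"], data):
        data.setdefault("config", {})["algorithm"] = {}
    return data

print(ensure_algorithm({}))  # -> {'config': {'algorithm': {}}}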
Example #9
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a bgzip and grabix indexed file for retrieving sections
    of files.
    """
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"])
             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"]
                or data["files"][0] is None or not aligner):
            return [[data]]
        # if this is a DRAGEN BAM, we need to do further alignments with this BAM, so don't convert it
        if dd.get_umi_type(data) == "dragen":
            return [[data]]
    data["files_orig"] = data["files"]
    data["files"] = prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(
            data["files"][0],
            int(data["config"]["algorithm"]["align_split_size"]))
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #10
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a bgzip and grabix indexed file for retrieving sections
    of files.
    """
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or
                                                               objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner):
            return [[data]]
    data["files_orig"] = data["files"]
    data["files"] = _prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(data["files"][0], data["config"]["algorithm"]["align_split_size"])
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records([utils.to_single_data(x) for x in out],
                                          ["files", "align_split", "config__algorithm__quality_format"])
    return out
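
cwlutils.samples_to_records is a bcbio internal; conceptually it flattens each nested sample into a record keyed by the double-underscore paths seen in the key list. A simplified sketch of that idea, not bcbio's implementation:

import toolz as tz

def to_records(samples, keys):
    records = []
    for data in samples:
        # "config__algorithm__quality_format" -> ["config", "algorithm", ...]
        records.append({key: tz.get_in(key.split("__"), data) for key in keys})
    return records

sample = {"files": ["a.fq.gz"], "align_split": None,
          "config": {"algorithm": {"quality_format": "standard"}}}
print(to_records([sample], ["files", "align_split",
                            "config__algorithm__quality_format"]))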
Example #11
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration.\n" +
            fastq1 + "\nIs the path to the file correct, or is the file empty?\n" +
            "If it is a fastq file (not a pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
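
Stripped of the bcbio plumbing, process_alignment is a dispatch on the input type. A skeleton of the branch order above, with illustrative file names and without the objectstore and remote-file checks:

def classify_input(fastq1, aligner):
    if fastq1 and aligner:
        return "align"             # fastq (or realignable BAM) plus an aligner
    if fastq1 and fastq1.endswith(".bam"):
        return "prep-existing-bam"
    if fastq1 and fastq1.endswith(".cram"):
        return "use-cram-directly"
    if fastq1 is None and not aligner:
        return "skip-alignment"
    raise ValueError("Could not process input: %s" % fastq1)

assert classify_input("S1_1.fq.gz", "bwa") == "align"
assert classify_input("S1.bam", None) == "prep-existing-bam"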
Example #12
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2 = postalign.umi_consensus(data)
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
            data["reference"]["fasta"] = bam.ref_file_from_bam(out_bam, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]