Example #1
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a
    single machine. Prepares an rtg SDF format file with built-in indexes
    for retrieving sections of files.

    Retains backwards compatibility with the bgzip/grabix approach.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    files = data.get("files")
    # CRAM inputs must be converted to bgzipped fastq unless we are not
    # aligning; remote inputs additionally need prep and download.
    needs_conversion = bool(files) and aligner and (_is_cram_input(files)
                                                    or objectstore.is_remote(files[0]))
    if not needs_conversion:
        # Skip indexing for samples without input files or without alignment.
        if not files or files[0] is None or not aligner:
            return [[data]]
    if _has_grabix_indices(data):
        prep_method = "grabix"
    else:
        prep_method = dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if prep_method == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # Preparation converts illumina quality encoding into sanger format.
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    out = []
    if split_size:
        splitter = rtg.calculate_splits if prep_method == "rtg" else _find_read_splits
        for split in splitter(data["files"][0], split_size):
            split_data = copy.deepcopy(data)
            split_data["align_split"] = split
            out.append([split_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #2
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a
    single machine. Prepares an rtg SDF format file with built-in indexes
    for retrieving sections of files.

    Retains backwards compatibility with the bgzip/grabix approach.
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    files = data.get("files")
    # CRAM inputs must be converted to bgzipped fastq unless we are not
    # aligning; remote inputs additionally need prep and download.
    if not (files and aligner and (_is_cram_input(files)
                                   or objectstore.is_remote(files[0]))):
        # Skip indexing for samples without input files or without alignment.
        if not files or files[0] is None or not aligner:
            return [[data]]
    if _has_grabix_indices(data):
        approach = "grabix"
    else:
        approach = dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if approach == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # Preparation converts illumina quality encoding into sanger format.
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if not split_size:
        return [[data]]
    calc_splits = rtg.calculate_splits if approach == "rtg" else _find_read_splits
    grouped = []
    for split in calc_splits(data["files"][0], split_size):
        split_data = copy.deepcopy(data)
        split_data["align_split"] = split
        grouped.append([split_data])
    return grouped