def create_inputs(data):
    """Index input reads and build per-chunk work groups for concurrent alignment.

    Lets alignment parallelize beyond the processors of one machine. Depending
    on the configured prep method, either builds an rtg SDF file (with built-in
    indexes for retrieving file sections) or falls back to the bgzip/grabix
    indexing approach for backward compatibility.

    Returns a list of single-item sample groups; when an ``align_split_size``
    is configured, one group per read split. With ``output_cwl_keys`` present,
    groups are converted into CWL-style records.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM inputs must become bgzipped fastq (unless we are not aligning),
    # and remote inputs need prepping/downloading first. Anything else that
    # lacks input files or an aligner skips indexing entirely.
    needs_conversion = ("files" in data and data["files"] and aligner and
                        (_is_cram_input(data["files"]) or
                         objectstore.is_remote(data["files"][0])))
    if not needs_conversion:
        no_work = ("files" not in data or not data["files"] or
                   data["files"][0] is None or not aligner)
        if no_work:
            return [[data]]
    # Honor pre-existing grabix indexes before consulting the configured method.
    if _has_grabix_indices(data):
        prep_method = "grabix"
    else:
        prep_method = dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if prep_method == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # Preparation converts illumina quality encoding into sanger format.
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    grouped = []
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if split_size:
        if prep_method == "rtg":
            split_points = rtg.calculate_splits(
                data["files"][0], data["config"]["algorithm"]["align_split_size"])
        else:
            split_points = _find_read_splits(
                data["files"][0], data["config"]["algorithm"]["align_split_size"])
        # One deep-copied sample per split so chunks can be processed independently.
        for split_region in split_points:
            split_data = copy.deepcopy(data)
            split_data["align_split"] = split_region
            grouped.append([split_data])
    else:
        grouped.append([data])
    if "output_cwl_keys" in data:
        grouped = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in grouped],
            ["files", "align_split", "config__algorithm__quality_format"])
    return grouped
def create_inputs(data):
    """Index input reads and build per-chunk work groups for concurrent alignment.

    Lets alignment parallelize beyond the processors of one machine. Depending
    on the configured prep method, either builds an rtg SDF file (with built-in
    indexes for retrieving file sections) or falls back to the bgzip/grabix
    indexing approach for backward compatibility.

    NOTE(review): this file defines ``create_inputs`` twice; this later
    definition shadows the earlier one (which additionally normalizes CWL
    inputs and emits CWL records) — looks like a merge artifact, confirm
    which version should survive.
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM inputs must become bgzipped fastq (unless we are not aligning),
    # and remote inputs need prepping/downloading first. Anything else that
    # lacks input files or an aligner skips indexing entirely.
    needs_conversion = ("files" in data and data["files"] and aligner and
                        (_is_cram_input(data["files"]) or
                         objectstore.is_remote(data["files"][0])))
    if not needs_conversion:
        no_work = ("files" not in data or not data["files"] or
                   data["files"][0] is None or not aligner)
        if no_work:
            return [[data]]
    # Honor pre-existing grabix indexes before consulting the configured method.
    if _has_grabix_indices(data):
        prep_method = "grabix"
    else:
        prep_method = dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if prep_method == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # Preparation converts illumina quality encoding into sanger format.
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if not split_size:
        return [[data]]
    if prep_method == "rtg":
        split_points = rtg.calculate_splits(
            data["files"][0], data["config"]["algorithm"]["align_split_size"])
    else:
        split_points = _find_read_splits(
            data["files"][0], data["config"]["algorithm"]["align_split_size"])
    # One deep-copied sample per split so chunks can be processed independently.
    grouped = []
    for split_region in split_points:
        split_data = copy.deepcopy(data)
        split_data["align_split"] = split_region
        grouped.append([split_data])
    return grouped