def run_single(self, reads_ref, params):
    """
    Perform a single run of HISAT2 against a single reads reference.

    The rest of the info is taken from the params dict - see the spec for details.

    :param reads_ref: dict describing the reads to align. Keys read here:
        "ref" (reads object reference), "name" (prefix of the alignment
        name) and, optionally, "condition".
    :param params: dict of run parameters. Keys read here: "genome_ref",
        "sampleset_ref", "alignment_suffix" and, optionally, "condition";
        "alignmentset_suffix" and "ws_name" are read only when an alignment
        set is uploaded. The whole dict is also handed to run_hisat2 and
        upload_alignment.
    :returns: tuple (alignments, output_ref, alignment_set_ref). alignments
        maps reads_ref["ref"] to {"ref": output_ref, "name": alignment_name}.
        alignment_set_ref stays None unless is_set() reports true for
        params["sampleset_ref"].
    """
    # 1. Get hisat2 index from genome.
    #    a. If it exists in cache, use that.
    #    b. Otherwise, build it
    idx_prefix = self.build_index(params["genome_ref"])

    # 2. Fetch the reads file and make sure input params are correct.
    reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)

    # Everything after the fetch runs under try/finally so the fetched reads
    # files are removed even when the alignment or an upload fails.
    try:
        # if the reads ref came from a different sample set, then we need to drop that
        # reference inside the reads info object so it can be linked in the alignment
        if reads_ref["ref"] != params["sampleset_ref"]:
            reads["sampleset_ref"] = params["sampleset_ref"]

        # make sure condition info carries over if we have it; the reads_ref
        # value takes precedence over the one in params
        if "condition" in reads_ref:
            reads["condition"] = reads_ref["condition"]
        elif "condition" in params:
            reads["condition"] = params["condition"]
        reads["name"] = reads_ref["name"]

        # 3. Finally all set, do the alignment and upload the output.
        alignment_file = self.run_hisat2(
            idx_prefix, reads, params, output_file="accepted_hits")
        alignment_name = reads["name"] + params["alignment_suffix"]
        output_ref = self.upload_alignment(
            params, reads, alignment_name, alignment_file)

        alignment_set_ref = None
        sampleset_ref = params["sampleset_ref"]
        if is_set(sampleset_ref, self.workspace_url):
            set_name = get_object_names(
                [sampleset_ref], self.workspace_url)[sampleset_ref]
            # Fall back to "unspecified" (the default run_batch uses) instead
            # of raising KeyError when no condition was supplied anywhere.
            if "condition" in reads:
                label = reads["condition"]
            else:
                label = "unspecified"
            # arguments: alignment_items, alignmentset_name, ws_name
            alignment_set_ref = self.upload_alignment_set(
                [{"ref": output_ref, "label": label}],
                set_name + params["alignmentset_suffix"],
                params["ws_name"])
    finally:
        os.remove(reads["file_fwd"])
        if "file_rev" in reads:
            os.remove(reads["file_rev"])

    alignments = {
        reads_ref["ref"]: {
            "ref": output_ref,
            "name": alignment_name
        }
    }
    return (alignments, output_ref, alignment_set_ref)
def run_batch(self, reads_refs, params):
    """
    Run HISAT2 in batch mode.

    One "run_hisat2" task per reads reference is submitted through
    KBParallel, then the resulting alignments are collected into a single
    alignment set.

    :param reads_refs: list of dicts, where each looks like the following:
        {
            "ref": reads object reference,
            "condition": condition for that ref (string, optional)
        }
    :param params: dict of run parameters. Keys read here: "sampleset_ref",
        "alignmentset_suffix", "ws_name" and, optionally, "condition". A
        shallow copy of the whole dict becomes the parameters of each task.
    :returns: tuple (alignments, output_ref). alignments maps each reads
        reference to the entry reported for it under "alignment_objs" in the
        task result; output_ref is the value returned by
        upload_alignment_set for the combined set.
    :raises RuntimeError: if any parallel task reports an error.
    """
    # build task list and send it to KBParallel
    tasks = []
    set_name = get_object_names(
        [params["sampleset_ref"]], self.workspace_url)[params["sampleset_ref"]]

    # Condition used when a reads ref does not carry its own. Resolving it
    # once keeps the condition sent to each task identical to the label used
    # for that alignment in the final set.
    default_condition = params.get("condition", "unspecified")

    for reads_ref in reads_refs:
        single_param = dict(params)  # need a copy of the params
        single_param["build_report"] = 0
        # each task is pointed at one reads object instead of the whole set
        single_param["sampleset_ref"] = reads_ref["ref"]
        single_param["condition"] = reads_ref.get("condition", default_condition)
        tasks.append({
            "module_name": "kb_hisat2",
            "function_name": "run_hisat2",
            "version": self.my_version,
            "parameters": single_param
        })

    batch_run_params = {
        "tasks": tasks,
        "runner": "parallel",
        # Uncomment the two settings below for local testing.
        # "concurrent_local_tasks": 3,
        # "concurrent_njsw_tasks": 0,
        "max_retries": 2
    }
    parallel_runner = KBParallel(self.callback_url)
    results = parallel_runner.run_batch(batch_run_params)["results"]

    alignment_items = []
    alignments = {}
    for idx, result in enumerate(results):
        # idx of the result is the same as the idx of the inputs AND reads_refs
        if result["is_error"] != 0:
            raise RuntimeError(
                "Failed a parallel run of HISAT2! {}".format(
                    result["result_package"]["error"]))
        task_params = tasks[idx]["parameters"]
        reads_ref = task_params["sampleset_ref"]
        alignment = result["result_package"]["result"][0]["alignment_objs"][reads_ref]
        alignment_items.append({
            "ref": alignment["ref"],
            "label": task_params["condition"]
        })
        alignments[reads_ref] = alignment

    # build the final alignment set
    output_ref = self.upload_alignment_set(
        alignment_items,
        set_name + params["alignmentset_suffix"],
        params["ws_name"])
    return (alignments, output_ref)