def fetch_reads_refs_from_sampleset(ref, ws_url, srv_wiz_url):
    """
    Expand an object reference into the list of reads objects it stands for.

    If ref points at a set of reads (a ReadsSet or an RNASeqSampleSet), every
    member of the set is returned. If ref already points at a single
    SingleEndLibrary / PairedEndLibrary, the result is a one-element list.

    Each element of the returned list is a dictionary:
    {
        "ref": reads object reference (always present),
        "condition": label attached to that reads object in the set
                     (only present when ref was a set),
        "name": reads object name (needed for saving an AlignmentSet)
    }

    Raises a ValueError when the object type is neither a supported set nor a
    supported reads library.
    """
    set_type_names = ("KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet")
    library_type_names = (
        "KBaseAssembly.SingleEndLibrary",
        "KBaseFile.SingleEndLibrary",
        "KBaseAssembly.PairedEndLibrary",
        "KBaseFile.PairedEndLibrary",
    )

    obj_type = get_object_type(ref, ws_url)

    if any(type_name in obj_type for type_name in set_type_names):
        print("Looking up reads references in ReadsSet object")
        reads_set = SetAPI(srv_wiz_url).get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0,
            "include_set_item_ref_paths": 1
        })
        print("Got results from ReadsSet object")
        pprint(reads_set)

        items = reads_set["data"]["items"]
        # One workspace lookup for all member names, keyed by ref path.
        names_by_ref = get_object_names(
            [item["ref_path"] for item in items], ws_url)
        return [
            {
                "ref": item["ref_path"],
                "condition": item["label"],
                "name": names_by_ref[item["ref_path"]]
            }
            for item in items
        ]

    if any(type_name in obj_type for type_name in library_type_names):
        return [{
            "ref": ref,
            "name": get_object_names([ref], ws_url)[ref]
        }]

    raise ValueError("Unable to fetch reads reference from object {} "
                     "which is a {}".format(ref, obj_type))
def test_run_hisat2_sampleset_ok(self):
    """
    Run HISAT2 against the single-end sample set fixture and check that the
    result carries a report, an alignment set whose name ends with the
    requested suffix, and one alignment per reads object in the set.
    """
    impl = self.get_impl()
    ctx = self.get_context()
    run_params = {
        "ws_name": self.ws_name,
        "sampleset_ref": self.single_end_sampleset,
        "genome_ref": self.genome_ref,
        "alignmentset_suffix": "_sampleset_alignments",
        "alignment_suffix": "_sampleset_alignment",
        "num_threads": 2,
        "quality_score": "phred33",
        "skip": 0,
        "trim3": 0,
        "trim5": 0,
        "np": 1,
        "min_intron_length": 20,
        "max_intron_length": 500000,
        "no_spliced_alignment": 0,
        "transcriptome_mapping_only": 0,
        "build_report": 1
    }
    res = impl.run_hisat2(ctx, run_params)[0]
    self.assertIsNotNone(res)
    print("Done with HISAT2 run! {}".format(res))

    # Report output.
    self.assertIn("report_ref", res)
    self.assertTrue(check_reference(res["report_ref"]))
    self.assertIn("report_name", res)

    # Alignment set output, including its generated name.
    self.assertIn("alignmentset_ref", res)
    set_ref = res["alignmentset_ref"]
    self.assertTrue(check_reference(set_ref))
    set_name = get_object_names([set_ref], self.wsURL)[set_ref]
    self.assertTrue(set_name.endswith("_sampleset_alignments"))

    # Individual alignments, keyed by the reads reference path.
    self.assertIn("alignment_objs", res)
    alignment_objs = res["alignment_objs"]
    self.assertTrue(len(alignment_objs.keys()) == 2)
    for reads_ref in alignment_objs:
        # The key is a ref path; its last element is the reads object itself.
        ref_from_refpath = reads_ref.split(';')[-1]
        self.assertIn(ref_from_refpath, self.reads_refs)
        alignment = alignment_objs[reads_ref]
        self.assertTrue(alignment["name"].endswith("_sampleset_alignment"))
        self.assertTrue(check_reference(alignment["ref"]))
def run_single(self, reads_ref, params):
    """
    Performs a single run of HISAT2 against a single reads reference.
    The rest of the info is taken from the params dict - see the spec for
    details.

    reads_ref is a dict with at least "ref" and "name" keys, and optionally a
    "condition" key. params must provide "genome_ref", "sampleset_ref",
    "alignment_suffix" and, when sampleset_ref is a set, "alignmentset_suffix"
    and "ws_name".

    Returns a tuple (alignments, output_ref, alignment_set_ref):
      - alignments: dict mapping the reads ref to
        {"ref": alignment object ref, "name": alignment object name}
      - output_ref: reference of the uploaded alignment
      - alignment_set_ref: reference of the uploaded alignment set, or None
        when params["sampleset_ref"] is not a set

    The downloaded reads files are removed before returning, whether or not
    the alignment or upload succeeded.
    """
    # 1. Get hisat2 index from genome.
    #    a. If it exists in cache, use that.
    #    b. Otherwise, build it
    idx_prefix = self.build_index(params["genome_ref"])

    # 2. Fetch the reads file and make sure input params are correct.
    reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)

    # Everything after the download runs under try/finally so the local reads
    # files are cleaned up even if the alignment or an upload fails.
    try:
        # if the reads ref came from a different sample set, then we need to
        # drop that reference inside the reads info object so it can be linked
        # in the alignment
        if reads_ref["ref"] != params["sampleset_ref"]:
            reads["sampleset_ref"] = params["sampleset_ref"]

        # make sure condition info carries over if we have it
        if "condition" in reads_ref:
            reads["condition"] = reads_ref["condition"]
        elif "condition" in params:
            reads["condition"] = params["condition"]
        reads["name"] = reads_ref["name"]
        output_file = "accepted_hits"

        # 3. Finally all set, do the alignment and upload the output.
        alignment_file = self.run_hisat2(
            idx_prefix, reads, params, output_file=output_file)
        alignment_name = reads["name"] + params["alignment_suffix"]
        output_ref = self.upload_alignment(
            params, reads, alignment_name, alignment_file)

        alignment_set_ref = None
        if is_set(params["sampleset_ref"], self.workspace_url):
            set_name = get_object_names(
                [params["sampleset_ref"]],
                self.workspace_url)[params["sampleset_ref"]]
            alignment_set_name = set_name + params["alignmentset_suffix"]
            # Fall back to the same placeholder label run_batch uses instead
            # of failing with a KeyError when no condition was supplied.
            if "condition" in reads:
                label = reads["condition"]
            else:
                label = "unspecified"
            alignment_set_ref = self.upload_alignment_set(
                [{
                    "ref": output_ref,
                    "label": label
                }],
                alignment_set_name,
                params["ws_name"])

        alignments = {
            reads_ref["ref"]: {
                "ref": output_ref,
                "name": alignment_name
            }
        }
    finally:
        os.remove(reads["file_fwd"])
        if "file_rev" in reads:
            os.remove(reads["file_rev"])

    return (alignments, output_ref, alignment_set_ref)
def run_batch(self, reads_refs, params):
    """
    Runs HISAT2 in batch mode.

    reads_refs should be a list of dicts, where each looks like the following:
    {
        "ref": reads object reference,
        "condition": condition for that ref (string, optional)
    }

    One "run_hisat2" task is submitted to KBParallel per reads ref, using a
    copy of params with "build_report" turned off, "sampleset_ref" replaced by
    the reads ref and "condition" set for that ref. The condition is the one
    on the reads ref if present, otherwise params["condition"], otherwise
    "unspecified"; the same value is used as the item label in the resulting
    alignment set.

    Returns a tuple (alignments, output_ref):
      - alignments: dict mapping each reads ref to the alignment info returned
        by its task
      - output_ref: reference of the uploaded alignment set

    Raises a RuntimeError if any of the parallel tasks reports an error.
    """
    set_name = get_object_names(
        [params["sampleset_ref"]],
        self.workspace_url)[params["sampleset_ref"]]

    # Resolve each condition once so the sub-task parameters and the set item
    # labels can never disagree.
    default_condition = params.get("condition", "unspecified")
    conditions = [
        reads_ref.get("condition", default_condition)
        for reads_ref in reads_refs
    ]

    # build task list and send it to KBParallel
    tasks = []
    for reads_ref, condition in zip(reads_refs, conditions):
        single_param = dict(params)  # need a copy of the params
        single_param["build_report"] = 0
        single_param["sampleset_ref"] = reads_ref["ref"]
        single_param["condition"] = condition
        tasks.append({
            "module_name": "kb_hisat2",
            "function_name": "run_hisat2",
            "version": self.my_version,
            "parameters": single_param
        })

    # UNCOMMENT THE TWO "concurrent_*" KEYS BELOW FOR LOCAL TESTING
    batch_run_params = {
        "tasks": tasks,
        "runner": "parallel",
        # "concurrent_local_tasks": 3,
        # "concurrent_njsw_tasks": 0,
        "max_retries": 2
    }
    parallel_runner = KBParallel(self.callback_url)
    results = parallel_runner.run_batch(batch_run_params)["results"]

    alignment_items = []
    alignments = {}
    # Results come back in the same order as the submitted tasks, and so in
    # the same order as reads_refs.
    for reads_ref, condition, result in zip(reads_refs, conditions, results):
        if result["is_error"] != 0:
            raise RuntimeError(
                "Failed a parallel run of HISAT2! {}".format(
                    result["result_package"]["error"]))
        ref = reads_ref["ref"]
        alignment_obj = result["result_package"]["result"][0][
            "alignment_objs"][ref]
        alignment_items.append({
            "ref": alignment_obj["ref"],
            "label": condition
        })
        alignments[ref] = alignment_obj

    # build the final alignment set
    output_ref = self.upload_alignment_set(
        alignment_items,
        set_name + params["alignmentset_suffix"],
        params["ws_name"])
    return (alignments, output_ref)