Exemplo n.º 1
0
def fetch_reads_refs_from_sampleset(ref, ws_url, srv_wiz_url):
    """
    Expand an object reference into the list of reads objects it stands for.

    The type of ``ref`` is looked up first. A ReadsSet or RNASeqSampleSet is expanded through
    the SetAPI into one entry per member; a single PairedEndLibrary or SingleEndLibrary yields
    a one-element list describing itself. Each entry is a dictionary shaped like:
    {
        "ref": reads object reference (the ref path when it comes from a set),
        "condition": label stored for that item in the set (set members only),
        "name": reads object name (needed for saving an AlignmentSet)
    }
    Only "ref" and "name" are filled in for a single library; "condition" is added only for
    set members.

    :param ref: workspace reference of a reads set or a single reads library
    :param ws_url: workspace URL handed to the object type and name lookups
    :param srv_wiz_url: URL used to construct the SetAPI client
    :returns: list of dictionaries as described above
    :raises ValueError: if the object type is neither a supported set nor a reads library
    """
    obj_type = get_object_type(ref, ws_url)

    set_types = ("KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet")
    library_types = (
        "KBaseAssembly.SingleEndLibrary",
        "KBaseFile.SingleEndLibrary",
        "KBaseAssembly.PairedEndLibrary",
        "KBaseFile.PairedEndLibrary",
    )

    if any(type_name in obj_type for type_name in set_types):
        print("Looking up reads references in ReadsSet object")
        reads_set = SetAPI(srv_wiz_url).get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0,
            "include_set_item_ref_paths": 1
        })
        print("Got results from ReadsSet object")
        pprint(reads_set)

        items = reads_set["data"]["items"]
        # Resolve every member name in a single lookup, keyed by ref path.
        names_by_path = get_object_names([item["ref_path"] for item in items], ws_url)
        return [
            {
                "ref": item["ref_path"],
                "condition": item["label"],
                "name": names_by_path[item["ref_path"]]
            }
            for item in items
        ]

    if any(type_name in obj_type for type_name in library_types):
        # Already a single reads library: describe it directly.
        return [{
            "ref": ref,
            "name": get_object_names([ref], ws_url)[ref]
        }]

    raise ValueError("Unable to fetch reads reference from object {} "
                     "which is a {}".format(ref, obj_type))
 def test_run_hisat2_sampleset_ok(self):
     """
     Run HISAT2 on the single-end sample set and check the shape of the result.

     Verifies that a report and an alignment set are produced, that the alignment set name
     carries the requested suffix, and that there are exactly two per-reads alignments, each
     keyed by a ref path ending in a known reads ref and named with the alignment suffix.
     """
     set_suffix = "_sampleset_alignments"
     item_suffix = "_sampleset_alignment"

     res = self.get_impl().run_hisat2(self.get_context(), {
         "ws_name": self.ws_name,
         "sampleset_ref": self.single_end_sampleset,
         "genome_ref": self.genome_ref,
         "alignmentset_suffix": set_suffix,
         "alignment_suffix": item_suffix,
         "num_threads": 2,
         "quality_score": "phred33",
         "skip": 0,
         "trim3": 0,
         "trim5": 0,
         "np": 1,
         "min_intron_length": 20,
         "max_intron_length": 500000,
         "no_spliced_alignment": 0,
         "transcriptome_mapping_only": 0,
         "build_report": 1
     })[0]
     self.assertIsNotNone(res)
     print("Done with HISAT2 run! {}".format(res))

     # Report outputs.
     self.assertIn("report_ref", res)
     self.assertTrue(check_reference(res["report_ref"]))
     self.assertIn("report_name", res)

     # Alignment set output and its name.
     self.assertIn("alignmentset_ref", res)
     set_ref = res["alignmentset_ref"]
     self.assertTrue(check_reference(set_ref))
     set_name = get_object_names([set_ref], self.wsURL)[set_ref]
     self.assertTrue(set_name.endswith(set_suffix))

     # Per-reads alignments, keyed by the reads ref path.
     self.assertIn("alignment_objs", res)
     alignment_objs = res["alignment_objs"]
     self.assertTrue(len(alignment_objs.keys()) == 2)
     for reads_ref, alignment in alignment_objs.items():
         # The last element of the ';'-separated ref path is the reads object itself.
         ref_from_refpath = reads_ref.split(';')[-1]
         self.assertIn(ref_from_refpath, self.reads_refs)
         self.assertTrue(alignment["name"].endswith(item_suffix))
         self.assertTrue(check_reference(alignment["ref"]))
Exemplo n.º 3
0
    def run_single(self, reads_ref, params):
        """
        Performs a single run of HISAT2 against a single reads reference. The rest of the info
        is taken from the params dict - see the spec for details.

        :param reads_ref: dict with "ref" and "name" keys and an optional "condition" key
        :param params: run parameters; this method reads "genome_ref", "sampleset_ref",
            "alignment_suffix", "alignmentset_suffix", "ws_name" and optionally "condition",
            and passes the whole dict on to run_hisat2 and upload_alignment
        :returns: tuple (alignments, output_ref, alignment_set_ref) where alignments maps the
            reads ref to {"ref": uploaded alignment ref, "name": alignment name} and
            alignment_set_ref is None unless params["sampleset_ref"] is a set

        The downloaded reads files are removed before returning, whether the alignment and
        upload succeed or raise.
        """
        # 1. Get hisat2 index from genome.
        #    a. If it exists in cache, use that.
        #    b. Otherwise, build it
        idx_prefix = self.build_index(params["genome_ref"])

        # 2. Fetch the reads file and make sure input params are correct.
        reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)
        try:
            # if the reads ref came from a different sample set, then we need to drop that
            # reference inside the reads info object so it can be linked in the alignment
            if reads_ref["ref"] != params["sampleset_ref"]:
                reads["sampleset_ref"] = params["sampleset_ref"]
            # make sure condition info carries over if we have it
            if "condition" in reads_ref:
                reads["condition"] = reads_ref["condition"]
            elif "condition" in params:
                reads["condition"] = params["condition"]
            reads["name"] = reads_ref["name"]
            output_file = "accepted_hits"

            # 3. Finally all set, do the alignment and upload the output.
            alignment_file = self.run_hisat2(idx_prefix,
                                             reads,
                                             params,
                                             output_file=output_file)
            alignment_name = reads["name"] + params["alignment_suffix"]
            output_ref = self.upload_alignment(params, reads, alignment_name,
                                               alignment_file)

            alignment_set_ref = None
            if is_set(params["sampleset_ref"], self.workspace_url):
                # The input was a set: wrap the single alignment in an alignment set
                # named after the input set.
                set_name = get_object_names(
                    [params["sampleset_ref"]],
                    self.workspace_url)[params["sampleset_ref"]]
                alignment_set_name = set_name + params["alignmentset_suffix"]
                alignment_set_ref = self.upload_alignment_set(
                    [{
                        "ref": output_ref,
                        # Same fallback label as run_batch when no condition is known,
                        # instead of failing with a KeyError.
                        "label": reads.get("condition", "unspecified")
                    }], alignment_set_name, params["ws_name"])

            alignments = {
                reads_ref["ref"]: {
                    "ref": output_ref,
                    "name": alignment_name
                }
            }
            return (alignments, output_ref, alignment_set_ref)
        finally:
            # Always clean up the fetched reads files so a failed run does not leave
            # them behind on disk.
            os.remove(reads["file_fwd"])
            if "file_rev" in reads:
                os.remove(reads["file_rev"])
Exemplo n.º 4
0
    def run_batch(self, reads_refs, params):
        """
        Run HISAT2 once per reads object through KBParallel and bundle the results.

        reads_refs is a list of dicts, each shaped like:
        {
            "ref": reads object reference,
            "condition": condition for that ref (string, optional)
        }

        One kb_hisat2.run_hisat2 task is created per entry, using a copy of params in which
        "build_report" is 0, "sampleset_ref" is replaced by the entry's own ref, and
        "condition" is the entry's condition or "unspecified". The alignments from all tasks
        are then saved together as one alignment set named after the input set plus
        params["alignmentset_suffix"].

        :returns: tuple (alignments, output_ref) where alignments maps each reads ref to the
            alignment info reported by its task and output_ref is the value returned by
            upload_alignment_set
        :raises RuntimeError: if any task result has a non-zero "is_error"
        """
        sampleset_ref = params["sampleset_ref"]
        set_name = get_object_names([sampleset_ref], self.workspace_url)[sampleset_ref]

        # One task per reads object, each with its own copy of the parameters.
        tasks = []
        for reads_ref in reads_refs:
            task_params = dict(params)
            task_params.update({
                "build_report": 0,
                "sampleset_ref": reads_ref["ref"],
                "condition": reads_ref.get("condition", "unspecified")
            })
            tasks.append({
                "module_name": "kb_hisat2",
                "function_name": "run_hisat2",
                "version": self.my_version,
                "parameters": task_params
            })

        # The two concurrency options are left disabled; uncomment them for local testing.
        batch_run_params = {
            "tasks": tasks,
            "runner": "parallel",
            # "concurrent_local_tasks": 3,
            # "concurrent_njsw_tasks": 0,
            "max_retries": 2
        }
        results = KBParallel(self.callback_url).run_batch(batch_run_params)["results"]

        alignment_items = []
        alignments = {}
        # Results are indexed the same way as tasks and reads_refs.
        for position, outcome in enumerate(results):
            if outcome["is_error"] != 0:
                raise RuntimeError(
                    "Failed a parallel run of HISAT2! {}".format(
                        outcome["result_package"]["error"]))
            task_ref = tasks[position]["parameters"]["sampleset_ref"]
            alignment = outcome["result_package"]["result"][0]["alignment_objs"][task_ref]
            fallback_label = params.get("condition", "unspecified")
            alignment_items.append({
                "ref": alignment["ref"],
                "label": reads_refs[position].get("condition", fallback_label)
            })
            alignments[task_ref] = alignment

        # build the final alignment set
        alignment_set_name = set_name + params["alignmentset_suffix"]
        output_ref = self.upload_alignment_set(
            alignment_items, alignment_set_name, params["ws_name"])
        return (alignments, output_ref)