# Example #1
    def __init__(self, config):
        """Store configuration values and build the service clients.

        :param config: mapping that provides the "scratch", "ctx",
            "SDK_CALLBACK_URL" and "workspace-url" entries
        """
        self.scratch = config["scratch"]
        self.ctx = config["ctx"]
        callback_url = config["SDK_CALLBACK_URL"]
        self.callback_url = callback_url

        # The workspace client is built from the workspace URL; every other
        # client is built from the callback URL.
        self.ws_client = workspaceService(config["workspace-url"])
        self.kbr = KBaseReport(callback_url)
        self.genome_api = GenomeAnnotationAPI(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.au = AssemblyUtil(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.output_workspace = None
    @classmethod
    def setUpClass(cls):
        """Build the context, configuration, clients and workspace shared by the tests.

        Reads the auth token from ``KB_AUTH_TOKEN`` and the deployment config
        path from ``KB_DEPLOYMENT_CONFIG``, copies the ``GenomeFileUtil``
        config section into ``cls.cfg`` and creates one uniquely named
        workspace stored in ``cls.wsName``.
        """
        token = environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'provenance': [{
                'service': 'GenomeFileUtil',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = ConfigParser()
        # Without reading the file the parser has no sections and
        # config.items() raises NoSectionError.
        config.read(config_file)
        cls.cfg = dict(config.items('GenomeFileUtil'))
        cls.wsURL = cls.cfg['workspace-url']
        cls.ws = workspaceService(cls.wsURL, token=token)
        cls.gaa = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
        cls.serviceImpl = GenomeFileUtil(cls.cfg)

        # create one WS for all tests; the millisecond suffix keeps the name unique
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName
 def __init__(self, config):
     """Create the working directory and build the service clients.

     :param config: mapping that provides the "SDK_CALLBACK_URL", "scratch"
         and "workspace-url" entries
     """
     # NOTE(review): self.workdir is not assigned anywhere in this method;
     # presumably it is a class-level attribute - confirm.
     os.makedirs(self.workdir, exist_ok=True)
     self.config = config
     now = datetime.datetime.now()
     self.timestamp = now.strftime("%Y_%m_%d_%H_%M_%S")
     callback_url = config["SDK_CALLBACK_URL"]
     self.callback_url = callback_url
     self.scratch = config["scratch"]
     # All clients except the workspace client use the callback URL.
     self.genome_api = GenomeAnnotationAPI(callback_url)
     self.dfu = DataFileUtil(callback_url)
     self.gfu = GenomeFileUtil(callback_url)
     self.kbr = KBaseReport(callback_url)
     self.ws_client = Workspace(config["workspace-url"])
# Example #4
    @classmethod
    def setUpClass(cls):
        """Build the context, configuration, clients and workspace shared by the tests.

        Reads the auth token from ``KB_AUTH_TOKEN`` and the deployment config
        path from ``KB_DEPLOYMENT_CONFIG``, copies the
        ``kb_functional_enrichment_1`` config section into ``cls.cfg``,
        resolves the user for the token and creates one uniquely named
        workspace stored in ``cls.wsName``.
        """
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = ConfigParser()
        # Without reading the file the parser has no sections and
        # config.items() raises NoSectionError.
        config.read(config_file)
        cls.cfg = dict(config.items('kb_functional_enrichment_1'))
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_functional_enrichment_1',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.serviceImpl = kb_functional_enrichment_1(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.fe1_runner = FunctionalEnrichmentUtil(cls.cfg)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.gaa = GenomeAnnotationAPI(cls.callback_url)
        cls.ws = Workspace(cls.wsURL, token=token)

        # The millisecond suffix keeps the workspace name unique per run.
        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_functional_enrichment_1_" + str(suffix)
        cls.wsClient.create_workspace({'workspace': cls.wsName})

    @classmethod
    def setUpClass(cls):
        """Build the context, configuration, clients and workspace shared by the tests.

        Reads the auth token from ``KB_AUTH_TOKEN`` and the deployment config
        path from ``KB_DEPLOYMENT_CONFIG``, copies the ``PanGenomeAPI`` config
        section into ``cls.cfg``, resolves the user for the token and creates
        one uniquely named workspace whose info is stored in ``cls.ws_info``.
        """
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = ConfigParser()
        # Without reading the file the parser has no sections and
        # config.items() raises NoSectionError.
        config.read(config_file)
        cls.cfg = dict(config.items('PanGenomeAPI'))
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'PanGenomeAPI',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = PanGenomeAPI(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        # The millisecond suffix keeps the workspace name unique per run.
        suffix = int(time.time() * 1000)
        wsName = "test_pangenome_api_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': wsName})
        cls.gcs = GenomeComparisonSDK(cls.callback_url)
        cls.gaa = GenomeAnnotationAPI(cls.callback_url)
# Example #6
    def setUpClass(cls):
        # Builds the fixtures shared by the tests: a method context, the
        # 'GenomeFileUtil' configuration, service clients, one workspace and
        # several preloaded genome objects.
        # NOTE(review): this snippet has lost lines during extraction (several
        # calls and literals are left unclosed); each gap is marked below and
        # must be restored from the upstream source before this can run.
        token = environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        # NOTE(review): the statement that opens this dict literal (and its
        # closing brackets) is missing - presumably a cls.ctx.update({...}) call.
            'provenance': [{
                'service': 'GenomeFileUtil',
                'method': 'please_never_use_it_in_production',
                'method_params': []
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        # NOTE(review): config_file is never passed to the parser here, so the
        # section lookup below has nothing to read - a read call looks missing.
        for nameval in config.items('GenomeFileUtil'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.ws = workspaceService(cls.wsURL, token=token)
        cls.gaa = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
        cls.serviceImpl = GenomeFileUtil(cls.cfg)

        # create one WS for all tests
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        ret = cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        # NOTE(review): the 'file' entry and the closing brackets of this call
        # are missing.
        cls.ara_ref = cls.serviceImpl.genbank_to_genome(
            cls.ctx, {
                'file': {
                'workspace_name': cls.wsName,
                'genome_name': "arab",
                'source': 'Ensembl',

        # preload with reference data
        data = json.load(open('data/rhodobacter.json'))
        # save to ws
        save_info = {'workspace': wsName, 'data': data, 'name': 'rhodobacter'}
        info = cls.gaa.save_one_genome_v1(save_info)['info']
        # Object reference is assembled from fields of the returned info list.
        # NOTE(review): the final str(...) argument is missing on the next line.
        cls.rhodobacter_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
        print(('created rhodobacter test genome: ' + cls.rhodobacter_ref))

        # save new genome
        # NOTE(review): the second argument of os.path.join is missing.
        assembly_file_path = os.path.join(cls.cfg['scratch'],
        shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        # NOTE(review): the closing brackets of this call are missing.
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {
                'path': assembly_file_path
        data = json.load(open('data/e_coli/new_ecoli_genome.json'))
        data['assembly_ref'] = assembly_ref
        # save to ws
        # NOTE(review): the closing brackets of this literal are missing; no
        # workspace key is visible here either - confirm against upstream.
        save_info = {
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': 'new_ecoli'
        result = cls.ws.save_objects(save_info)
        info = result[0]
        cls.ecoli_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        # NOTE(review): this message prints cls.rhodobacter_ref although the
        # object just saved is referenced by cls.ecoli_ref - likely a copy/paste slip.
        print(('created ecoli test genome: ' + cls.rhodobacter_ref))

        # save a GFF file to shock, preload a genome pointing to it
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shutil.copy('data/rhodobacter.gtf', cls.cfg['scratch'])
        # NOTE(review): the dictionary key for the path and the closing
        # brackets of this call are missing.
        shock_file = dfu.file_to_shock({
            os.path.join(cls.cfg['scratch'], 'rhodobacter.gtf'),
        data['gff_handle_ref'] = shock_file['handle']['hid']

        # save to ws
        save_info['objects'][0]['name'] = 'rhodobacter_with_gff'
        result = cls.ws.save_objects(save_info)
        info = result[0]
        cls.rhodobacter_ref_with_gff = str(info[6]) + '/' + str(
            info[0]) + '/' + str(info[4])
        # NOTE(review): the rest of this print call is missing.
        print(('created rhodobacter test genome with handle: ' +
# Example #7
class ProkkaUtils:

    def __init__(self, config):
        """Store configuration values, build the service clients and reset state.

        :param config: mapping that provides the "scratch", "ctx",
            "SDK_CALLBACK_URL" and "workspace-url" entries
        """
        self.scratch = config["scratch"]
        self.ctx = config["ctx"]
        callback_url = config["SDK_CALLBACK_URL"]
        self.callback_url = callback_url

        # The workspace client is built from the workspace URL; every other
        # client is built from the callback URL.
        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(callback_url)
        self.au = AssemblyUtil(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.genome_api = GenomeAnnotationAPI(callback_url)

        # Ontology state starts empty; ec_to_sso maps EC numbers to SSO terms.
        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = dict()
        self.output_workspace = None

    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn"t exist
        if not key in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        return qualifier[0] if (qualifier and len(qualifier) > 0) else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        create a table from ec numbers to sso (ec_to_sso)

        Every ontology term whose name ends in "(EC x.y.z.w)" is appended to
        ``self.ec_to_sso[ec]``. The ontology is also dumped to
        ``/kb/module/work/seed_so.json``.

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for term in sso["term_hash"].values():
            sso_name = term["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                # Text between "(EC " and the trailing ")" is the EC number.
                ec = sso_name[sso_name.index("(EC ") + 4: -1].strip()
                # Store the term itself: consumers read its "id" and "name".
                self.ec_to_sso.setdefault(ec, []).append(term)
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a named tuple containing gc_content and dna_size
        :raises ValueError: if the assembly has 30,000 or more contigs
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            # Only fetch the object when the metadata lacks the contig count.
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly.
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set.
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins.
             These bins can then be individually annotated as a single genome using Prokka.
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes.
             Alternatively, you can try reducing the number of contigs using a filter app.
             """
            # The error text points at the logs, so the details are printed first.
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath: path of the FASTA file to rename
        :return: A named tuple with the path to the fasta file with renamed contigs, the
        number of contigs, the mapping from new ids to old ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for contig_counter, record in enumerate(
                SeqIO.parse(assembly_fasta_filepath, "fasta"), start=1):
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            # Keep the original id in the description of the renamed record.
            records.append(SeqRecord(record.seq, id=new_id, description="(" + old_id + ")"))
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old,
                                records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        :raises ValueError: if the unsupported "gram" parameter is set
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix", "mygenome"]

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        if params.get("kingdom"):
            prokka_cmd_list.extend(["--kingdom", str(params["kingdom"])])
        # --genus [X]       Genus name (triggers to use --usegenus)
        if params.get("genus"):
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if params.get("gcode"):
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if params.get("gram"):
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        # --rawproduct      Do not clean up /product annotation (default OFF)
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        for switch in ("metagenome", "rawproduct", "fast"):
            if params.get(switch) == 1:
                prokka_cmd_list.append("--" + switch)
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if params.get("mincontiglen"):
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if params.get("evalue"):
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        # --norrna          Don't run rRNA search (default OFF)
        # --notrna          Don't run tRNA search (default OFF)
        for switch in ("rfam", "norrna", "notrna"):
            if params.get(switch) == 1:
                prokka_cmd_list.append("--" + switch)
        # The input FASTA is the positional argument and goes last.
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        # tbl2asn or some other non essential prokka binary will fail, so suppress that
        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            # Deliberately best-effort: report the failure and keep going.
            print("Prokka exited with a non-zero status (ignored): " + str(e))
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """Gather up the relevant prokka results, load the records from the results files

        :param output_dir: directory holding the mygenome.faa/.ffn/.gff files
        :return: A named tuple containing the sequences from the .faa and .ffn files
        (keyed by record id) and the gff_filepath
        :raises ValueError: if the GFF output file does not exist
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {record.id: str(record.seq)
                       for record in SeqIO.parse(faa_file, "fasta")}
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {record.id: str(record.seq)
                      for record in SeqIO.parse(ffn_file, "fasta")}
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object for
        genome annotation only.

        :param prokka_parse_parameters: gff_filepath, cds_to_dna, cds_to_prot and
            new_ids_to_old mappings
        :return: A named tuple with Genome:features Genome:cdss Genome:mrnas and the
            report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    # Convert the parsed start to a 1-based inclusive position.
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    # Minus-strand features start at their highest coordinate.
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = [alias for alias in (name, gene, ec) if alias]
                    if ec:
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode()).hexdigest()
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna)}
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_terms = {}
                        for sso_item in self.ec_to_sso[ec]:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    features.append(feature)

                    # Only features with a protein translation get CDS/mRNA records.
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cdss.append({"id": cds_id, "location": location, "md5": md5,
                                     "parent_gene": fid, "parent_mrna": mrna_id,
                                     "function": (product if product else ""),
                                     "ontology_terms": {}, "protein_translation": prot,
                                     "protein_translation_length": prot_len,
                                     "aliases": aliases})
                        mrnas.append({"id": mrna_id, "location": location, "md5": md5,
                                      "parent_gene": fid, "cds": cds_id})

        # Guard the average so an annotation without proteins does not divide by zero.
        avg_prot_len = int(sum(prot_lengths) / float(len(prot_lengths))) if prot_lengths else 0

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(avg_prot_len) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """Collect the function and SSO ontology terms Prokka assigned to each GFF record.

        :param gff_filepath: path of the Prokka GFF output file
        :return: dict mapping each GFF record id to a dict with "id" and, when
            present in the record's features, "function" and "ontology_terms"
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                # Store the record id (the original stored the builtin `id` function).
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        sso_terms = {}
                        for ec in qualifiers["eC_number"]:
                            # EC numbers without a known SSO term contribute nothing.
                            for sso_item in self.ec_to_sso.get(ec, []):
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):

        :param genome_data:
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """Create the SSO ontology event describing this Prokka annotation run.

        The module version is read from /kb/module/kbase.yml and the SSO
        reference from ``self.sso_ref``.

        :return: Ontology_event to be appended to the list of genome ontology events
        """
        # Local import: the file-level import block is outside this view.
        import datetime

        # NOTE(review): the timestamp expression was truncated in this copy;
        # this format is an assumption - confirm against upstream.
        time_string = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        # The version is on the line following "module-version:".
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """Create a dict for the evidence field for the genome

        The module version is read from /kb/module/kbase.yml.

        :return: dict with the "method", "method_version" and "timestamp" of this run
        """
        # Local import: the file-level import block is outside this view.
        import datetime

        # NOTE(review): the timestamp expression was truncated in this copy;
        # this format is an assumption - confirm against upstream.
        time_string = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        # The version is on the line following "module-version:".
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """Create ontology event fields for a genome object

        The new SSO event is appended to an existing ``ontology_events`` list,
        or becomes the first entry of a new list.

        :param genome_data: A genome object's data field
        :return: a named tuple containing the modified genome object and the index of
            the new ontology event
        """
        sso_event = self.make_sso_ontology_event()

        # Make sure ontology_events exists, keeping any events already recorded.
        events = genome_data['data'].setdefault('ontology_events', [])
        events.append(sso_event)
        ontology_event_index = len(events) - 1

        genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    def old_genome_ontologies(feature, new_ontology):
        Update the feature's ontologies for an old genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :return: The feature with the ontology updated, in the old style
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        Update the feature's ontologies for a new genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :param ontology_event_index: Ontology index to update the feature with
        :return: the updated feature
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            id = new_ontology[key]["id"]
            if id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """Annotate the genome with new annotations for Genome ReAnnotation

        Writes a function report and an ontology report to scratch, updates the
        genome's features in place and saves the genome to the output workspace.

        :param annotation_args: genome_data from the genome obj, new_annotations from
            prokka, and the output_genome_name
        :return: A named tuple containing the genome_ref, filepaths for the function and
            ontology summary, and stats about the annotations
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        # Genomes that carry 'feature_counts' are handled as new-style genomes.
        new_genome = 'feature_counts' in genome_data['data']
        ontology_event_index = None
        if new_genome:
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        features = genome_data["data"]["features"]
        stats = {"current_functions": len(features), "new_functions": 0,
                 "found_functions": 0, "new_ontologies": 0}

        # Each path variable names the report it holds, so the returned
        # tuple fields match the file contents.
        function_summary_fp = os.path.join(self.scratch, "function_report")
        ontology_summary_fp = os.path.join(self.scratch, "ontology_report")

        ontologies_present = {"SSO": {}}
        with open(function_summary_fp, "w") as func_r, open(ontology_summary_fp, "w") as onto_r:
            func_r.write("function_id current_function new_function\n")
            onto_r.write("function_id current_ontology new_ontology\n")

            for i, feature in enumerate(features):
                fid = feature["id"]
                current_function = feature.get("function", "")
                current_functions = feature.get("functions", [])
                current_ontology = feature.get("ontology_terms", None)
                new_function = ""
                new_ontology = dict()

                if fid in new_annotations:
                    # Set Function
                    new_function = new_annotations[fid].get("function", "")
                    if new_function and "hypothetical protein" not in new_function:
                        if (new_function != current_function
                                and new_function not in current_functions):
                            stats['new_functions'] += 1
                        feature["function"] = new_function
                        feature["functions"] = [new_function]
                        stats['found_functions'] += 1

                    # Set Ontologies
                    new_ontology = new_annotations[fid].get("ontology_terms", None)
                    if new_ontology:
                        stats['new_ontologies'] += 1
                        if new_genome:
                            # New style
                            features[i] = self.new_genome_ontologies(
                                feature, new_ontology, ontology_event_index)

                            # Add to ontologies Present
                            for term in new_ontology.values():
                                # Terms built in this class carry "term_name";
                                # "name" is kept as a fallback.
                                name = term.get("term_name", term.get("name", "Unknown"))
                                ontologies_present["SSO"][term["id"]] = name
                        else:
                            # Old style
                            features[i] = self.old_genome_ontologies(feature, new_ontology)

                if current_function:
                    func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
                else:
                    func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

                onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        # Merge into any existing ontologies_present without dropping other
        # ontology namespaces already recorded on the genome.
        present = genome_data["data"].setdefault("ontologies_present", {})
        present.setdefault("SSO", {}).update(ontologies_present["SSO"])

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_summary_filepath ontology_summary_filepath stats")

        return annotated_genome(genome_ref, function_summary_fp, ontology_summary_fp,
                                stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock.

        :param filepath: File to upload
        :param message: Optional Upload Message, stored as the entry's description
        :return: dict with ``shock_id`` (id returned by the upload), ``name`` and
                 ``label`` (both the base name of ``filepath``) and ``description``
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print(f"Uploaded filepath {filepath} to shock and got id {output_file_shock_id}")
        # The base name doubles as display name and label of the uploaded file.
        basename = os.path.basename(filepath)
        return {"shock_id": output_file_shock_id,
                "name": basename,
                "label": basename,
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats; read here
                       through its ``genome_ref``, ``stats``,
                       ``ontology_summary_filepath`` and ``function_summary_filepath``
                       attributes
        :return: dict with ``output_genome_ref``, ``report_name`` and ``report_ref``
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        # Both summary files are attached to the report as downloadable links.
        # NOTE(review): the second entry is restored from the tuple's
        # ``function_summary_filepath`` field -- confirm against the upstream file.
        file_links = [self.upload_file(genome.ontology_summary_filepath),
                      self.upload_file(genome.function_summary_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"], stats["new_functions"],
                                   stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             # A random suffix keeps report object names unique per run.
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        # NOTE(review): the subscript after the call is reconstructed as
        # ["genomes"][0] (first entry of the returned genome list) -- confirm
        # against the GenomeAnnotationAPI spec.
        genome_data = self.genome_api.get_genome_v1(
            {"genomes": [{"ref": genome_ref}], 'downgrade': 0})["genomes"][0]

        # Pipeline: genome -> FASTA -> Prokka run -> parsed results -> merged genome.
        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        # NOTE(review): the keyword arguments after ``genome_data`` are reconstructed
        # from the otherwise unused locals ``new_annotations`` / ``output_name`` and
        # from the callee reading annotation_args["output_genome_name"] -- confirm.
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def save_genome(self, params, prokka_results, renamed_assembly, assembly_ref):
        """
        Save KBaseGenomes.Genome object,

        inputs:
            params           - input parameters from .spec
            prokka_results   - result files from prokka run
            renamed_assembly - assembly object with renamed contigs
            assembly_ref     - reference to input assembly object
        outputs:
            genome_ref: saved genome object reference
            report_message: report message taken from the parsed Prokka results
        """
        # Parse Results
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")

        # NOTE(review): only the ``gff_filepath`` argument is visible in the reviewed
        # text; the remaining keyword arguments are reconstructed -- confirm them
        # against parse_prokka_results before merging.
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = params.get('scientific_name') or 'Unknown'
        domain = params.get('kingdom') or "Bacteria"
        gcode = params.get('gcode') or 0

        # NOTE(review): ``assembly_info`` is not bound anywhere in the visible body
        # or signature, so the two lookups below raise NameError as written --
        # confirm where it is supposed to come from.
        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        # Reference string is built from info fields 6, 0 and 4, joined by "/".
        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        return genome_ref, annotated_assembly.report_message

    def _replace_id(self, line, new_ids_to_old, fasta=False):
        """
        Replace the leading sequence id of one FASTA or GFF line.

        inputs:
            line           - text line to replace id of
            new_ids_to_old - dict of newly assigned ids to input (old) ids
            fasta          - bool whether file is fasta, else is gff file
        outputs:
            returned text line

        Raises KeyError when the id found on the line is not in ``new_ids_to_old``.
        """
        if fasta:
            # Only header lines carry an id; sequence lines (and empty lines, which
            # previously raised IndexError on ``line[0]``) pass through unchanged.
            if not line.startswith('>'):
                return line
            tokens = line.split()
            id_ = tokens[0][1:].strip()
            if len(tokens) > 1:
                # Keep the free-text description that follows the id.
                return '>' + new_ids_to_old[id_] + ' ' + ' '.join(tokens[1:])
            return '>' + new_ids_to_old[id_]
        # GFF: the id is the first tab-separated column; the rest is kept verbatim.
        id_, *rest = line.split('\t')
        return '\t'.join([new_ids_to_old[id_]] + rest)

    def _rename_and_separate_gff(self, gff, new_ids_to_old):
        """
        rename the output gff file ids and separate the fasta file from the gff3.

        inputs:
            gff            - path to gff_file
            new_ids_to_old - dict of newly assigned ids to input (old) ids
        outputs:
            (gff_path, fasta_path) - paths of the two files written next to ``gff``
        """
        fasta_lines = []
        gff_lines = []
        with open(gff) as gff_handle:
            for record in gff_handle:
                if '##FASTA' in record:
                    # Everything after the marker is the embedded FASTA section, so
                    # drain the rest of the file here.
                    for fasta_line in gff_handle:
                        fasta_lines.append(self._replace_id(fasta_line, new_ids_to_old, True))
                    break
                if '##' in record:
                    # NOTE(review): the body of this branch is reconstructed as
                    # ``continue`` (directive lines are skipped, not rewritten) --
                    # confirm against the upstream file.
                    continue
                gff_lines.append(self._replace_id(record, new_ids_to_old))

        gff_path = gff + "_edited.gff"
        with open(gff_path, 'w') as out:
            for record in gff_lines:
                out.write(record.strip() + '\n')

        fasta_path = gff + "edited.fa"
        with open(fasta_path, 'w') as out:
            for fasta_line in fasta_lines:
                out.write(fasta_line.strip() + '\n')
        return gff_path, fasta_path

    def save_metagenome(self, params, gff_file, fasta_file):
        """
        Save a GFF/FASTA pair as an annotated metagenome assembly.

        inputs:
            params     - input "params" from .spec
            gff_file   - path to gff_file to save as Metagenome
            fasta_file - path to fasta_file to save as Metagenome
        outputs:
            metagenome_ref - saved KBaseMetagenomes.AnnotatedMetagenomeAssembly object ref
        """
        output_name = self._get_input_value(params, "output_metagenome_name")
        output_workspace = self._get_input_value(params, "output_workspace")

        # NOTE(review): the ['metagenome_ref'] subscript is reconstructed (callers
        # use the result directly as an object reference) -- confirm against the
        # GenomeFileUtil spec.
        metagenome_ref = self.gfu.fasta_gff_to_metagenome({
            "fasta_file": {'path': fasta_file},
            "gff_file": {'path': gff_file},
            "genome_name": output_name,
            "workspace_name": output_workspace,
            "generate_missing_genes": True
        })['metagenome_ref']

        return metagenome_ref

    def annotate_metagenome(self, params):
        """
        Given a KBaseMetagenome.AnnotatedMetagenomeAssembly object, reannotate it using Prokka.
        Saves a KBaseMetagenome.AnnotatedMetagenomeAssembly as output.

        inputs:
            params - input "params" from .spec
        outputs:
            output_metagenome_ref - saved KBaseMetagenomes.AnnotatedMetagenomeAssembly object ref
            report_name - name of outgoing report object
            report_ref  - reference to Report object
        """
        metagenome_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_metagenome_name")
        output_workspace = self._get_input_value(params, "output_workspace")

        # The input object points at its assembly; that assembly is what Prokka runs on.
        # orig_fasta_file = self.au.get_fastas({'ref_lst': [metagenome_ref]})
        obj_data = self.dfu.get_objects({"object_refs": [metagenome_ref]})['data'][0]['data']
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": obj_data['assembly_ref']})["path"]

        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        output_dir = self.run_prokka(params, renamed_assembly.filepath)

        # need to analyse output gff and fastas from prokka.
        gff_file, fasta_file = self._rename_and_separate_gff(
            output_dir + "/mygenome.gff", renamed_assembly.new_ids_to_old)
        # From here on ``metagenome_ref`` names the newly saved object, not the input.
        metagenome_ref = self.save_metagenome(params, gff_file, fasta_file)

        report_message = "Metagenome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n"

        report_info = self.kbr.create_extended_report({
            "message": report_message,
            "objects_created": [{"ref": metagenome_ref,
                                 "description": "Annotated Metagenome Assembly"}],
            "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name": output_workspace
        })

        return {
            "output_metagenome_ref": metagenome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps include to download the assembly as a fasta file,
        rename the contigs, run prokka against the contigs, parse the results, and finally,
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """

        output_workspace = params["output_workspace"]
        # The ``metagenome`` flag selects the output type, output key and name parameter.
        is_metagenome = params.get('metagenome')
        if is_metagenome:
            save_type = "Annotated Metagenome Assembly"
            output_field_name = 'output_metagenome_ref'
            output_name = self._get_input_value(params, "output_metagenome_name")
        else:
            save_type = "Annotated Genome"
            output_field_name = "output_genome_ref"
            output_name = self._get_input_value(params, "output_genome_name")

        assembly_ref = self._get_input_value(params, "object_ref")
        output_workspace = self._get_input_value(params, "output_workspace")
        # for now, don't do this check if we are using a metagenome
        if not is_metagenome:
            assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)

        # Prokka_results
        if is_metagenome:
            gff_file, fasta_file = self._rename_and_separate_gff(
                output_dir + "/mygenome.gff", renamed_assembly.new_ids_to_old)
            genome_ref = self.save_metagenome(params, gff_file, fasta_file)
            # The metagenome path produces no per-feature summary message.
            report_message = ""
        else:
            prokka_results = self.retrieve_prokka_results(output_dir)
            genome_ref, report_message = self.save_genome(params, prokka_results,
                                                          renamed_assembly, assembly_ref)
        report_message = f"{save_type} saved to: " + output_workspace + "/" + \
                         output_name + "\n" + report_message

        report_info = self.kbr.create_extended_report({
            "message": report_message,
            "objects_created": [{"ref": genome_ref, "description": save_type}],
            "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name": output_workspace
        })

        return {
            output_field_name: genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }
예제 #8
class GenomeReportUtils:
    """
    Utilities for generating genome reports
    """

    def __init__(self, config):
        """
        Store configuration values and create the service clients.

        :param config: mapping read here for the keys ``scratch``, ``ctx``,
                       ``SDK_CALLBACK_URL`` and ``workspace-url``
        """
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        # The workspace client uses its own URL; every other client is built
        # from the callback URL.
        self.ws_client = workspaceService(config["workspace-url"])
        self.kbr = KBaseReport(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        # Filled in per request by the report methods.
        self.output_workspace = None

    def upload_file(self,
                    filepath,
                    message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock.

        :param filepath: File to upload
        :param message: Optional Upload Message, stored as the entry's description
        :return: dict with ``shock_id`` (id returned by the upload), ``name`` and
                 ``label`` (both the base name of ``filepath``) and ``description``
        """
        # NOTE(review): the ``filepath`` parameter, the end of the upload call and the
        # ``print(`` wrapper are restored from how the body uses them -- confirm
        # against the upstream file.
        output_file_shock_id = self.dfu.file_to_shock({"file_path":
                                                       filepath})["shock_id"]
        print(
            f"Uploaded filepath {filepath} to shock and got id {output_file_shock_id}"
        )
        return {
            "shock_id": output_file_shock_id,
            "name": os.path.basename(filepath),
            "label": os.path.basename(filepath),
            "description": message
        }

    def _write_genome_js(self, js_template, dataset=None, rows=None):
        """
        _write_genome_js: Generate the js script that handles the data presentation and interaction

        :param js_template: template path, resolved relative to this module's directory
        :param dataset: optional data substituted for the ``["put your data here"]``
                        placeholder; skipped when empty or None
        :param rows: optional rows substituted for the ``["put your rows here"]``
                     placeholder; skipped when empty or None
        :return: the template content wrapped in ``<script>`` tags
        """
        # Local import so this method does not rely on a module-level ``json`` import.
        import json

        log(f'start writing js script based on template {js_template}...')
        template_path = os.path.join(os.path.dirname(__file__), js_template)
        with open(template_path) as js_file:
            js_content = '<script>' + js_file.read() + '</script>'

        # NOTE(review): the replacement values are reconstructed as JSON dumps of the
        # inputs (the placeholders are JS array literals) -- confirm against the
        # upstream file.
        if dataset:
            js_content = js_content.replace('["put your data here"]',
                                            json.dumps(dataset))
        if rows:
            js_content = js_content.replace('["put your rows here"]',
                                            json.dumps(rows))
        return js_content

    def _generate_genome_html(self, out_dir, genome_ref):
        # Builds an HTML genome report file and returns the tuple
        # (html_report, curr_func, report_title).
        #   out_dir    - directory the 'genome_report.html' path is built from
        #   genome_ref - reference passed to genome_api.get_genome_v1
        # NOTE(review): this block does not parse as shown. The docstring delimiters
        # are gone and several statements look cut short (flagged inline below).
        # The code is left untouched; restore the missing lines from the upstream
        # file before relying on it.
        _generate_genome_html: generate html report on genome
        log(f'start generating html report on {genome_ref}...')
        # NOTE(review): the call below is never closed in the visible text; later
        # code indexes the result with "data", "info", "creator", "created", "path"
        # and "orig_wsid", so a subscript on the response may be missing -- confirm.
        genome_obj = self.genome_api.get_genome_v1({
            "genomes": [{
                "ref": genome_ref

        # Number of features in the fetched genome; returned to the caller.
        curr_func = len(genome_obj["data"]["features"])

        # feature_counts is exposed twice: as name/value dicts and as table rows
        # preceded by a header row.
        g_feature_counts = genome_obj['data']['feature_counts']
        dataset = [{
            "name": p[0],
            "value": p[1]
        } for p in g_feature_counts.items()]
        rows = [[p[0], p[1]] for p in g_feature_counts.items()]
        rows.insert(0, ['feature name', 'feature counts'])

        # Element 1 of the info entry is used as the display name.
        genome_name = genome_obj['info'][1]
        report_title = f'Genome report on {genome_name}'

        html_report = list()
        report_file_path = os.path.join(out_dir, 'genome_report.html')

        header_content = f'<header><h3>Genome Report-{genome_name}</h3></header>'
        js_content1 = self._write_genome_js('./js/line_chart.js')
        js_content2 = self._write_genome_js('./js/pie_chart.js')
        # NOTE(review): the next call is missing its remaining argument(s) and
        # closing parenthesis; presumably ``dataset`` is passed here -- confirm.
        js_content3 = self._write_genome_js('./js/bar_chart_anim.js',
        js_content4 = self._write_genome_js('./js/table_chart.js', rows=rows)
        log(f'adding the summary portion...')
        # NOTE(review): the summary expression is never closed (the end of the div
        # markup and the closing parenthesis are not visible).
        summ_content = (
            f'<div id="brief_description">\n'
            f'Genome {genome_name} was created by {genome_obj["creator"]} on {genome_obj["created"]}.'
            f'Path is {genome_obj["path"]}, orig_wsid={genome_obj["orig_wsid"]}.'
            f'<br>Details of the analysis, including genes of interest (Specialty Genes),'
            f'a functional categorization (Subsystems), and a phylogenetic tree'
            f'(Phylogenetic Analysis) are provided below.'
        # NOTE(review): this log call lacks its closing parenthesis.
        log(f'start writing the reference portion based on template citations.html...'
        footer_content = ''
        # NOTE(review): the first line(s) of the path expression are missing; only
        # the './html/citations.html' component is visible.
        with open(
                             './html/citations.html'), 'r') as footer_file:
            footer_content = footer_file.read()

        # Marker strings in the report template that are swapped for the generated
        # content below.
        header_placeholder = '<header><h3 id="report_header_placeholder"></h3></header>'
        summary_placeholder = '<div id="brief_description"></div>'
        js_placeholder1 = '<script src="javascript_placeholder1.js"></script>'
        js_placeholder2 = '<script src="javascript_placeholder2.js"></script>'
        js_placeholder3 = '<script src="javascript_placeholder3.js"></script>'
        js_placeholder4 = '<script src="javascript_placeholder4.js"></script>'
        footer_placeholder = '<div id="report_footer_placeholder"></div>'

        log(f'assembling the report portions together...')
        with open(report_file_path, 'w') as report_file:
            # NOTE(review): the path of the report template is not visible in the
            # text; only the open mode survives.
            with open(
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    header_placeholder, header_content)
                report_template = report_template.replace(
                    summary_placeholder, summ_content)
                report_template = report_template.replace(
                    js_placeholder1, js_content1)
                report_template = report_template.replace(
                    js_placeholder2, js_content2)
                report_template = report_template.replace(
                    js_placeholder3, js_content3)
                report_template = report_template.replace(
                    js_placeholder4, js_content4)
                report_template = report_template.replace(
                    footer_placeholder, f'<div>{footer_content}</div>')
                # NOTE(review): this log call lacks its closing parenthesis, and no
                # visible statement writes ``report_template`` to ``report_file``.
                log(f'The report with js script has been written as in:\n {report_template}'

            # NOTE(review): stray string literal; presumably the description field of
            # an entry appended to ``html_report``, which is otherwise never filled
            # in the visible code -- confirm.
            'Genome report with table(s) and/or chart(s)'

        return (html_report, curr_func, report_title)

    def report_genome(self, params, html_links=[]):
        """ Create report output with (reannotated) assembly/genome, and some stats
        :param genome: (reannotated) Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        self.output_workspace = params['output_workspace']
        genome_ref = params['object_ref']

        ann_by = ''
        if params.get('annotated_by', None):
            ann_by = params['annotated_by']

        report_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        html_files, curr_func, rpt_title = self._generate_genome_html(
            report_dir, genome_ref)

        if params.get('gn_stats', None):
            genome_stats = params['gn_stats']
            genome_stats = {
                "current_functions": curr_func,
                "new_functions": 0,
                "found_functions": 0,
                "new_ontologies": 0

        html_links += html_files

        # file_links = [self.upload_file(genome.ontology_summary_filepath, ann_by),
        #              self.upload_file(genome.function_summary_filepath, ann_by)]

        report_message = (
            f"Genome Ref: {genome_ref}"
            f"Number of features: {genome_stats['current_functions']}"
            f"New functions found:{genome_stats['new_functions']}"
            f"Ontology terms found:{genome_stats['new_ontologies']}")

        report_info = self.kbr.create_extended_report({
            "objects_created": [{
                "ref": genome_ref,
                "description": "Input genome"
            # "file_links": file_links,
            "genome_report_" + str(uuid.uuid4()),

        return {
            "genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]