class ImportBulkAnnotationsUtil:

    workdir = 'tmp/work/'
    staging_dir = "/staging/"
    datadir = "/kb/module/data/"

    def __init__(self, config):
        os.makedirs(self.workdir, exist_ok=True)
        self.config = config
        self.timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.ws_client = Workspace(config["workspace-url"])

        self.genes = {}

    def generate_report(self, params, genome_ref):
        """
        Reads in the results from the summary method, and creates the html
        report.
        """

        summary = mu.summarize(params, self.genes)

        output_html_files = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'import_annotations_summary.html')

        # Build HTML tables for results
        table_lines = []
        table_lines.append('<h2>Import Annotations</h2>')
        table_lines.append('<h3>Summary</h3>')
        table_lines.append(
            '<table cellspacing="0" cellpadding="3" border="1"><tr><th>TYPE</th><th>VALID</th><th>INVALID</th></tr>'
        )
        table_lines.append('<tr><td>GENES</td><td>' +
                           str(len(summary['valid_genes'])) + '</td><td>' +
                           str(len(summary['invalid_genes'])) + '</td></tr>')
        table_lines.append('<tr><td>TERMS</td><td>' +
                           str(len(summary['valid_terms'])) + '</td><td>' +
                           str(len(summary['invalid_terms'])) + '</td></tr>')
        table_lines.append('</table>')

        if len(summary['invalid_genes']) > 0:
            table_lines.append('<h3>Invalid Genes</h3>')
            table_lines.append(
                '<i>These are locus_tags not identified in the genome object. Frequency shown in parentheses.</i><br><br>'
            )

            invalid_genes_count = dict(Counter(summary['invalid_genes']))

            for gene in sorted(invalid_genes_count.keys()):
                gene_count = gene + '\t(' + str(
                    invalid_genes_count[gene]) + ')'
                table_lines.append(gene_count + '<br>')

        if len(summary['invalid_terms']) > 0:
            table_lines.append('<h3>Invalid Terms</h3>')
            table_lines.append(
                '<i>These are ontology terms not found in the ontology dictionary. Frequency shown in parentheses.</i><br><br>'
            )

            invalid_terms_count = dict(Counter(summary['invalid_terms']))

            for term in sorted(invalid_terms_count.keys()):
                term_count = term + '\t(' + str(
                    invalid_terms_count[term]) + ')'
                table_lines.append(term_count + '<br>')

        # Write to file
        with open(result_file_path, 'w') as result_file:
            for line in table_lines:
                result_file.write(line + "\n")

        output_html_files.append({
            'path':
            output_directory,
            'name':
            os.path.basename(result_file_path),
            'description':
            'HTML report for import_annotations app'
        })

        report_params = {
            'message':
            '',
            'html_links':
            output_html_files,
            'direct_html_link_index':
            0,
            'objects_created': [{
                'ref':
                genome_ref,
                'description':
                'Genome with imported annotations'
            }],
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            f'import_annotations_{uuid.uuid4()}'
        }

        output = self.kbr.create_extended_report(report_params)

        return {
            'output_genome_ref': genome_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def run(self, ctx, params):

        # get genome object, store as a dictionary
        self.genome = mu.get_genome(params['genome'], self.genome_api)

        bulk_annotations = mu.get_bulk_annotations_file(
            params, self.staging_dir)
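        # bulk_annotations is assumed to be a pandas dataframe with (at least)
        # 'gene', 'term', 'ontology' and 'description' columns; each unique
        # description/ontology pair becomes its own ontology event below.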

        # mu.validate_bulk(bulk_annotations)

        # identify and process each pair of descriptions/ontologies
        pairs = mu.get_description_ontology_pairs(bulk_annotations)

        for index, row in pairs.iterrows():

            # make a copy of params and add event-specific info
            pair_params = params.copy()
            pair_params['description'] = row['description']
            pair_params['ontology'] = row['ontology']

            genes = {}

            # sso_ref = mu.get_sso_data(pair_params['ontology'], self.ws_client)
            self.genome = mu.add_ontology_event(self.genome, pair_params,
                                                self.timestamp,
                                                "Import Bulk Annotations")

            self.current_ontology_event = len(
                self.genome['ontology_events']) - 1

            ontology_dict = mu.get_ontology_dict(pair_params['ontology'],
                                                 self.datadir,
                                                 mu.ontology_lookup)

            annotations = bulk_annotations[
                (bulk_annotations.description == row['description'])
                & (bulk_annotations.ontology == row['ontology'])][[
                    'gene', 'term'
                ]]

            genes = mu.annotations_to_genes(annotations, genes)

            for gene in genes:
                genes[gene].validate_gene_ID(self.genome)
                genes[gene].validate_annotation_ID(ontology_dict,
                                                   pair_params['ontology'])

            self.genome = mu.update_genome(self.genome,
                                           pair_params['ontology'], genes,
                                           self.current_ontology_event)

            # accumulate the validated genes across all events for the report
            self.genes.update(genes)

        info = self.gfu.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['output_name'],
            'data': self.genome,
            'provenance': ctx.provenance()
        })['info']

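        # workspace object info tuple: info[6] is the workspace id, info[0]
        # the object id and info[4] the version, so a ref is "wsid/objid/ver"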
        genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        logging.info("*** Genome ID: " + str(genome_ref))

        return self.generate_report(params, genome_ref)
class ProkkaUtils:
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
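
        Example (illustrative):
            >>> ProkkaUtils._get_input_value({"output_workspace": "my_ws"},
            ...                              "output_workspace")
            'my_ws'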
        """
        if key not in params:
            raise ValueError("Parameter " + key +
                             " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
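
        Example (illustrative):
            >>> ProkkaUtils._get_qualifier_value(["YP_12345"])
            'YP_12345'
            >>> ProkkaUtils._get_qualifier_value(None) is None
            True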
        """
        return qualifier[0] if qualifier else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{
            "ref":
            "KBaseOntology/seed_subsystem_ontology"
        }])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
                self.ec_to_sso.setdefault(ec, []).append(
                    sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(
            sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            #raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: A tuple with the path to the fasta file of renamed contigs, the number of contigs,
        the mapping from new ids to old ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence,
                               id=new_id,
                               description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple(
            "renamed_assembly",
            "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath,
                                contig_counter, new_ids_to_old, records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        # --metagenome is added below only when requested via params
        prokka_cmd_list = [
            "perl", "/kb/prokka/bin/prokka", "--outdir", output_dir,
            "--prefix", "mygenome", "--kingdom", kingdom
        ]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(
                ["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError(
                "gram parameter is not supported in current Prokka installation"
            )
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(
                ["--mincontiglen",
                 str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        # tbl2asn or some other non-essential Prokka binary may fail, so suppress that error
        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: A tuple containing Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results",
                                    "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object for
        genome annotation only.

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: A tuple with Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
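                    # KBase location tuple: [contig, 1-based start on the
                    # strand, strand, feature length]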
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(
                        qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(
                        qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode("utf-8")).hexdigest()
                    feature = {
                        "id": fid,
                        "location": location,
                        "type": "gene",
                        "aliases": aliases,
                        "md5": md5,
                        "dna_sequence": dna,
                        "dna_sequence_length": len(dna),
                    }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {
                                "id": sso_item["id"],
                                "evidence": [evidence],
                                "term_name": sso_item["name"],
                                "ontology_ref": self.sso_ref,
                                "term_lineage": []
                            }
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {
                            "id": cds_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "parent_mrna": mrna_id,
                            "function": (product if product else ""),
                            "ontology_terms": {},
                            "protein_translation": prot,
                            "protein_translation_length": prot_len,
                            "aliases": aliases
                        }
                        mrna = {
                            "id": mrna_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "cds": cds_id
                        }
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(
            len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(
            non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(
            genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(
            genes_with_sso) + "\n"
        report += "Average protein length: " + str(
            int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly",
                                        "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": id}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(
                            qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {
                                    "id": sso_item["id"],
                                    "evidence": [evidence],
                                    "term_name": sso_item["name"],
                                    "ontology_ref": self.sso_ref,
                                    "term_lineage": []
                                }

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(
            self.scratch, "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] +
                            "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty."
            )

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """
        Create a dict for the evidence field for the genome
        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """
        Create ontology event fields for a genome object
        :param genome_data:  A genome object's data filed
        :return: a named tuple containg the modified genome object and a new ontology event index
        """
        # Make sure ontology_events exists, then record this run's event
        sso_event = self.make_sso_ontology_event()

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
        else:
            genome_data['data']['ontology_events'] = [sso_event]
        ontology_event_index = len(genome_data['data']['ontology_events']) - 1

        genome_obj_modified = namedtuple('genome_obj_modified',
                                         'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        """
        Update the feature's ontologies for an old genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :return: The feature with the ontology updated, in the old style
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        """
        Update the feature's ontologies for a new genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :param ontology_event_index: Ontology index to update the feature with
        :return: the updated feature
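
        In the "new" genome style, feature["ontology_terms"]["SSO"] maps each
        term id to the list of ontology event indices that assigned it, e.g.
        (illustrative) {"SSO:000009304": [0, 2]}, rather than to the full term
        dict stored by old_genome_ontologies.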
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(
                    ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [
                    ontology_event_index
                ]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """
        Annotate the genome with new annotations for genome re-annotation
        :param annotation_args: genome_data from the genome obj, new_annotations from Prokka, and the output_genome_name
        :return: A tuple containing the genome_ref, filepaths for the function and ontology summaries, and stats about the annotations
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(
                genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {
            "current_functions": len(genome_data["data"]["features"]),
            "new_functions": 0,
            "found_functions": 0,
            "new_ontologies": 0
        }

        function_summary_fp = os.path.join(self.scratch, "function_report")
        ontology_summary_fp = os.path.join(self.scratch, "ontology_report")
        func_r = open(function_summary_fp, "w")
        onto_r = open(ontology_summary_fp, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        ontologies_present = {"SSO": {}}
        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function
                            and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i][
                        "function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [
                        new_function
                    ]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        # New style
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)

                        # Add to ontologies Present
                        for key in new_ontology.keys():
                            oid = new_ontology[key]["id"]
                            name = new_ontology[key].get("name", "Unknown")
                            ontologies_present["SSO"][oid] = name

                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)

            if current_function:
                func_r.write(
                    json.dumps([fid, [current_function], [new_function]]) +
                    "\n")
            else:
                func_r.write(
                    json.dumps([fid, current_functions, [new_function]]) +
                    "\n")

            onto_r.write(
                json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        if ontologies_present["SSO"]:
            if "ontologies_present" in genome_data["data"]:
                if "SSO" in genome_data["data"]["ontologies_present"]:
                    for key, value in ontologies_present["SSO"].items():
                        genome_data["data"]["ontologies_present"]["SSO"][
                            key] = value
                else:
                    genome_data["data"]["ontologies_present"][
                        "SSO"] = ontologies_present["SSO"]

            else:
                genome_data["data"]["ontologies_present"] = ontologies_present

        info = self.gfu.save_one_genome({
            "workspace":
            self.output_workspace,
            "name":
            annotation_args["output_genome_name"],
            "data":
            genome_data["data"],
            "provenance":
            self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        annotated_genome = namedtuple(
            "annotated_genome",
            "genome_ref function_summary_filepath ontology_summary_filepath stats"
        )

        return annotated_genome(genome_ref, function_summary_fp,
                                ontology_summary_fp, stats)

    def upload_file(self,
                    filepath,
                    message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path":
                                                       filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" +
              output_file_shock_id)
        return {
            "shock_id": output_file_shock_id,
            "name": os.path.basename(filepath),
            "label": os.path.basename(filepath),
            "description": message
        }

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [
            self.upload_file(genome.ontology_summary_filepath),
            self.upload_file(genome.function_summary_filepath)
        ]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n").format(
                              genome_ref, stats["current_functions"],
                              stats["new_functions"], stats["new_ontologies"])

        report_info = self.kbr.create_extended_report({
            "message":
            report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "file_links":
            file_links,
            "report_object_name":
            "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name":
            self.output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps are: download the assembly as a fasta file,
        rename the contigs, run Prokka against the renamed contigs, parse the results, and finally
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params,
                                                   "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref":
                                                         assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = 'Unknown'
        if 'scientific_name' in params and params['scientific_name']:
            scientific_name = params['scientific_name']
        domain = "Bacteria"
        if 'kingdom' in params and params['kingdom']:
            domain = params['kingdom']
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']

        genome = {
            "id": "Unknown",
            "features": annotated_assembly.features,
            "scientific_name": scientific_name,
            "domain": domain,
            "genetic_code": gcode,
            "assembly_ref": assembly_ref,
            "cdss": annotated_assembly.cdss,
            "mrnas": annotated_assembly.mrnas,
            "source": "PROKKA annotation pipeline",
            "gc_content": assembly_info.gc_content,
            "dna_size": assembly_info.dna_size,
            "reference_annotation": 0
        }

        info = self.gfu.save_one_genome({
            "workspace": output_workspace,
            "name": output_genome_name,
            "data": genome,
            "provenance": self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report({
            "message":
            report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "report_object_name":
            "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name":
            output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }
class MergeAnnotationsUtil:

    workdir = 'tmp/work/'
    staging_dir = "/staging/"
    datadir = "/kb/module/data/"

    def __init__(self, config):
        os.makedirs(self.workdir, exist_ok=True)
        self.config = config
        self.timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.ws_client = Workspace(config["workspace-url"])

        self.events = {}
        self.weights = {}
        self.genes = {}

    def get_ontology_events(self, params):
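        """Collect the genome's ontology events into self.events (all of them
        when no annotations_to_merge are given, each with weight 1; otherwise
        only the user-selected sources, with their user-supplied weights)."""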
        if 'ontology_events' in self.genome:

            for event, ontology in enumerate(self.genome['ontology_events']):

                # fix some legacy problems
                if 'description' not in ontology:
                    ontology['description'] = ontology['method']
                ontology["id"] = mu.legacy_fix(ontology["id"])

                if len(params['annotations_to_merge']) == 0:
                    self.weights[event] = 1
                    self.events[event] = dict(ontology)

                else:
                    for annotations_to_merge in params['annotations_to_merge']:
                        if ontology['description'] in annotations_to_merge[
                                'annotation_source']:
                            self.events[event] = dict(ontology)
                            self.weights[event] = annotations_to_merge[
                                'annotation_weight']

        else:
            logging.info("No ontology events in this genome!")

    def merge_annotations(self):
        merged_annotations = {}
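        # merged_annotations maps gene_id -> rxn -> {'events': [event indices]},
        # e.g. (illustrative): {"b0001": {"rxn00148": {"events": [0, 2]}}}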

        # add gene id to summary
        for feature in self.genome['features']:
            gene_id = feature['id']
            merged_annotations[gene_id] = {}

            # get ontology term
            if "ontology_terms" in feature:
                for ontology_type in feature['ontology_terms']:
                    term_dict = feature['ontology_terms'][ontology_type]

                    # fix potential legacy problems after getting the feature
                    ontology_type = mu.legacy_fix(ontology_type)

                    for term in term_dict:
                        # logging.info(term)
                        # logging.info(mu.standardize_annotation(term, type))

                        for ontology_event in term_dict[term]:

                            # is this ontology event in the user-selected list?
                            if ontology_event in self.events:

                                rxn = "none"

                                # convert terms to rxns
                                standardized_term = mu.standardize_annotation(
                                    term, ontology_type)

                                if standardized_term in self.translations[
                                        ontology_type]:
                                    rxn = self.translations[ontology_type][
                                        standardized_term]

                                if rxn != "none":
                                    if rxn in merged_annotations[gene_id]:
                                        merged_annotations[gene_id][rxn][
                                            'events'].append(ontology_event)

                                        # clean up duplicates... eg old versions of prokka added many of the same reaction
                                        merged_annotations[gene_id][rxn][
                                            'events'] = list(
                                                set(merged_annotations[gene_id]
                                                    [rxn]['events']))
                                    else:
                                        merged_annotations[gene_id][rxn] = {
                                            'events': [ontology_event]
                                        }

        return merged_annotations

    def score_annotations(self, annotations, threshold, best_only):
        '''
        Scores each gene's candidate rxns by summing the weights of the
        ontology events supporting them, and returns a pandas dataframe
        suitable for the import annotations step.
        '''
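        # Scoring sketch (illustrative): with self.weights = {0: 1, 1: 0.5}, a
        # rxn supported by events [0, 1] gets score_total 1.5 and is kept when
        # 1.5 >= threshold (and, with best_only set, only if it ties the
        # gene's best score).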

        df = pd.DataFrame(columns=['gene', 'term', 'events', 'score'])

        for gene_id in annotations:

            # get total score of each rxn, save to 'score_total'
            for rxn in annotations[gene_id]:
                annotations[gene_id][rxn]['score_total'] = 0
                for ontology_event in annotations[gene_id][rxn]['events']:
                    annotations[gene_id][rxn]['score_total'] += self.weights[
                        ontology_event]

            # keep rxns meeting the threshold; when best_only is set, keep
            # only the gene's top-scoring rxn(s). (best_only is assumed to be
            # a truthy flag, e.g. 0/1 from the app UI.)
            scores = [
                annotations[gene_id][rxn]['score_total']
                for rxn in annotations[gene_id]
            ]
            best_score = max(scores) if scores else 0

            for rxn in annotations[gene_id]:
                score = annotations[gene_id][rxn]['score_total']
                if score >= threshold and (not best_only
                                           or score == best_score):
                    annotations[gene_id][rxn]['passed'] = 1
                    row = pd.Series(
                        data={
                            'gene': gene_id,
                            'term': rxn,
                            'events': annotations[gene_id][rxn]['events'],
                            'score': score
                        })
                    df = df.append(row, ignore_index=True)
                else:
                    annotations[gene_id][rxn]['passed'] = 0

        # with open(os.path.join(self.scratch, "scored.json"), 'w') as outfile:
        #     json.dump(annotations, outfile, indent=2)

        df.to_csv(os.path.join(self.scratch, "scored.txt"),
                  sep="\t",
                  index=False)

        return df

    def html_summary(self, params, summary):

        output_html_files = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'merge_annotations_summary.html')

        # make html
        table_lines = []
        table_lines.append('<h2>Merge Annotations</h2>')
        table_lines.append('<h3>Summary</h3>')
        table_lines.append(
            '<table cellspacing="0" cellpadding="3" border="1"><tr><th>EVENT</th><th>DESCRIPTION</th><th>TYPE</th><th>GENES</th><th>TERMS</th><th>RXNS</th></tr>'
        )
        for event in sorted(summary.keys()):
            # RAST/PROKKA don't have descriptions, but they have methods
            description = self.events[event].get('description',
                                                 self.events[event]['method'])
            event_type = self.events[event]['id']
            table_lines.append('<tr><td>' + str(event) + '</td><td>' +
                               description + '</td><td>' + event_type +
                               '</td><td>' +
                               str(len(summary[event]["genes"])) +
                               '</td><td>' +
                               str(len(summary[event]["terms"])) +
                               '</td><td>' + str(len(summary[event]["rxns"])) +
                               '</td></tr>')
        table_lines.append('</table>')

        # Write to file
        with open(result_file_path, 'w') as result_file:
            for line in table_lines:
                result_file.write(line + "\n")

        output_html_files.append({
            'path': output_directory,
            'name': os.path.basename(result_file_path),
            'description': 'Summary Report'
        })

        # finalize html reports
        report_params = {
            'message': '',
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'workspace_name': params['workspace_name'],
            'report_object_name': f'merge_annotations_{uuid.uuid4()}'
        }

        output = self.kbr.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}

    def generate_report(self, params, genome_ref):
        """
        Reads in the results from the summary method, and creates the html
        report.

        This is just a copy/paste of the report from the import app
        """

        summary = mu.summarize(params, self.genes)

        output_html_files = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'import_annotations_summary.html')

        # Build HTML tables for results
        table_lines = []
        table_lines.append('<h2>Import Annotations</h2>')
        table_lines.append('<h3>Summary</h3>')
        table_lines.append(
            '<table cellspacing="0" cellpadding="3" border="1"><tr><th>TYPE</th><th>VALID</th><th>INVALID</th></tr>'
        )
        table_lines.append('<tr><td>GENES</td><td>' +
                           str(len(summary['valid_genes'])) + '</td><td>' +
                           str(len(summary['invalid_genes'])) + '</td></tr>')
        table_lines.append('<tr><td>TERMS</td><td>' +
                           str(len(summary['valid_terms'])) + '</td><td>' +
                           str(len(summary['invalid_terms'])) + '</td></tr>')
        table_lines.append('</table>')

        if len(summary['invalid_genes']) > 0:
            table_lines.append('<h3>Invalid Genes</h3>')
            table_lines.append(
                '<i>These are locus_tags not identified in the genome object. Frequency shown in parentheses.</i><br><br>'
            )

            invalid_genes_count = dict(Counter(summary['invalid_genes']))

            for gene in sorted(invalid_genes_count.keys()):
                gene_count = gene + '\t(' + str(
                    invalid_genes_count[gene]) + ')'
                table_lines.append(gene_count + '<br>')

        if len(summary['invalid_terms']) > 0:
            table_lines.append('<h3>Invalid Terms</h3>')
            table_lines.append(
                '<i>These are ontology terms not found in the ontology dictionary. Frequency shown in parentheses.</i><br><br>'
            )

            invalid_terms_count = dict(Counter(summary['invalid_terms']))

            for term in sorted(invalid_terms_count.keys()):
                term_count = term + '\t(' + str(
                    invalid_terms_count[term]) + ')'
                table_lines.append(term_count + '<br>')

        # Write to file
        with open(result_file_path, 'w') as result_file:
            for line in table_lines:
                result_file.write(line + "\n")

        output_html_files.append({
            'path':
            output_directory,
            'name':
            os.path.basename(result_file_path),
            'description':
            'HTML report for import_annotations app'
        })

        report_params = {
            'message':
            '',
            'html_links':
            output_html_files,
            'direct_html_link_index':
            0,
            'objects_created': [{
                'ref':
                genome_ref,
                'description':
                'Genome with imported annotations'
            }],
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            f'import_annotations_{uuid.uuid4()}'
        }

        output = self.kbr.create_extended_report(report_params)

        return {
            'output_genome_ref': genome_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def run(self, ctx, params):
        params['ontology'] = 'MSRXN'  # just in case it doesn't get set

        self.genome = mu.get_genome(params['genome'], self.genome_api)

        self.get_ontology_events(params)
        self.translations = mu.get_translations(self.datadir)

        merged_annotations = self.merge_annotations()
        scored_annotations = self.score_annotations(
            merged_annotations, params['annotation_threshold'],
            params['keep_best_annotation_only'])

        ontology_dict = mu.get_ontology_dict('MSRXN', self.datadir,
                                             mu.ontology_lookup)

        # get list of uploaded annotation terms
        annotations = mu.get_annotations_file(params,
                                              self.staging_dir,
                                              pass_df=scored_annotations)
        self.genes = mu.annotations_to_genes(annotations, self.genes)

        self.genome = mu.add_ontology_event(self.genome, params,
                                            self.timestamp,
                                            "Merge Annotations")

        # fix missing descriptions (the event dicts are mutated in place)
        for ontology_event in self.genome['ontology_events']:
            if 'description' not in ontology_event:
                ontology_event['description'] = ontology_event['method']

        self.current_ontology_event = len(self.genome['ontology_events']) - 1

        # process
        for gene in self.genes:
            self.genes[gene].validate_gene_ID(self.genome)
            self.genes[gene].validate_annotation_ID(ontology_dict, 'MSRXN')

        self.genome = mu.update_genome(self.genome, 'MSRXN', self.genes,
                                       self.current_ontology_event)

        info = self.gfu.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['output_name'],
            'data': self.genome,
            'provenance': ctx.provenance()
        })['info']

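        # object_info tuple: index 6 = workspace id, 0 = object id, 4 = version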
        genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        logging.info("*** Genome ID: " + str(genome_ref))

        report = self.generate_report(params, genome_ref)

        return report

    def test_annotate_contigs(self):

        assembly_file_name = "small.fna"  # "AP009048.fna"
        assembly_test_file = os.path.join("/kb/module/test/data/", assembly_file_name)
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        shutil.copy(assembly_test_file, assembly_temp_file)
        assembly_name = "Assembly.1"
        au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"])
        assembly_ref = au.save_assembly_from_fasta({"file": {"path": assembly_temp_file},
                                                    "workspace_name": self.getWsName(),
                                                    "assembly_name": assembly_name})
        # Add a genome to the WS to test ref_paths
        genome_name = "Genome.1"
        genome = {"id": "Unknown", "features": [],
                  "scientific_name": "",
                  "domain": "", "genetic_code": 0,
                  "assembly_ref": assembly_ref,
                  "cdss": [], "mrnas": [],
                  "source": "Magic!",
                  "gc_content": 0, "dna_size": 0,
                  "reference_annotation": 0}
        prov = self.getContext().provenance()
        gfu = GenomeFileUtil(os.environ["SDK_CALLBACK_URL"])
        info = gfu.save_one_genome(
            {"workspace": self.getWsName(), "name": genome_name,
             "data": genome, "provenance": prov})["info"]
        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        result = self.getImpl().annotate(self.getContext(),
                                         {"object_ref": "{};{}".format(genome_ref, assembly_ref),
                                          "output_workspace": self.getWsName(),
                                          "output_genome_name": genome_name,
                                          "evalue": None,
                                          "fast": 0,
                                          "gcode": 0,
                                          "genus": "genus",
                                          "kingdom": "Bacteria",
                                          "metagenome": 0,
                                          "mincontiglen": 1,
                                          "norrna": 0,
                                          "notrna": 0,
                                          "rawproduct": 0,
                                          "rfam": 1,
                                          "scientific_name": "Super : diper - name;"
                                          })[0]
        rep = self.getWsClient().get_objects([{"ref": result["report_ref"]}])[0]["data"]
        self.assertTrue("text_message" in rep)
        print("Report:\n" + str(rep["text_message"]))
        genome_ref = self.getWsName() + "/" + genome_name
        genome = self.getWsClient().get_objects([{"ref": genome_ref}])[0]["data"]
        features_to_work = {}
        for feature in genome["features"]:
            features_to_work[feature["id"]] = feature["location"]
        aseq = AssemblySequenceAPI(os.environ["SDK_CALLBACK_URL"], token=self.getContext()["token"])
        dna_sequences = aseq.get_dna_sequences({"requested_features": features_to_work,
                                                "assembly_ref": genome["assembly_ref"]})[
            "dna_sequences"]
        bad_dnas = 0
        for feature in genome["features"]:
            if feature["dna_sequence"] != dna_sequences[feature["id"]]:
                bad_dnas += 1
        self.assertEqual(bad_dnas, 0)

    def test_reannotate_genome(self):
        """
        DOESN'T WORK ON CI WITH THIS DATASET, ONLY ON APPDEV; THIS TEST IS COMMENTED OUT.
        This test uploads the genome.json object, replaces its features with two target features, and runs Prokka against them.
        The test then checks each feature to see that it has been updated.
        This test might break if Prokka decides that there is a better function name for a feature.
        :return:
        """
        gfu = GenomeFileUtil(os.environ["SDK_CALLBACK_URL"])

        genome_test_file = os.path.join("/kb/module/test/data/", "RHODO.json")
        genome_test_feature1_file = os.path.join("/kb/module/test/data/", "rsp_0986.json")
        genome_test_feature2_file = os.path.join("/kb/module/test/data/", "rsp_1428.json")
        genome_name = "RhodoBacter2.4"

        assembly_ref = self.getBogusAssembly()

        with open(genome_test_file, "r") as f:
            genome = json.load(f)
            genome["assembly_ref"] = assembly_ref

        # New function found by prokka
        with open(genome_test_feature1_file, "r") as f:
            target_feature_rsp_0986 = json.load(f)

        # Hypothetical Protein as determined by prokka
        with open(genome_test_feature2_file, "r") as f:
            target_feature_rsp_1428 = json.load(f)

        genome["features"] = [target_feature_rsp_0986, target_feature_rsp_1428]

        info = gfu.save_one_genome(
            {"workspace": self.getWsName(), "name": genome_name,
             "data": genome})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        result = self.getImpl().annotate(self.getContext(),
                                         {"object_ref": genome_ref,
                                          "output_workspace": self.getWsName(),
                                          "output_genome_name": genome_name,
                                          "evalue": None,
                                          "fast": 0,
                                          "gcode": 0,
                                          "genus": "genus",
                                          "kingdom": "Bacteria",
                                          "metagenome": 0,
                                          "mincontiglen": 1,
                                          "norrna": 0,
                                          "notrna": 0,
                                          "rawproduct": 0,
                                          "rfam": 1,
                                          "scientific_name": "RhodoBacter"
                                          })[0]

        genome_ref = self.getWsName() + "/" + genome_name
        re_annotated_genome = self.getWsClient().get_objects([{"ref": genome_ref}])[0]["data"]

        old_feature = genome["features"][0]
        new_feature = re_annotated_genome["features"][0]

        # TEST NEW PROTEIN FUNCTION
        self.assertNotEqual(old_feature, new_feature)
        self.assertEqual(old_feature["function"], "fructokinase")
        self.assertEqual(new_feature["function"], "Pantothenate kinase")

        old_feature = genome["features"][1]
        new_feature = re_annotated_genome["features"][1]

        # TEST HYPOTHETICAL PROTEIN
        self.assertEqual(old_feature["function"], "putative Pre (Mob) type recombination enzyme")
        self.assertEqual(old_feature["function"], new_feature["function"])
Example #6
    def run_kb_dram_annotate(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_dram_annotate
        # validate inputs
        if not isinstance(params['assembly_input_ref'], str) or not len(
                params['assembly_input_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        if not isinstance(params['desc'], str) or not len(params['desc']):
            raise ValueError('Pass in a valid genomeSet description')
        if not isinstance(params['output_name'], str) or not len(
                params['output_name']):
            raise ValueError('Pass in a valid genomeSet output name')
        if not isinstance(params['min_contig_size'],
                          int) or (params['min_contig_size'] < 0):
            raise ValueError('Min contig size must be a non-negative integer')

        # setup params
        with open("/kb/module/kbase.yml", 'r') as stream:
            data_loaded = yaml.safe_load(stream)
        version = str(data_loaded['module-version'])

        is_metagenome = params['is_metagenome']
        min_contig_size = params['min_contig_size']
        trans_table = str(params['trans_table'])
        bitscore = params['bitscore']
        rbh_bitscore = params['rbh_bitscore']
        output_dir = os.path.join(self.shared_folder, 'DRAM_annos')
        output_objects = []

        # create Util objects
        wsClient = workspaceService(self.workspaceURL, token=ctx['token'])
        assembly_util = AssemblyUtil(self.callback_url)
        genome_util = GenomeFileUtil(self.callback_url)

        # set DRAM database locations
        print('DRAM version: %s' % dram_version)
        import_config('/data/DRAM_databases/CONFIG')
        # This is a hack to get around a bug in my database setup
        set_database_paths(
            description_db_loc='/data/DRAM_databases/description_db.sqlite')
        print_database_locations()

        # get files
        assemblies = assembly_util.get_fastas(
            {'ref_lst': [params['assembly_input_ref']]})
        # would paths ever have more than one thing?
        fasta_locs = [
            assembly_data['paths'][0]
            for assembly_ref, assembly_data in assemblies.items()
        ]
        # get assembly refs from dram assigned genome names
        assembly_ref_dict = {
            os.path.splitext(
                os.path.basename(
                    remove_suffix(assembly_data['paths'][0], '.gz')))[0]:
            assembly_ref
            for assembly_ref, assembly_data in assemblies.items()
        }

        # annotate and distill with DRAM
        annotate_bins(fasta_locs,
                      output_dir,
                      min_contig_size,
                      trans_table=trans_table,
                      bit_score_threshold=bitscore,
                      rbh_bit_score_threshold=rbh_bitscore,
                      low_mem_mode=True,
                      rename_bins=False,
                      keep_tmp_dir=False,
                      threads=THREADS,
                      verbose=False)
        output_files = get_annotation_files(output_dir)
        distill_output_dir = os.path.join(output_dir, 'distilled')
        summarize_genomes(output_files['annotations']['path'],
                          output_files['trnas']['path'],
                          output_files['rrnas']['path'],
                          output_dir=distill_output_dir,
                          groupby_column='fasta')
        output_files = get_distill_files(distill_output_dir, output_files)

        if is_metagenome:
            pass  # TODO: make annotated metagenome object
        else:
            # generate genome files
            annotations = pd.read_csv(output_files['annotations']['path'],
                                      sep='\t',
                                      index_col=0)
            genome_objects = generate_genomes(
                annotations, output_files['genes_fna']['path'],
                output_files['genes_faa']['path'], assembly_ref_dict,
                assemblies, params["workspace_name"], ctx.provenance())

            genome_ref_dict = dict()
            genome_set_elements = dict()
            for genome_object in genome_objects:
                info = genome_util.save_one_genome(genome_object)["info"]
                genome_ref = '%s/%s/%s' % (info[6], info[0], info[4])
                genome_set_elements[genome_object["name"]] = {
                    'ref': genome_ref
                }
                output_objects.append({
                    "ref": genome_ref,
                    "description": 'Annotated Genome'
                })
                genome_ref_dict[genome_object["name"]] = genome_ref

            # add ontology terms
            anno_api = annotation_ontology_api(service_ver="beta")
            ontology_events = add_ontology_terms(annotations, params['desc'],
                                                 version,
                                                 params['workspace_name'],
                                                 self.workspaceURL,
                                                 genome_ref_dict)
            for ontology_event in ontology_events:
                anno_api.add_annotation_ontology_events(ontology_event)

            # make genome set
            # TODO: only make genome set if there is more than one genome
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            else:
                provenance = [{}]
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = list(genome_ref_dict.values())
            provenance[0]['service'] = 'kb_SetUtilities'
            provenance[0]['method'] = 'KButil_Batch_Create_GenomeSet'
            output_genomeSet_obj = {
                'description': params['desc'],
                'elements': genome_set_elements
            }
            output_genomeSet_name = params['output_name']
            new_obj_info = wsClient.save_objects({
                'workspace':
                params['workspace_name'],
                'objects': [{
                    'type': 'KBaseSearch.GenomeSet',
                    'data': output_genomeSet_obj,
                    'name': output_genomeSet_name,
                    'meta': {},
                    'provenance': provenance
                }]
            })[0]
            genome_set_ref = '%s/%s/%s' % (new_obj_info[6], new_obj_info[0],
                                           new_obj_info[4])
            output_objects.append({
                "ref": genome_set_ref,
                "description": params['desc']
            })

        # generate report
        product_html_loc = os.path.join(distill_output_dir, 'product.html')
        report = generate_product_report(self.callback_url,
                                         params['workspace_name'], output_dir,
                                         product_html_loc, output_files,
                                         output_objects)
        output = {
            'report_name': report['name'],
            'report_ref': report['ref'],
        }
        #END run_kb_dram_annotate

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_dram_annotate return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
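
# A minimal helper sketch (hypothetical; not part of any example above): the
# 'wsid/objid/version' reference string built from the workspace object_info
# tuple recurs throughout these examples and could be factored out like so.
def obj_info_to_ref(info):
    # index 6 = workspace id, index 0 = object id, index 4 = version --
    # the same indices the examples use when building genome references
    return '{}/{}/{}'.format(info[6], info[0], info[4])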
Example #7
class kb_orthofinder:
    '''
    Module Name:
    kb_orthofinder

    Module Description:
    A KBase module: kb_orthofinder
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.2"
    GIT_URL = "[email protected]:kbaseapps/kb_orthofinder.git"
    GIT_COMMIT_HASH = "4210203500471209c25c62b544dc0077da862142"

    #BEGIN_CLASS_HEADER

    def generate_figure(self, params):

        Data = dict()
        Reference_Results_File = "Reference_Phytozome_Threshold.txt"
        with open(os.path.join(
                "/kb/module/data",
                Reference_Results_File)) as reference_results_handle:
            for line in reference_results_handle.readlines():
                line = line.strip()
                (x, group, y) = line.split('\t')
                y = float(y)
                x = float(x)

                if (group not in Data):
                    Data[group] = list()

                Data[group].append((x, y))
        fig_gen = GenerateFigure(Data)

        uuid_string = str(uuid.uuid4())
        figure_data_file_path = os.path.join(self.scratch, uuid_string)
        os.mkdir(figure_data_file_path)
        fig_gen.generate_figure(figure_path=figure_data_file_path,
                                data_point=params)
        return figure_data_file_path

    def log(self, message, prefix_newline=False):
        time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time()))
        print(('\n' if prefix_newline else '') + time_str + ': ' + message)

    def compute_clusters(self, cluster):
        features = sorted(cluster.keys())
        clustered_sequences = dict()
        for i in range(len(features) - 1):
            for j in range(i + 1, len(features)):

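                # Derive each feature's species prefix: ids are either
                # "Species||feature" or "Species_feature"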
                Spp1 = features[i].split("||")[0]
                Spp2 = features[j].split("||")[0]
                if (Spp1 == features[i]):
                    Spp1 = "_".join(features[i].split("_")[0:-1])
                if (Spp2 == features[j]):
                    Spp2 = "_".join(features[j].split("_")[0:-1])

                if (Spp1 == Spp2):
                    continue

                Seq1 = cluster[features[i]]
                Seq2 = cluster[features[j]]

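                # Count identical aligned residues, skipping gap positions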
                AA_Match = 0.0
                for k in range(len(Seq1)):
                    if (Seq1[k] == "-" or Seq2[k] == "-"):
                        continue

                    if (Seq1[k] == Seq2[k]):
                        AA_Match += 1.0

                Seq1 = Seq1.replace('-', '')
                Seq2 = Seq2.replace('-', '')

                ID1 = AA_Match / len(Seq1)
                ID2 = AA_Match / len(Seq2)
                Avg = "{0:.6f}".format((ID1 + ID2) / 2.0)
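                # e.g. Seq1 = "AC-GT", Seq2 = "ACAGT": 4 matching positions,
                # ungapped lengths 4 and 5 -> ID1 = 1.00, ID2 = 0.80, Avg = "0.900000"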

                if ("Athaliana" in Spp1):
                    if (features[j] not in clustered_sequences):
                        clustered_sequences[features[j]] = dict()
                    clustered_sequences[features[j]][features[i]] = Avg
                elif ("Athaliana" in Spp2):
                    if (features[i] not in clustered_sequences):
                        clustered_sequences[features[i]] = dict()
                    clustered_sequences[features[i]][features[j]] = Avg

        return clustered_sequences

    def propagate_annotation(self, cluster, threshold, plantseed_curation):

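        # Invert the cluster mapping: key = sequence-identity string,
        # value = dict of orthologs observed at exactly that identity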
        top_orthologs = dict()
        for ortholog in (cluster.keys()):

            if (cluster[ortholog] not in top_orthologs):
                top_orthologs[cluster[ortholog]] = dict()
            top_orthologs[cluster[ortholog]][ortholog] = 1

        top_ortholog_seqid = "0.00"
        for seq_id in (top_orthologs.keys()):
            if (float(seq_id) > float(top_ortholog_seqid)):
                top_ortholog_seqid = seq_id

        #Key to controlling the propagation of annotation
        #Is the sequence identity of the highest-scoring ortholog good enough?
        if (float(top_ortholog_seqid) < threshold):
            return (None, "Unannotated 1: LESS_THAN_THRESHOLD", 0.0)

        top_ortholog = ""
        if (len(top_orthologs[top_ortholog_seqid]) == 1):
            top_ortholog = list(top_orthologs[top_ortholog_seqid].keys())[0]
        else:
            #There are multiple Arabidopsis orthologs that have the same level of seq. id
            #Collect the actual functions and see if there's multiple functions

            Multi_Functions = dict()
            Unannotated = False
            for ortholog in (top_orthologs[top_ortholog_seqid].keys()):
                function = "Unannotated"
                ortholog_gene = ".".join(ortholog.split('.')[0:-1])
                if (ortholog_gene not in plantseed_curation):
                    function = "Unannotated 2: ARA_GENE_NOT_IN_CURATION"
                    Unannotated = True
                else:
                    function = plantseed_curation[ortholog_gene]['function']
                Multi_Functions[function] = 1

            #Rules are:
            # 0) if 1 function, arbitrarily pick ath ortholog
            # 1) if 2 functions, and one is "Unannotated",
            #    prioritize annotated function
            # 2) if 2 functions and one is a subset of the other (i.e. compartmentalization)
            #    arbitrarily pick ath ortholog
            # 3) if 2 functions, and none is "Unannotated" or > 2 functions,
            #    its ambiguous, return without doing anything
            is_ambiguous = False
            if (len(Multi_Functions.keys()) == 1):
                top_ortholog = list(
                    top_orthologs[top_ortholog_seqid].keys())[0]
            elif (len(Multi_Functions.keys()) == 2):
                if (Unannotated is True):
                    for ortholog in (top_orthologs[top_ortholog_seqid].keys()):
                        ortholog_gene = ".".join(ortholog.split('.')[0:-1])
                        if (ortholog_gene not in plantseed_curation):
                            continue
                        top_ortholog = ortholog
                        break
                else:
                    functions_list = sorted(list(Multi_Functions.keys()))
                    if( functions_list[0] in functions_list[1] or \
                            functions_list[1] in functions_list[0] ):
                        top_ortholog = list(
                            top_orthologs[top_ortholog_seqid].keys())[0]
                    else:
                        is_ambiguous = True
            else:
                is_ambiguous = True

            if (is_ambiguous is True):
                #Ambiguously curated top orthologs, so pass
                print("Ambiguous Functions: ",
                      "||".join(list(Multi_Functions.keys())))
                return (None, "Unannotated 3: AMB_HIT", 0.0)

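        # strip the transcript suffix, e.g. "AT1G01010.1" -> gene "AT1G01010"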
        top_ortholog_gene = ".".join(top_ortholog.split('.')[0:-1])
        if (top_ortholog_gene in plantseed_curation):
            return (top_ortholog,
                    plantseed_curation[top_ortholog_gene]['function'],
                    top_ortholog_seqid)
        else:
            return (None, "Unannotated 4: NO_ARA_HIT", 0.0)

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']

        self.testing = False
        if (config['testing'] == '1'):
            self.testing = True

        self.runOrthoFinder = True
        if (config['run_orthofinder'] == '0'):
            self.runOrthoFinder = False

        self.token = os.environ['KB_AUTH_TOKEN']
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def annotate_plant_transcripts(self, ctx, input):
        """
        :param input: instance of type "AnnotatePlantTranscriptsParams" ->
           structure: parameter "threshold" of Double, parameter "input_ws"
           of String, parameter "input_genome" of String, parameter
           "output_genome" of String
        :returns: instance of type "AnnotatePlantTranscriptsResults" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_plant_transcripts

        output = dict()

        # Retrieve plant genome
        self.log("Fetching plant genome: " + input['input_ws'] + '/' +
                 input['input_genome'])
        plant_genome = self.dfu.get_objects(
            {'object_refs':
             [input['input_ws'] + '/' + input['input_genome']]})['data'][0]

        #Need to extract longest CDS, but only if CDSs available
        use_cds = 1
        if (len(plant_genome['data']['cdss']) == 0):
            use_cds = 0
            if (len(plant_genome['data']['features']) == 0):
                raise Exception(
                    "The genome does not contain any CDSs or features!")

        #Now need to be able to retrieve cds entity
        child_cds_index = dict()
        if (use_cds == 1):
            child_cds_index = dict([
                (f['id'], i)
                for i, f in enumerate(plant_genome['data']['cdss'])
            ])

        # If use_cds==1 iterate through features, iterate through CDSs, find longest sequence, use parent mRNA ID
        # If use_cds==0 use protein_translation field if available for feature, and feature ID
        self.log("Collecting protein sequences")
        sequences_dict = dict()
        for ftr in plant_genome['data']['features']:
            if (use_cds == 0 and len(ftr['protein_translation']) > 0):
                sequences_dict[ftr['id']] = ftr['protein_translation']
            if (use_cds == 1 and 'cdss' in ftr):
                longest_sequence = ""
                longest_sequence_id = ""
                for cds_id in ftr['cdss']:
                    sequence = plant_genome['data']['cdss'][
                        child_cds_index[cds_id]]['protein_translation']
                    if (len(sequence) > len(longest_sequence)):
                        longest_sequence = sequence
                        longest_sequence_id = cds_id
                if (len(longest_sequence) > 0):
                    sequences_dict[longest_sequence_id] = longest_sequence

        if (len(sequences_dict) == 0):
            raise Exception(
                "The genome does not contain any protein sequences!")

        output['ftrs'] = len(sequences_dict.keys())

        #Create directory for storing new fasta file
        uuid_string = str(uuid.uuid4())
        fasta_file_path = os.path.join(self.scratch, uuid_string)
        os.mkdir(fasta_file_path)

        #Reference data is considered immutable but each run modifies results within the directory
        #So here, we copy the reference data directory into scratch
        #The first if condition is for testing purposes

        uuid_string = str(uuid.uuid4())
        family_file_path = os.path.join(self.scratch, uuid_string,
                                        "Reference_Results")

        if (self.testing is True):
            if ('families_path' in input
                    and os.path.isdir(input['families_path'])):
                family_file_path = input['families_path']
            self.log("Testing Reference Families at " + family_file_path)
        else:
            self.log("Copying Reference Families to " + family_file_path)
            shutil.copytree("/data/OrthoFinder_Phytozome_Reference",
                            family_file_path)

        #The fasta file must have a random name to avoid _any_ clashes
        #This will need to be replaced in the newick file
        temp_genome_name = str(uuid.uuid4())
        protein_fasta_file = os.path.join(fasta_file_path,
                                          temp_genome_name + ".fa")
        self.log("Printing protein sequences to file: " + protein_fasta_file)

        testing_count = 200
        with open(protein_fasta_file, 'w') as fasta_handle:
            #Code adapted from https://github.com/biopython/biopython/blob/master/Bio/SeqIO/FastaIO.py
            for seq_id in sequences_dict:
                #printing smaller set for testing purposes
                if (self.testing is True):
                    testing_count = testing_count - 1
                fasta_handle.write(">" + seq_id + "\n")
                for i in range(0, len(sequences_dict[seq_id]), 80):
                    fasta_handle.write(sequences_dict[seq_id][i:i + 80] + "\n")
                if (testing_count == 0):
                    break

        #Building command
        command = "/kb/deployment/bin/orthofinder/orthofinder.py "
        #Software
        command += "-S diamond -M msa -A mafft -T fasttree "
        # No. of Threads
        command += "-t 8 -a 8 "
        # Avoid adding Species ID to Sequence IDs
        command += "-X "
        #For halting after alignments
        command += "-oa "
        #Input genome
        command += "-f " + fasta_file_path + " "
        #Reference families
        command += "-b " + family_file_path

        #####################################################
        output_files = list()
        if (self.testing is False or self.runOrthoFinder is True):
            self.log("Running OrthoFinder command: " + command)

            pipe = subprocess.Popen(command,
                                    stdout=subprocess.PIPE,
                                    shell=True)

            OrthoFinder_output_file = 'OrthoFinder_Output.txt'
            of_fh = open(
                os.path.join(family_file_path, OrthoFinder_output_file), 'wb')

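            # stream OrthoFinder stdout to both the console and a log file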
            while pipe.poll() is None:
                stdout_line = pipe.stdout.readline()
                print(stdout_line.rstrip(), flush=True)
                of_fh.write(stdout_line)
            # Capture last piece of text if any
            stdout_line = pipe.stdout.read()
            print(stdout_line.rstrip(), flush=True)
            of_fh.write(stdout_line)
            of_fh.close()

            output_files.append({
                'path':
                os.path.join(family_file_path, OrthoFinder_output_file),
                'name':
                OrthoFinder_output_file,
                'label':
                "OrthoFinder Output",
                'description':
                'Output text generated by OrthoFinder'
            })

        Ignored_Curation = dict()
        Ignored_File = "Ignored_Roles.txt"
        with open(os.path.join("/kb/module/data",
                               Ignored_File)) as ignored_handle:
            for line in ignored_handle:
                line = line.strip()
                Ignored_Curation[line] = 1

        # Fetch and Parse PlantSEED families and annotation
        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        plantseed_curation = plantseed.fetch_features(PS_Roles)

        self.log("Collecting PlantSEED Curated Functions")
        PlantSEED_Roles = dict()
        output['fns'] = list()
        for feature in plantseed_curation:
            function = plantseed_curation[feature]['function']
            if (function not in output['fns']):
                output['fns'].append(function)

            for role in plantseed_curation[feature]['roles']:
                if (role not in Ignored_Curation):
                    PlantSEED_Roles[role] = 1

        output['fns'] = len(output['fns'])
        output['transcripts'] = len(list(plantseed_curation.keys()))
        output['alignments'] = list()

        #Find, read alignments, collect families
        families_dict = dict()
        search_path = os.path.join(family_file_path, "OrthoFinder",
                                   "Results_*", "MultipleSequenceAlignments",
                                   "OG*.fa")
        self.log("Searching for MSAs in " + search_path)
        for file in glob.glob(search_path):
            array = file.split('/')
            family = array[-1].replace(".fa", "")
            alignment_handle = open(file, 'r')

            family_sequences_dict = dict()
            curated_family = list()
            # alternate header and sequence
            faiter = (x[1] for x in itertools.groupby(
                alignment_handle, lambda line: line[0] == ">"))
            for header in faiter:
                # drop the ">"
                header = header.__next__()[1:].strip()
                # join all sequence lines to one.
                seq = "".join(s.strip() for s in faiter.__next__())

                try:
                    transcript_id, transcript_description = header.split(
                        ' ', 1)
                except ValueError:
                    transcript_id = header
                    transcript_description = None

                #skip proteins that are neither Arabidopsis references nor in the query genome, to reduce computation time
                if ("Athaliana" not in transcript_id
                        and transcript_id not in sequences_dict):
                    continue

                seq = seq.upper()
                family_sequences_dict[transcript_id] = seq
                gene_id = ".".join(transcript_id.split('.')[0:-1])
                if (gene_id in plantseed_curation):
                    function = plantseed_curation[gene_id]['function']
                    curated_family.append(function + "|||" + transcript_id)
                    output['alignments'].append(transcript_id)

            # close the alignment file now that the iterator is exhausted
            alignment_handle.close()

            if (len(curated_family) > 0):
                if (family not in families_dict):
                    families_dict[family] = dict()
                families_dict[family]['sequences'] = family_sequences_dict
                families_dict[family]['functions'] = curated_family

        output['alignments'] = len(output['alignments'])

        #Iterate through collected families and compute
        #Pairwise sequence identity and propagate annotation
        functions_dict = dict()
        found_annotations = list()
        annotated_features_dict = dict()
        self.log("Computing Sequence Identity on " +
                 str(len(families_dict.keys())) + " Curated Alignments")

        # Save comprehensive output
        annotate_fh = open("/kb/module/work/tmp/annotation_results.txt", "w")
        for family in families_dict.keys():

            pw_seq_id_list = self.compute_clusters(
                families_dict[family]['sequences'])

            for function_ortholog in families_dict[family]['functions']:
                (function, ortholog) = function_ortholog.split("|||")
                if (function not in functions_dict):
                    functions_dict[function] = dict()
                if (family not in functions_dict[function]):
                    functions_dict[function][family] = {
                        'orthologs': [],
                        'hits': [],
                        'cluster': pw_seq_id_list
                    }

                if (ortholog
                        not in functions_dict[function][family]['orthologs']):
                    functions_dict[function][family]['orthologs'].append(
                        ortholog)

            for spp_ftr in sorted(pw_seq_id_list.keys()):
                (ortholog, function,
                 seqid) = self.propagate_annotation(pw_seq_id_list[spp_ftr],
                                                    input['threshold'],
                                                    plantseed_curation)
                ftr = spp_ftr.replace(temp_genome_name + "_", "")

                # Write out results
                if (function not in functions_dict):
                    annotate_fh.write(
                        "MISSING FUNCTION: " +
                        " | ".join([function, family,
                                    str(ortholog), spp_ftr]) + "\n")
                else:
                    annotate_fh.write(
                        "FOUND FUNCTION: " +
                        " | ".join([function, family,
                                    str(ortholog), spp_ftr]) + "\n")
                    if (family not in functions_dict[function]):
                        annotate_fh.write("MISSING FAMILY: " + " | ".join(
                            [function, family,
                             str(ortholog), spp_ftr]) + "\n")
                    else:
                        annotate_fh.write("FOUND FAMILY: " + " | ".join(
                            [function, family,
                             str(ortholog), spp_ftr]) + "\n")
                        if (ortholog not in functions_dict[function][family]
                            ['orthologs']):
                            annotate_fh.write(
                                "MISSING ORTHOLOG: " + " | ".join(
                                    [function, family,
                                     str(ortholog), spp_ftr]) + "\n")
                        else:
                            annotate_fh.write("FOUND ORTHOLOG: " + " | ".join(
                                [function, family,
                                 str(ortholog), spp_ftr]) + "\n")

                # Save result
                if ("Unannotated" in function):
                    function = "Unannotated"

                annotated_features_dict[ftr] = function

                if (function not in found_annotations
                        and "Unannotated" not in function):
                    found_annotations.append(function)

                if(function in functions_dict and \
                       family in functions_dict[function] and \
                       ortholog in functions_dict[function][family]['orthologs']):
                    functions_dict[function][family]['hits'].append({
                        'seqid':
                        seqid,
                        'feature':
                        spp_ftr,
                        'ortholog':
                        ortholog
                    })

        annotate_fh.close()

        with open("/kb/module/work/tmp/annotation_output.json", "w") as fh:
            json.dump(functions_dict, fh)

        output['hit_fns'] = len(found_annotations)
        output['hit_ftrs'] = len(annotated_features_dict.keys())

        #Now, re-populate feature functions, and save genome object
        #But, if annotating CDS, need to be able to retrieve parent feature/transcripts
        parent_feature_index = dict()
        parent_transcript_index = dict()
        if (use_cds == 1):
            parent_feature_index = dict([
                (f['id'], i)
                for i, f in enumerate(plant_genome['data']['features'])
            ])
            parent_transcript_index = dict([
                (f['id'], i)
                for i, f in enumerate(plant_genome['data']['mrnas'])
            ])

        self.log("Populating plant genome with newly clustered functions")
        # Add annotation to protein-coding genes
        # As the Phytozome genomes have CDSs, the features don't usually get annotated here
        for ftr in plant_genome['data']['features']:
            ftr['functions'] = ["Unannotated"]
            if (ftr['id'] in annotated_features_dict):
                ftr['functions'] = [annotated_features_dict[ftr['id']]]

            # It is possible that a gene is listed without an associated transcript
            if ('mrnas' in ftr):

                # Add annotation to transcripts
                # As the Phytozome genomes have CDSs, the features don't usually get annotated here
                for mrna in ftr['mrnas']:

                    # Retrieve mrna object
                    mrna_indice = parent_transcript_index[mrna]
                    mrna_obj = plant_genome['data']['mrnas'][mrna_indice]

                    # Annotate mRNA with feature annotation
                    mrna_obj['functions'] = [ftr['functions'][0]]

                    # If it happens that the mRNA is independently annotated
                    if (mrna in annotated_features_dict):
                        mrna_obj['functions'] = [annotated_features_dict[mrna]]

                        # Then annotate parent feature gene
                        ftr['functions'] = [annotated_features_dict[mrna]]

            # It is possible that a gene is listed without an associated protein
            if ('cdss' in ftr):

                # Add annotation to proteins
                # As the Phytozome genomes have CDSs, the features and mrnas get annotated here
                for cds in ftr['cdss']:

                    # Retrieve cds object
                    cds_indice = child_cds_index[cds]
                    cds_obj = plant_genome['data']['cdss'][cds_indice]

                    # Annotate CDS with feature annotation
                    cds_obj['functions'] = [ftr['functions'][0]]

                    # If it happens that the CDS is independently annotated
                    # Which is most likely event if using Phytozome genomes
                    if (cds in annotated_features_dict):
                        cds_obj['functions'] = [annotated_features_dict[cds]]

                        if ('parent_mrna' in cds_obj):
                            parent_transcript_indice = parent_transcript_index[
                                cds_obj['parent_mrna']]
                            parent_transcript_obj = plant_genome['data'][
                                'mrnas'][parent_transcript_indice]
                            parent_transcript_obj['functions'] = [
                                annotated_features_dict[cds]
                            ]
                        else:
                            self.log("WARNING: CDS " + cds +
                                     " missing parent_mrna")

                        # Then annotate parent feature gene
                        ftr['functions'] = [annotated_features_dict[cds]]

        #Save genome
        with open("/kb/module/work/tmp/annotated_genome.json", "w") as fh:
            json.dump(plant_genome, fh)

        if ('output_genome' not in input):
            input['output_genome'] = input['input_genome']

        saved_genome = ""
        if (self.testing is True):
            #wsid = self.dfu.ws_name_to_id(input['input_ws'])
            #save_result = self.dfu.save_objects({'id':wsid,'objects':[{'name':input['output_genome'],
            #                                                           'data':plant_genome['data'],
            #                                                           'type':'KBaseGenomes.Genome'}]})[0]
            pass
        else:
            save_result = self.gfu.save_one_genome({
                'workspace':
                input['input_ws'],
                'name':
                input['output_genome'],
                'data':
                plant_genome['data']
            })['info']

            #reference of saved genome
            saved_genome = "{}/{}/{}".format(save_result[6], save_result[0],
                                             save_result[4])

        Annotated_Roles = dict()
        for curation in found_annotations:
            Function_Comments = curation.split("#")
            for i in range(len(Function_Comments)):
                Function_Comments[i] = Function_Comments[i].strip()

            Function = Function_Comments.pop(0)
            Roles = re.split(r"\s*;\s+|\s+[@/]\s+", Function)
            for role in Roles:
                if (role in PlantSEED_Roles):
                    Annotated_Roles[role] = 1

        output['hit_fns'] = len(found_annotations)
        output['cur_roles'] = len(PlantSEED_Roles.keys())
        output['hit_roles'] = len(Annotated_Roles.keys())

        # Calculate the fraction of PlantSEED functional roles that were annotated
        fraction_plantseed = len(Annotated_Roles) / len(PlantSEED_Roles)

        # HTML Folder Path
        uuid_string = str(uuid.uuid4())
        html_file_path = os.path.join(self.scratch, uuid_string)
        os.mkdir(html_file_path)

        # Generate figure:
        #     the path parameter is for the reference data that is integrated into the figure.
        figure_generator = GenerateFigureImpl("/kb/module/data/")
        bokeh_figure = figure_generator.generate_figure(
            input['threshold'], fraction_plantseed)

        # Save figure
        figure_html_file = "figure.html"
        output_file(os.path.join(html_file_path, figure_html_file))
        save(bokeh_figure)

        # Generate table
        table_generator = GenerateTableImpl()
        annotation_table_string = table_generator.generate_table(
            functions_dict)

        # Save table
        # Read in template html
        with open(
                os.path.join('/kb/module/data', 'app_report_templates',
                             'annotation_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and Insert html title
        #     This needs to be done because it affects the name of the CSV download
        title_string = "-".join(
            [input['input_genome'],
             str(input['threshold'])])
        report_template_string = report_template_string.replace(
            '*TITLE*', title_string)

        # Insert table into template
        table_report_string = report_template_string.replace(
            '*TABLES*', annotation_table_string)

        # Save table
        table_html_file = "table.html"
        with open(os.path.join(html_file_path, table_html_file),
                  'w') as table_file:
            table_file.write(table_report_string)

        # Generate main index.html content
        html_string = "<html><head><title>KBase Plant OrthoFinder Report</title></head><body>"
        html_string += "<div style=\"text-align: center; max-width: 800px\">"
        html_string += "<p>The Plant OrthoFinder app has finished running: "
        html_string += str(
            output['ftrs']) + " protein sequences were clustered "
        html_string += "with " + str(
            output['transcripts']) + " PlantSEED-curated enzymes. "
        html_string += "The app was able to predict " + str(
            output['hit_fns']) + " enzymatic functions "
        html_string += "for " + str(
            output['hit_ftrs']) + " protein sequences. "
        html_string += "This result indicates that, for this set of protein sequences, "
        html_string += "the app detected {0:.0f}%".format(
            float(fraction_plantseed * 100.0))
        html_string += " of the enzymatic functions of plant primary metabolism that were "
        html_string += "curated as part of the PlantSEED project.</p>"
        html_string += "<p>The results of the annotation are tabulated in this "
        html_string += "<a href=\"" + table_html_file + "\" target=\"_blank\">Table</a></p></div>"

        caption = "<figcaption><b>Figure 1: Propagation of metabolic roles for "
        caption += str(input['input_genome']) + ". </b>"
        caption += "The PlantSEED project curated " + str(output['cur_roles'])
        caption += " distinct primary metabolic roles for Arabidopsis thaliana. "
        caption += "Here we show the impact of propagating these roles to other "
        caption += "species using sequence identity. "
        caption += "For each group of species, and for a different threshold of "
        caption += "sequence identity, we show the fraction of curated roles that were "
        caption += "propagated. The fraction of propagated roles decreases as a function "
        caption += "of similarity and phylogenetic distance. "
        caption += "The fraction of roles that were propagated for " + input[
            'input_genome']
        caption += " at the chosen threshold of "
        caption += str(
            input['threshold']) + " for sequence identity is {0:.2f}".format(
                float(fraction_plantseed))
        caption += " and is marked by the bold plus. "
        caption += "A user may re-run the app with a different threshold: a higher "
        caption += "threshold will increase the reliability of the results but "
        caption += "reduce the number of propagated metabolic roles, while a lower "
        caption += "threshold will propagate more metabolic roles at an increased "
        caption += "risk of false positives. This figure can be viewed in a separate window "
        caption += "<a href=\"" + figure_html_file + "\" target=\"_blank\">here</a></figcaption>"

        html_string += "<div style=\"text-align: center; max-width: 620px\">"
        html_string += "<embed type=\"text/html\" src=\"" + figure_html_file + "\" width=\"620\" height=\"620\"></embed>"
        html_string += caption
        html_string += "</div></body></html>"

        #Save index file
        with open(os.path.join(html_file_path, "index.html"),
                  'w') as index_file:
            index_file.write(html_string)

        upload_info = self.dfu.file_to_shock({
            'file_path': html_file_path,
            'pack': 'zip'
        })

        html_report_list = list()
        html_link = {
            'shock_id':
            upload_info['shock_id'],
            'name':
            figure_html_file,
            'label':
            'Figure generated by app',
            'description':
            'Figure generated by Annotate Plant Enzymes with OrthoFinder app'
        }
        html_report_list.append(html_link)

        html_link = {
            'shock_id':
            upload_info['shock_id'],
            'name':
            table_html_file,
            'label':
            'Table generated by app',
            'description':
            'Table generated by Annotate Plant Enzymes with OrthoFinder app'
        }
        html_report_list.append(html_link)

        description = "Plant genome " + plant_genome['data'][
            'id'] + " annotated with metabolic functions"

        uuid_string = str(uuid.uuid4())
        report_params = {
            'direct_html_link_index':
            0,  #Use to refer to index of 'html_links'
            'workspace_name': input['input_ws'],
            'report_object_name': 'kb_orthofinder_' + uuid_string,
            'file_links': output_files,
            'html_links': html_report_list
        }

        if (self.testing is False):
            report_params['objects_created'] = [{
                "ref": saved_genome,
                "description": description
            }]

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(
            report_params)
        output['report_name'] = report_client_output['name']
        output['report_ref'] = report_client_output['ref']

        #END annotate_plant_transcripts

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method annotate_plant_transcripts return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]