def test_kb_blast_BLASTp_Search_04_AnnotatedMetagenomeAssembly(self):
    """BLASTp_Search against a single AnnotatedMetagenomeAssembly target.

    Uploads a test AMA (FASTA contigs + GFF genes, 888 features) via
    GenomeFileUtil, runs BLASTp with a protein query (gene 5_267 from
    ama_test.AMA), then checks that the report records a created
    FeatureSet with the expected name and type.
    """
    obj_basename = 'BLASTp_AnnotatedMetagenomeAssembly'
    obj_out_name = obj_basename + ".test_output.FS"
    obj_out_type = "KBaseCollections.FeatureSet"

    # upload test AMA: stage the source files into the shared scratch dir
    ama_name = "ama_test.AMA"
    ama_contigs_file_src = "data/AnnotatedMetagenomeAssembly/ama_contigs.fasta"
    ama_genes_file_src = "data/AnnotatedMetagenomeAssembly/ama_genes.gff"
    shared_dir = "/kb/module/work/tmp"
    ama_contigs_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_contigs_file_src))
    ama_genes_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_genes_file_src))
    shutil.copy(ama_contigs_file_src, ama_contigs_file_upload)
    shutil.copy(ama_genes_file_src, ama_genes_file_upload)

    ama_upload_params = {
        "workspace_name": self.getWsName(),
        "genome_name": ama_name,
        "fasta_file": {"path": ama_contigs_file_upload},
        "gff_file": {"path": ama_genes_file_upload},
        "source": "GFF",
        "scientific_name": "TEST AMA",
        "generate_missing_genes": "True"
    }
    try:
        SERVICE_VER = 'dev'
        GFU = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=self.getContext()['token'],
                             service_ver=SERVICE_VER)
        print("UPLOADING AMA: " + ama_name + " to WORKSPACE " +
              self.getWsName() + " ...")
        ama_upload_result = GFU.fasta_gff_to_metagenome(ama_upload_params)
    except Exception as e:
        # was a bare "except:"; keep the ValueError callers expect, but
        # chain the underlying failure instead of discarding it
        raise ValueError("unable to upload test AMA data object") from e
    pprint(ama_upload_result)
    ama_ref_1 = ama_upload_result['metagenome_ref']

    # gene 5_267 from ama_test.AMA
    query_seq_prot = 'MDRDALTKLVTDLVSIPSVNPLEGPVGNGRGEAELAAFIHSRLTEAGVVCELKEALPGRPNIIARLPGQSEEMIWFDAHMDTVSGEGMAFPPFEPLIEGDRLLGRGSSDNKGSIATMMAALMEVAKSGERPPLTVVFTATADEEYMMRGMLSLFEAGLTAKAGIVAEPTALEIVIAHKGVARFKISTTGKAAHSSRPEEGVNAIYRMGKVLGAIEAYAKRGVGRETHPLLGKGTLSVGIIRGGEYVNVVPDQCEVDVDRRLLPGEDPRRAVSDVRDYLSNALQEEVGLKVSGPTLTVPGLAVSAESPLVQAVAAAVREVTGKAPLTGMQGATHAGQMAAVDIPALVFGPGQMGQAHTATEELDLTQLERAAAVYERLMRTGL'

    parameters = {
        'workspace_name': self.getWsName(),
        'input_one_sequence': query_seq_prot,
        'output_one_name': obj_basename + '.' + "test_query.SS",
        'input_many_ref': ama_ref_1,
        'output_filtered_name': obj_out_name,
        'e_value': ".001",
        'bitscore': "50",
        'ident_thresh': "40.0",
        'overlap_fraction': "50.0",
        'maxaccepts': "1000",
        'output_extra_format': "none"
    }

    ret = self.getImpl().BLASTp_Search(self.getContext(), parameters)[0]
    self.assertIsNotNone(ret['report_ref'])

    # check created obj: the report must list the output FeatureSet
    report_obj = self.getWsClient().get_objects(
        [{'ref': ret['report_ref']}])[0]['data']
    self.assertIsNotNone(report_obj['objects_created'][0]['ref'])
    created_obj_0_info = self.getWsClient().get_object_info_new(
        {'objects': [{'ref': report_obj['objects_created'][0]['ref']}]})[0]
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
     WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = list(range(11))  # object_info tuple
    self.assertEqual(created_obj_0_info[NAME_I], obj_out_name)
    self.assertEqual(created_obj_0_info[TYPE_I].split('-')[0], obj_out_type)
def getAMAInfo(self, ama_basename, item_i=0):
    """Get (and cache) object info for a test AnnotatedMetagenomeAssembly.

    Returns the cached workspace object_info tuple stored in class-level
    slot item_i if an AMA with the same basename was already uploaded;
    otherwise uploads data/amas/<ama_basename>.gff + .fa via GenomeFileUtil
    and caches the resulting object info on the class.

    :param ama_basename: basename of the GFF/FASTA fixture pair under data/amas/
    :param item_i: cache slot index (allows several cached AMAs per test class)
    :return: workspace object_info tuple for the uploaded AMA
    :raises ValueError: if the GenomeFileUtil client or the upload fails
    """
    # serve from the class-level cache when this slot already holds the AMA
    if hasattr(self.__class__, 'amaInfo_list'):
        try:
            info = self.__class__.amaInfo_list[item_i]
            name = self.__class__.amaName_list[item_i]
            if info is not None:
                if name != ama_basename:
                    # slot holds a different AMA: invalidate and re-upload
                    self.__class__.amaInfo_list[item_i] = None
                    self.__class__.amaName_list[item_i] = None
                else:
                    return info
        except IndexError:
            pass  # slot not allocated yet; fall through to upload

    # 1) transform GFF+FNA to kbase AMA object and upload to ws
    shared_dir = "/kb/module/work/tmp"
    ama_gff_srcfile = 'data/amas/' + ama_basename + '.gff'
    ama_fna_srcfile = 'data/amas/' + ama_basename + '.fa'
    ama_gff_dstfile = os.path.join(shared_dir, os.path.basename(ama_gff_srcfile))
    ama_fna_dstfile = os.path.join(shared_dir, os.path.basename(ama_fna_srcfile))
    shutil.copy(ama_gff_srcfile, ama_gff_dstfile)
    shutil.copy(ama_fna_srcfile, ama_fna_dstfile)

    try:
        SERVICE_VER = 'release'
        #SERVICE_VER = 'dev'
        GFU = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=self.getContext()['token'],
                             service_ver=SERVICE_VER)
    except Exception as e:
        # was a bare "except:"; chain the real cause for debuggability
        raise ValueError("unable to obtain GenomeFileUtil client") from e

    print("UPLOADING AMA: " + ama_basename + " to WORKSPACE " + self.getWsName() + " ...")
    ama_upload_params = {
        "workspace_name": self.getWsName(),
        "genome_name": ama_basename,
        "fasta_file": {"path": ama_fna_dstfile},
        "gff_file": {"path": ama_gff_dstfile},
        "source": "GFF",
        "scientific_name": "TEST AMA",
        "generate_missing_genes": "True"
    }
    try:
        ama_upload_result = GFU.fasta_gff_to_metagenome(ama_upload_params)
    except Exception as e:
        raise ValueError("unable to upload test AMA data object") from e
    print("AMA UPLOADED")
    pprint(ama_upload_result)

    ama_ref = ama_upload_result['metagenome_ref']
    new_obj_info = self.getWsClient().get_object_info_new(
        {'objects': [{'ref': ama_ref}]})[0]

    # 2) store it
    if not hasattr(self.__class__, 'amaInfo_list'):
        self.__class__.amaInfo_list = []
        self.__class__.amaName_list = []
    # grow both cache lists in lockstep so index item_i exists
    # (replaces the try/except-IndexError append loop)
    while len(self.__class__.amaInfo_list) < item_i + 1:
        self.__class__.amaInfo_list.append(None)
        self.__class__.amaName_list.append(None)
    self.__class__.amaInfo_list[item_i] = new_obj_info
    self.__class__.amaName_list[item_i] = ama_basename

    return new_obj_info
def test_fractiontate_contigs_ASSEMBLY_AMA_03(self):
    """Fractionate an Assembly using an AMA as the positive filter ('pos' mode)."""
    method = 'fractionate_contigs_pos_filter_ASSEMBLY_AMA_03'
    print("\n\nRUNNING: test_" + method + "()")
    print("==========================================================\n\n")

    # upload test data: need AssemblyUtil and GenomeFileUtil clients
    try:
        assembly_util = AssemblyUtil(self.callback_url,
                                     token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))
    try:
        gfu_client = GenomeFileUtil(self.callback_url,
                                    token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate gfuClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))

    base_1, type_1 = 'assembly_1plus2', 'Assembly'
    base_2, type_2 = 'assembly_2', 'AMA'
    fa_name_1 = base_1 + '.fa.gz'
    fa_name_2 = base_2 + '.fa.gz'
    gff_name_2 = base_2 + '.gff'

    # stage the fixture files from data/ into scratch
    staged = {}
    for fixture_name in (fa_name_1, fa_name_2, gff_name_2):
        dst_path = os.path.join(self.scratch, fixture_name)
        shutil.copy(os.path.join("data", fixture_name), dst_path)
        staged[fixture_name] = dst_path

    # the plain Assembly is the object to fractionate
    ass_ref_1 = assembly_util.save_assembly_from_fasta({
        'file': {'path': staged[fa_name_1]},
        'workspace_name': self.getWsName(),
        'assembly_name': base_1 + '.' + type_1
    })
    # the AMA acts as the positive-filter object
    ass_ref_2 = gfu_client.fasta_gff_to_metagenome({
        'fasta_file': {'path': staged[fa_name_2]},
        'gff_file': {'path': staged[gff_name_2]},
        'generate_missing_genes': 1,
        'source': 'GFF',
        'scientific_name': base_2,
        'workspace_name': self.getWsName(),
        'genome_name': base_2 + '.' + type_2
    }).get('metagenome_ref')

    # run method
    base_output_name = method + '_output'
    fractionate_mode = 'pos'
    output_name = ('test_fractionated' + '-' + base_1 + '.' + type_1 +
                   '-' + base_2 + '.' + type_2 + '-' + fractionate_mode)
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_ref': ass_ref_1,
        'input_pos_filter_obj_refs': [ass_ref_2],
        'fractionate_mode': fractionate_mode,
        'output_name': output_name
    }
    result = self.getImpl().run_fractionate_contigs(self.getContext(), params)
    print('RESULT:')
    pprint(result)
def test_kb_blast_BLASTp_Search_05_MultipleTargets(self):
    """BLASTp_Search against multiple targets: an AMA plus a GenomeSet.

    Builds a KBaseSearch.GenomeSet from three RefSeq genome refs, uploads
    a test AMA, runs BLASTp with both as targets, and checks that the
    report records one output FeatureSet per target, each named
    <target_name>-<obj_out_name>.
    """
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
     WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = list(range(11))  # object_info tuple

    obj_basename = 'BLASTp_MultipleTargets'
    obj_out_name = obj_basename + ".test_output.FS"
    obj_out_type = "KBaseCollections.FeatureSet"

    # RefSeq genomes in the ReferenceDataManager workspace (PROD and CI)
    genome_ref_1 = 'ReferenceDataManager/GCF_001566335.1/1'  # E. coli K-12 MG1655
    genome_ref_2 = 'ReferenceDataManager/GCF_002936495.2/1'  # E. coli
    genome_ref_3 = 'ReferenceDataManager/GCF_002936145.2/1'  # E. coli
    genome_ref_list = [genome_ref_1, genome_ref_2, genome_ref_3]
    genome_scinames = ['FOO', 'BAR', 'FOOBAR']

    # create GenomeSet
    testGS = {'description': 'three genomes', 'elements': dict()}
    for sciname, genome_ref in zip(genome_scinames, genome_ref_list):
        testGS['elements'][sciname] = {'ref': genome_ref}

    obj_info = self.getWsClient().save_objects({
        'workspace': self.getWsName(),
        'objects': [{
            'type': 'KBaseSearch.GenomeSet',
            'data': testGS,
            'name': 'test_genomeset',
            'meta': {},
            'provenance': [{
                'service': 'kb_blast',
                'method': 'BLASTp_Search'
            }]
        }]
    })[0]
    target_genomeSet_ref = "/".join([
        str(obj_info[WORKSPACE_I]),
        str(obj_info[OBJID_I]),
        str(obj_info[VERSION_I])
    ])

    # upload test AMA (FASTA contigs + GFF genes, 888 features)
    ama_name = "ama_test.AMA"
    ama_contigs_file_src = "data/AnnotatedMetagenomeAssembly/ama_contigs.fasta"
    ama_genes_file_src = "data/AnnotatedMetagenomeAssembly/ama_genes.gff"
    shared_dir = "/kb/module/work/tmp"
    ama_contigs_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_contigs_file_src))
    ama_genes_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_genes_file_src))
    shutil.copy(ama_contigs_file_src, ama_contigs_file_upload)
    shutil.copy(ama_genes_file_src, ama_genes_file_upload)

    ama_upload_params = {
        "workspace_name": self.getWsName(),
        "genome_name": ama_name,
        "fasta_file": {"path": ama_contigs_file_upload},
        "gff_file": {"path": ama_genes_file_upload},
        "source": "GFF",
        "scientific_name": "TEST AMA",
        "generate_missing_genes": "True"
    }
    try:
        SERVICE_VER = 'dev'
        GFU = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=self.getContext()['token'],
                             service_ver=SERVICE_VER)
        print("UPLOADING AMA: " + ama_name + " to WORKSPACE " +
              self.getWsName() + " ...")
        ama_upload_result = GFU.fasta_gff_to_metagenome(ama_upload_params)
    except Exception as e:
        # was a bare "except:"; chain the underlying failure
        raise ValueError("unable to upload test AMA data object") from e
    pprint(ama_upload_result)
    ama_ref_1 = ama_upload_result['metagenome_ref']

    # gene 5_267 from ama_test.AMA
    query_seq_prot = 'MDRDALTKLVTDLVSIPSVNPLEGPVGNGRGEAELAAFIHSRLTEAGVVCELKEALPGRPNIIARLPGQSEEMIWFDAHMDTVSGEGMAFPPFEPLIEGDRLLGRGSSDNKGSIATMMAALMEVAKSGERPPLTVVFTATADEEYMMRGMLSLFEAGLTAKAGIVAEPTALEIVIAHKGVARFKISTTGKAAHSSRPEEGVNAIYRMGKVLGAIEAYAKRGVGRETHPLLGKGTLSVGIIRGGEYVNVVPDQCEVDVDRRLLPGEDPRRAVSDVRDYLSNALQEEVGLKVSGPTLTVPGLAVSAESPLVQAVAAAVREVTGKAPLTGMQGATHAGQMAAVDIPALVFGPGQMGQAHTATEELDLTQLERAAAVYERLMRTGL'

    parameters = {
        'workspace_name': self.getWsName(),
        'input_one_sequence': query_seq_prot,
        'output_one_name': obj_basename + '.' + "test_query.SS",
        'input_many_refs': [ama_ref_1, target_genomeSet_ref],
        'output_filtered_name': obj_out_name,
        'genome_disp_name_config': 'obj_name_ver_sci_name',
        'e_value': ".001",
        'bitscore': "50",
        'ident_thresh': "10.0",
        'overlap_fraction': "25.0",
        'maxaccepts': "1000",
        'output_extra_format': "none"
    }

    ret = self.getImpl().BLASTp_Search(self.getContext(), parameters)[0]
    self.assertIsNotNone(ret['report_ref'])

    # check created objs: one FeatureSet per target, prefixed by target name
    report_obj = self.getWsClient().get_objects(
        [{'ref': ret['report_ref']}])[0]['data']
    self.assertIsNotNone(report_obj['objects_created'][0]['ref'])
    created_obj_0_info = self.getWsClient().get_object_info_new(
        {'objects': [{'ref': report_obj['objects_created'][0]['ref']}]})[0]
    self.assertEqual(created_obj_0_info[NAME_I], ama_name + '-' + obj_out_name)
    self.assertEqual(created_obj_0_info[TYPE_I].split('-')[0], obj_out_type)
    created_obj_1_info = self.getWsClient().get_object_info_new(
        {'objects': [{'ref': report_obj['objects_created'][1]['ref']}]})[0]
    self.assertEqual(created_obj_1_info[NAME_I],
                     'test_genomeset' + '-' + obj_out_name)
    self.assertEqual(created_obj_1_info[TYPE_I].split('-')[0], obj_out_type)
class ImportMetagenomeGFFFastaUtil:
    """Import a GFF+FASTA pair from the staging area as an
    Annotated Metagenome Assembly, and build the upload report."""

    def __init__(self, config):
        # service endpoints / credentials supplied by the SDK runtime
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        # NOTE(review): pinned to the 'dev' version of GenomeFileUtil — confirm
        # this is intentional for production use
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)
        # per-instance scratch subdirectory, unique per import run
        self.scratch = os.path.join(config['scratch'],
                                    'import_Metagenome_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)

    def import_metagenome_gff_fasta_from_staging(self, params):
        """
        import_metagenome_gff_fasta_from_staging: wrapper method for
        GenomeFileUtil.fasta_gff_to_metagenome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both fasta and gff files must be subdirectory file path in staging area
        e.g.
        for file: /data/bulk/user_name/file_name
        staging_file_subdir_path is file_name
        for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
        staging_file_subdir_path is subdir_1/subdir_2/file_name

        optional params:
        release: Release Or Version Of The Source Data
        genetic_code: Genetic Code For The Organism
        type: 'Reference', 'User upload', 'Representative'

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        """
        # logging.info('--->\nrunning ImportMetagenomeGFFFastaUtil.import_metagenome_gff_fasta_from_staging\n' +
        #              f'params:\n{json.dumps(params, indent=1)}')
        self.validate_import_metagenome_gff_fasta_from_staging_params(params)

        # replace each staging-area subdir path with a local {'path': ...}
        # dict pointing at the downloaded copy, as fasta_gff_to_metagenome expects
        for key in ('fasta_file', 'gff_file'):
            file_path = params[key]
            download_staging_file_params = {
                'staging_file_subdir_path': file_path
            }
            dfu_returnVal = self.dfu.download_staging_file(
                download_staging_file_params)
            params[key] = {'path': dfu_returnVal['copy_file_path']}

        returnVal = self.gfu.fasta_gff_to_metagenome(params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(download_staging_file_params.get('staging_file_subdir_path'),
        #                                            returnVal['genome_ref'])
        return returnVal

    def validate_import_metagenome_gff_fasta_from_staging_params(self, params):
        """
        validate_import_metagenome_gff_fasta_from_staging_params:
                    validates params passed to import_gff_fasta_from_staging method

        :raises ValueError: if a required parameter is missing, or if a
            numeric workspace id was passed where a workspace name is required
        """
        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function available
        if str(params["workspace_name"]).isdigit():
            error_msg = '"{}" parameter is a workspace id and workspace name is required'.format(
                params["workspace_name"])
            raise ValueError(error_msg)

    def generate_html_report(self, genome_ref, params):
        """
        _generate_html_report: generate html summary report

        Fills the report_template_genome.html template with overview,
        feature-count and contig data from the saved metagenome object,
        uploads the result to shock, and returns the html_links entry
        for KBaseReport.

        :param genome_ref: workspace reference of the saved metagenome
        :param params: import params (staging_file_subdir_path is read)
        :return: single-element list of html report descriptors
        """
        logging.info('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        # object_info[1] is the object name; object_info[10] is the metadata dict
        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        genome_metadata = genome_info[10]
        source = genome_metadata.get('Source')
        num_contigs = genome_metadata.get('Number contigs')
        size = genome_metadata.get('Size')
        gc_content = genome_metadata.get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(
            list(genome_data.get('feature_counts', {}).items()))

        # ordered so the overview table renders in a fixed row order
        genome_overview_data = collections.OrderedDict()
        genome_overview_data['Name'] = '{} ({})'.format(
            genome_name, genome_ref)
        #genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = '<br/><table>\n'
        for key, val in genome_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td></tr>\n'.format(val)
        overview_content += '</table>'

        # stringified [[name, count], ...] / [[contig, length], ...] lists,
        # substituted into the template's JS data placeholders
        feature_content = str(
            [[str(k), v]
             for k, v in list(genome_data.get('feature_counts', {}).items())
             if k != 'gene'])
        contig_content = str(
            [[str(c), l]
             for c, l in zip(genome_data.get('contig_ids', []),
                             genome_data.get('contig_lengths', []))])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_genome.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '*FEATURE_DATA*', feature_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        # pack the whole tmp_dir so the html page keeps any side files
        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']
        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description':
                'HTML summary report for imported Annotated Metagenome Assembly'
        })
        return html_report

    def generate_report(self, genome_ref, params):
        """
        Create the extended KBaseReport for an uploaded metagenome.

        :param genome_ref: Return Val from GenomeFileUtil for Uploaded metagenome
                           Need to get report warnings and message from it.
        :param params: import params (workspace_name is read)
        :return: dict with report_name and report_ref
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{
            'ref': genome_ref,
            'description': 'Imported Annotated Metagenome Assembly'
        }]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_metagenome_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
        return report_output
class ProkkaUtils: def __init__(self, config): self.scratch = config["scratch"] self.ctx = config['ctx']; self.callback_url = config["SDK_CALLBACK_URL"] self.ws_client = workspaceService(config["workspace-url"]) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url) self.dfu = DataFileUtil(self.callback_url) self.genome_api = GenomeAnnotationAPI(self.callback_url) self.sso_ref = None self.sso_event = None self.ec_to_sso = {} self.output_workspace = None @staticmethod def _get_input_value(params, key): """Get value of key after checking for its existence :param params: Params dictionary haystack :param key: Key to search in Params :return: Parameter Value :raises ValueError: raises an exception if the key doesn"t exist """ if not key in params: raise ValueError("Parameter " + key + " should be set in input parameters") return params[key] @staticmethod def _get_qualifier_value(qualifier): """Get first qualifier from the list of qualifiers :param qualifier: list contents of the qualifier from BCBio GFF Tools :return: first element in the list """ return qualifier[0] if (qualifier and len(qualifier) > 0) else None def download_seed_data(self): """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and the create a table from ec numbers to sso (ec_to_sso) :return: None """ # Download Seed Reference Data sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0] sso = sso_ret["data"] for sso_id in sso["term_hash"]: sso_name = sso["term_hash"][sso_id]["name"] if "(EC " in sso_name and sso_name.endswith(")"): ec = sso_name[sso_name.index("(EC ") + 4: -1].strip() sso_list = self.ec_to_sso.get(ec, None) if not sso_list: sso_list = [] self.ec_to_sso[ec] = sso_list sso_list.append(sso["term_hash"][sso_id]) print("EC found in SSO: " + str(len(self.ec_to_sso))) sso_info = sso_ret["info"] sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + 
str(sso_info[4]) with open("/kb/module/work/seed_so.json", "w") as outfile: json.dump(sso, outfile, sort_keys=True, indent=4) self.sso_ref = sso_ref def inspect_assembly(self, assembly_meta, assembly_ref): """Check to see if assembly has too many contigs and might not be a metagenome or non prokaryotic dataset :param assembly_meta: information about the assembly reference :param assembly_ref: the assembly reference number :return: a tuple containing gc_content and dna_size """ gc_content = float(assembly_meta.get("GC content")) dna_size = int(assembly_meta.get("Size")) n_contigs = 0 if "N Contigs" in assembly_meta: n_contigs = int(assembly_meta.get("N Contigs")) else: contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0] n_contigs = len(contig["data"]["contigs"]) if n_contigs >= 30000: message = """ Hmmm. There are over 30,000 contigs in this Assembly. It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. These bins can then be individually annotated as a single genome using Prokka. If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. Alternatively, you can try reducing the number of contigs using a filter app.") raise ValueError("Too many contigs for Prokka. See logs for details and suggestions """ print(message) raise ValueError("Too many contigs for Prokka. 
See logs for details and suggestions") assembly_info = namedtuple("assembly_info", "gc_content dna_size") return assembly_info(gc_content, dna_size) @staticmethod def create_renamed_assembly(assembly_fasta_filepath): """Rename records to be in the format of contig_N and output a new fasta file :param assembly_fasta_filepath: :return: A tuple with The path to the fasta file with renamed contigs the number of contigs, the mapping from old ids to new ids, and the contigs as SeqRecords """ records = [] new_ids_to_old = {} contig_counter = 0 for record in SeqIO.parse(assembly_fasta_filepath, "fasta"): contig_counter += 1 old_id = record.id new_id = "contig_" + str(contig_counter) sequence = record.seq # it has type "Seq" record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")") records.append(record) new_ids_to_old[new_id] = old_id renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna" SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta") renamed_assembly = namedtuple("renamed_assembly", "filepath contig_counter new_ids_to_old records") return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old, records) def run_prokka(self, params, subject_fasta_filepath): """Run Prokka :param params: Prokka parameters :param subject_fasta_filepath: The contigs or genes to run prokka against :return: The directory with all of the prokka output files """ output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4()) prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix", "mygenome"] # --kingdom [X] Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria") if "kingdom" in params and params["kingdom"]: prokka_cmd_list.extend(["--kingdom", str(params['kingdom'])]) # --genus [X] Genus name (triggers to use --usegenus) if "genus" in params and params["genus"]: prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"]) # --gcode [N] Genetic code / Translation 
table (set if --kingdom is set) (default "0") if "gcode" in params and params["gcode"]: prokka_cmd_list.extend(["--gcode", str(params["gcode"])]) else: prokka_cmd_list.extend(["--gcode", "0"]) # --gram [X] Gram: -/neg +/pos (default "") if "gram" in params and params["gram"]: raise ValueError("gram parameter is not supported in current Prokka installation") # --metagenome Improve gene predictions for highly fragmented genomes (default OFF) if "metagenome" in params and params["metagenome"] == 1: prokka_cmd_list.append("--metagenome") # --rawproduct Do not clean up /product annotation (default OFF) if "rawproduct" in params and params["rawproduct"] == 1: prokka_cmd_list.append("--rawproduct") # --fast Fast mode - skip CDS /product searching (default OFF) if "fast" in params and params["fast"] == 1: prokka_cmd_list.append("--fast") # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1") if "mincontiglen" in params and params["mincontiglen"]: prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])]) # --evalue [n.n] Similarity e-value cut-off (default "1e-06") if "evalue" in params and params["evalue"]: prokka_cmd_list.extend(["--evalue", str(params["evalue"])]) # --rfam Enable searching for ncRNAs with Infernal+Rfam (SLOW!) 
(default "0") if "rfam" in params and params["rfam"] == 1: prokka_cmd_list.append("--rfam") # --norrna Don"t run rRNA search (default OFF) if "norrna" in params and params["norrna"] == 1: prokka_cmd_list.append("--norrna") # --notrna Don"t run tRNA search (default OFF) if "notrna" in params and params["notrna"] == 1: prokka_cmd_list.append("--notrna") prokka_cmd_list.append(subject_fasta_filepath) print("Prokka command line: " + str(prokka_cmd_list)) #tbl2asn or some other non essential prokka binary will fail, so supress that try: check_output(prokka_cmd_list, cwd=self.scratch) except CalledProcessError as e: pprint(e) return output_dir @staticmethod def retrieve_prokka_results(output_dir): """ Gather up the relevant prokka results, load the records from the results files :param output_dir: :return: A tuple containing Sequences from the .faa .ffn files and the gff_filepath """ faa_file = output_dir + "/mygenome.faa" cds_to_prot = {} for record in SeqIO.parse(faa_file, "fasta"): cds_to_prot[record.id] = str(record.seq) ffn_file = output_dir + "/mygenome.ffn" cds_to_dna = {} for record in SeqIO.parse(ffn_file, "fasta"): cds_to_dna[record.id] = str(record.seq) gff_file = output_dir + "/mygenome.gff" if not os.path.isfile(gff_file): raise ValueError("PROKKA output GFF file is not found") prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath") return prokka_results(cds_to_prot, cds_to_dna, gff_file) def parse_prokka_results(self, **prokka_parse_parameters): """ Go through the prokka results from the input contigs and then create the features, mrnas and cdss components of the KbaseGenome.Genome object for genome annotation only. 
:param prokka_parse_parameters: gff_filepath, mappings :return: A tuple with Genome:features Genome:cdss Genome:mrnas report_message of genes discovered """ gff_filepath = prokka_parse_parameters["gff_filepath"] cds_to_dna = prokka_parse_parameters["cds_to_dna"] cds_to_prot = prokka_parse_parameters["cds_to_prot"] new_ids_to_old = prokka_parse_parameters["new_ids_to_old"] evidence = self.make_annotation_evidence() cdss = [] mrnas = [] features = [] non_hypothetical = 0 genes_with_ec = 0 genes_with_sso = 0 prot_lengths = [] with open(gff_filepath, "r") as f1: for rec in GFF.parse(f1): contig_id = new_ids_to_old[str(rec.id)] for ft in rec.features: loc = ft.location min_pos = int(loc.start) + 1 max_pos = int(loc.end) strand = "+" if loc.strand == 1 else "-" flen = max_pos - min_pos + 1 start = min_pos if strand == "+" else max_pos location = [[contig_id, start, strand, flen]] qualifiers = ft.qualifiers generated_id = self._get_qualifier_value(qualifiers.get("ID")) if not generated_id: # Skipping feature with no ID (mostly repeat regions) continue dna = cds_to_dna.get(generated_id) if not dna: # Skipping feature with no DNA (mostly repeat regions) continue name = self._get_qualifier_value(qualifiers.get("Name")) ec = self._get_qualifier_value(qualifiers.get("eC_number")) gene = self._get_qualifier_value(qualifiers.get("gene")) product = self._get_qualifier_value(qualifiers.get("product")) fid = generated_id aliases = [] if name: aliases.append(name) if gene: aliases.append(gene) if ec: aliases.append(ec) genes_with_ec += 1 md5 = hashlib.md5(dna.encode()).hexdigest() feature = {"id": fid, "location": location, "type": "gene", "aliases": aliases, "md5": md5, "dna_sequence": dna, "dna_sequence_length": len(dna), } if product: feature["function"] = product if product != "hypothetical protein": non_hypothetical += 1 if ec and ec in self.ec_to_sso: sso_list = self.ec_to_sso[ec] sso_terms = {} for sso_item in sso_list: sso_terms[sso_item["id"]] = {"id": sso_item["id"], 
"evidence": [evidence], "term_name": sso_item["name"], "ontology_ref": self.sso_ref, "term_lineage": []} feature["ontology_terms"] = {"SSO": sso_terms} genes_with_sso += 1 cds = None mrna = None prot = cds_to_prot.get(generated_id) if prot: cds_id = fid + "_CDS" mrna_id = fid + "_mRNA" prot_len = len(prot) prot_lengths.append(prot_len) feature["protein_translation"] = prot feature["protein_translation_length"] = prot_len feature["cdss"] = [cds_id] feature["mrnas"] = [mrna_id] cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid, "parent_mrna": mrna_id, "function": (product if product else ""), "ontology_terms": {}, "protein_translation": prot, "protein_translation_length": prot_len, "aliases": aliases} mrna = {"id": mrna_id, "location": location, "md5": md5, "parent_gene": fid, "cds": cds_id} features.append(feature) if cds: cdss.append(cds) if mrna: mrnas.append(mrna) # Prepare report report = "" report += "Number of genes predicted: " + str(len(features)) + "\n" report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n" report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n" report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n" report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n" report += "Average protein length: " + str(int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n" annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message") return annotated_assembly(features, cdss, mrnas, report) def get_new_annotations(self, gff_filepath): """ :param gff_filepath: A dictionary of ids with products and ec numbers :return: """ evidence = self.make_annotation_evidence() genome = {} with open(gff_filepath, "r") as f: for rec in GFF.parse(f): gid = rec.id gene_features = {"id": id} for feature in rec.features: qualifiers = feature.qualifiers if "product" in qualifiers: gene_features["function"] = " 
".join(qualifiers["product"]) if "eC_number" in qualifiers: ec_numbers = qualifiers["eC_number"] sso_terms = dict() for ec in ec_numbers: sso_list = self.ec_to_sso.get(ec, []) for sso_item in sso_list: sso_terms[sso_item["id"]] = {"id": sso_item["id"], "evidence": [evidence], "term_name": sso_item["name"], "ontology_ref": self.sso_ref, "term_lineage": []} gene_features["ontology_terms"] = sso_terms genome[gid] = gene_features return genome def write_genome_to_fasta(self, genome_data): """ :param genome_data: :return: """ fasta_for_prokka_filepath = os.path.join(self.scratch, "features_" + str(uuid.uuid4()) + ".fasta") count = 0 with open(fasta_for_prokka_filepath, "w") as f: for item in genome_data["data"]["features"]: if "id" not in item or "dna_sequence" not in item: print("This feature does not have a valid dna sequence.") else: f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n") count += 1 print("Finished printing to" + fasta_for_prokka_filepath) if os.stat(fasta_for_prokka_filepath).st_size == 0: raise Exception( "This genome does not contain features with DNA_SEQUENCES. 
Fasta file is empty.") return fasta_for_prokka_filepath def make_sso_ontology_event(self): """ :param sso_ref: Reference to the annotation library set :return: Ontology_event to be appended to the list of genome ontology events """ time_string = str( datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) return { "method": "Prokka Annotation", "method_version": version, "timestamp": time_string, "id": "SSO", "ontology_ref": self.sso_ref } def make_annotation_evidence(self): """ Create a dict for the evidence field for the genome :param sso_ref: Reference to the annotation library set :return: Ontology_event to be appended to the list of genome ontology events """ time_string = str( datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) return { "method": "Prokka Annotation (Evidence)", "method_version": version, "timestamp": time_string, } def create_genome_ontology_fields(self, genome_data): """ Create ontology event fields for a genome object :param genome_data: A genome object's data filed :return: a named tuple containg the modified genome object and a new ontology event index """ # Make sure ontologies_events exist sso_event = self.make_sso_ontology_event() ontology_event_index = 0 if 'ontology_events' in genome_data['data']: genome_data['data']['ontology_events'].append(sso_event) ontology_event_index += len(genome_data['data']['ontology_events']) - 1 else: genome_data['data']['ontology_events'] = [sso_event] genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index') return genome_obj_modified(genome_data, ontology_event_index) @staticmethod def old_genome_ontologies(feature, new_ontology): """ Update the feature's ontologies for an old genome :param 
feature: Feature to update :param new_ontology: New Ontology to update with :return: The feature with the ontology updated, in the old style """ if "ontology_terms" not in feature: feature["ontology_terms"] = {"SSO": {}} if "SSO" not in feature["ontology_terms"]: feature["ontology_terms"]["SSO"] = {} for key in new_ontology.keys(): feature["ontology_terms"]["SSO"][key] = new_ontology[key] return feature @staticmethod def new_genome_ontologies(feature, new_ontology, ontology_event_index): """ Update the feature's ontologies for a new genome :param feature: Feature to update :param new_ontology: New Ontology to update with :param ontology_event_index: Ontology index to update the feature with :return: the updated feature """ if "ontology_terms" not in feature: feature["ontology_terms"] = {"SSO": {}} if "SSO" not in feature["ontology_terms"]: feature["ontology_terms"]["SSO"] = {} for key in new_ontology.keys(): id = new_ontology[key]["id"] if id in feature["ontology_terms"]["SSO"]: feature["ontology_terms"]["SSO"][id].append(ontology_event_index) else: feature["ontology_terms"]["SSO"][id] = [ontology_event_index] return feature def annotate_genome_with_new_annotations(self, **annotation_args): """ Annotate the genome with new annotations for Genome ReAnnotation :param annotation_args: genome_data from the genome obj, new_annotations from prokka, and the output_genome_name :return: A tuple containg the genome_ref, filepaths for the function and ontology summary, and stats about the annotations """ genome_data = annotation_args["genome_data"] new_annotations = annotation_args["new_annotations"] new_genome = False if 'feature_counts' in genome_data['data']: new_genome = True genome_obj_modified = self.create_genome_ontology_fields(genome_data) genome_data = genome_obj_modified.genome_data ontology_event_index = genome_obj_modified.ontology_event_index stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0, "found_functions": 0, 
"new_ontologies": 0} function_summary_fp = os.path.join(self.scratch, "ontology_report") ontology_summary_fp = os.path.join(self.scratch, "function_report") onto_r = open(function_summary_fp, "w") func_r = open(ontology_summary_fp, "w") func_r.write("function_id current_function new_function\n") onto_r.write("function_id current_ontology new_ontology\n") ontologies_present = {"SSO": {}} for i, feature in enumerate(genome_data["data"]["features"]): fid = feature["id"] current_function = feature.get("function", "") current_functions = feature.get("functions", []) current_ontology = feature.get("ontology_terms", None) new_function = "" new_ontology = dict() if fid in new_annotations: # Set Function new_function = new_annotations[fid].get("function", "") if new_function and "hypothetical protein" not in new_function: if (new_function != current_function and new_function not in current_functions): stats['new_functions'] += 1 genome_data["data"]["features"][i]["function"] = new_function genome_data["data"]["features"][i]["functions"] = [new_function] stats['found_functions'] += 1 # Set Ontologies new_ontology = new_annotations[fid].get("ontology_terms", None) if new_ontology: stats['new_ontologies'] += 1 if new_genome: # New style genome_data["data"]["features"][i] = self. \ new_genome_ontologies(feature, new_ontology, ontology_event_index) # Add to ontologies Present for key in new_ontology.keys(): oid = new_ontology[key]["id"] name = new_ontology[key].get("name", "Unknown") ontologies_present["SSO"][oid] = name else: genome_data["data"]["features"][i] = self. 
\ old_genome_ontologies(feature, new_ontology) if current_function: func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n") else: func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n") onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n") func_r.close() onto_r.close() if ontologies_present: if "ontologies_present" in genome_data["data"]: if "SSO" in genome_data["data"]["ontologies_present"]: for key, value in ontologies_present["SSO"].items(): genome_data["data"]["ontologies_present"]["SSO"][key] = value else: genome_data["data"]["ontologies_present"] = ontologies_present["SSO"] else: genome_data["data"]["ontologies_present"] = ontologies_present info = self.gfu.save_one_genome({"workspace": self.output_workspace, "name": annotation_args["output_genome_name"], "data": genome_data["data"], "provenance": self.ctx.provenance()})["info"] genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]) annotated_genome = namedtuple("annotated_genome", "genome_ref function_summary_filepath ontology_summary_filepath stats") return annotated_genome(genome_ref, function_summary_fp, ontology_summary_fp, stats) def upload_file(self, filepath, message="Annotation report generated by kb_prokka"): """ Upload a file to shock :param filepath: File to upload :param message: Optional Upload Message :return: """ output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"] print(f"Uploaded filepath {filepath} to shock and got id {output_file_shock_id}") return {"shock_id": output_file_shock_id, "name": os.path.basename(filepath), "label": os.path.basename(filepath), "description": message} def report_annotated_genome(self, genome): """ Create report output with newly reannotated genome, and some stats :param genome: Reannotated Genome Reference, Report Files and Stats :return: Reference to Report Object """ genome_ref = genome.genome_ref stats = genome.stats file_links = 
[self.upload_file(genome.ontology_summary_filepath), self.upload_file(genome.function_summary_filepath)] report_message = ("Genome Ref:{0}\n" "Number of features sent into prokka:{1}\n" "New functions found:{2}\n" "Ontology terms found:{3}\n" ).format(genome_ref, stats["current_functions"], stats["new_functions"], stats["new_ontologies"]) report_info = self.kbr.create_extended_report( {"message": report_message, "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}], "file_links": file_links, "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()), "workspace_name": self.output_workspace }) return {"output_genome_ref": genome_ref, "report_name": report_info["name"], "report_ref": report_info["ref"]} def annotate_genome(self, params): """ User input an existing genome to re-annotate. :param params: Reference to the genome, Output File Name, UI Parameters :return: Report with Reannotated Genome and Stats about it """ self.download_seed_data() self.output_workspace = params["output_workspace"] genome_ref = self._get_input_value(params, "object_ref") output_name = self._get_input_value(params, "output_genome_name") # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0] genome_data = \ self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[ "genomes"][0] fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data) output_dir = self.run_prokka(params, fasta_for_prokka_filepath) prokka_results = self.retrieve_prokka_results(output_dir) new_annotations = self.get_new_annotations(prokka_results.gff_filepath) annotated_genome = self.annotate_genome_with_new_annotations(genome_data=genome_data, new_annotations=new_annotations, output_genome_name=output_name) return self.report_annotated_genome(annotated_genome) def save_genome(self, params, prokka_results, renamed_assembly, assembly_ref): """ Save KBaseGenomes.Genome object, inputs: params - input parameters from .spec prokka_results - result 
files from prokka run renamed_assembly - assembly object with renamed contigs assembly_ref - reference to input assembly object output: genome_ref: saved genome object reference report_message: message associated with er """ # Parse Results output_genome_name = self._get_input_value(params, "output_genome_name") output_workspace = self._get_input_value(params, "output_workspace") annotated_assembly = self.parse_prokka_results(gff_filepath=prokka_results.gff_filepath, cds_to_dna=prokka_results.cds_to_dna, cds_to_prot=prokka_results.cds_to_prot, new_ids_to_old=renamed_assembly.new_ids_to_old) # Force defaults for optional parameters that may be set to None scientific_name = 'Unknown' if 'scientific_name' in params and params['scientific_name']: scientific_name = params['scientific_name'] domain = "Bacteria" if 'kingdom' in params and params['kingdom']: domain = params['kingdom'] gcode = 0 if 'gcode' in params and params['gcode']: gcode = params['gcode'] genome = {"id": "Unknown", "features": annotated_assembly.features, "scientific_name": scientific_name, "domain": domain, "genetic_code": gcode, "assembly_ref": assembly_ref, "cdss": annotated_assembly.cdss, "mrnas": annotated_assembly.mrnas, "source": "PROKKA annotation pipeline", "gc_content": assembly_info.gc_content, "dna_size": assembly_info.dna_size, "reference_annotation": 0} info = self.gfu.save_one_genome({"workspace": output_workspace, "name": output_genome_name, "data": genome, "provenance": self.ctx.provenance()})["info"] genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]) return genome_ref, annotated_assembly.report_message def _replace_id(self, line, new_ids_to_old, fasta=False): """ inputs: line - text line to replace id of new_ids_to_old - dict of newly assigned ids to input (old) ids fasta. 
- bool wheter file is fasta, else is gff file outputs: returned text line """ if fasta: if '>' == line[0]: tokens = line.split() if len(tokens) > 1: id_ = tokens[0][1:].strip() rest = ' '.join(tokens[1:]) return '>' + new_ids_to_old[id_] + ' ' + rest else: id_ = tokens[0][1:].strip() return '>' + new_ids_to_old[id_] else: return line else: id_, rest = line.split('\t')[0], line.split('\t')[1:] return '\t'.join([new_ids_to_old[id_]] + rest) def _rename_and_separate_gff(self, gff, new_ids_to_old): """ rename the output gff file ids and separate the fasta file from the gff3. inputs: gff - path to gff_file new_ids_to_old - dict of newly assigned ids to input (old) ids """ fasta = [] save = [] with open(gff) as f: for l in f: if '##FASTA' in l: for line in f: fasta.append(self._replace_id(line, new_ids_to_old, True)) break if '##' in l: continue save.append(self._replace_id(l, new_ids_to_old)) gff_path = gff + "_edited.gff" with open(gff_path, 'w') as f: for l in save: f.write(l.strip() + '\n') fasta_path = gff + "edited.fa" with open(fasta_path, 'w') as f: for l in fasta: f.write(l.strip() + '\n') return gff_path, fasta_path def save_metagenome(self, params, gff_file, fasta_file): """ inputs: params - input "params" from .spec gff_file - path to gff_file to save as Metagenome fasta_file - path to fasta_file to save as Metagenome outputs: metagenome_ref - saved KBaseMetagenomes.AnnotatedMetagenomeAssembly object ref """ output_name = self._get_input_value(params, "output_metagenome_name") output_workspace = self._get_input_value(params, "output_workspace") metagenome_ref = self.gfu.fasta_gff_to_metagenome({ "fasta_file": {'path': fasta_file}, "gff_file": {'path': gff_file}, "genome_name": output_name, "workspace_name": output_workspace, "generate_missing_genes": True })['genome_ref'] return metagenome_ref def annotate_metagenome(self, params): """ Given a KBaseMetagenome.AnnotatedMetagenomeAssembly object, reannotate it using Prokka. 
Saves a KBaseMetagenome.AnnotatedMetagenomeAssembly as output. inputs: params - input "params" from .spec outputs: output_metagenome_ref - saved KBaseMetagenomes.AnnotatedMetagenomeAssembly object ref report_name - name of outgoing report object report_ref - reference to Report object """ metagenome_ref = self._get_input_value(params, "object_ref") output_genome_name = self._get_input_value(params, "output_metagenome_name") output_workspace = self._get_input_value(params, "output_workspace") # orig_fasta_file = self.au.get_fastas({'ref_lst': [metagenome_ref]}) obj_data = self.dfu.get_objects({"object_refs": [metagenome_ref]})['data'][0]['data'] orig_fasta_file = self.au.get_assembly_as_fasta({"ref": obj_data['assembly_ref']})["path"] renamed_assembly = self.create_renamed_assembly(orig_fasta_file) output_dir = self.run_prokka(params, renamed_assembly.filepath) # need to analyse output gff and fastas from prokka. gff_file, fasta_file = self._rename_and_separate_gff(output_dir + "/mygenome.gff", renamed_assembly.new_ids_to_old) metagenome_ref = self.save_metagenome(params, gff_file, fasta_file) report_message = "Metagenome saved to: " + output_workspace + "/" + \ output_genome_name + "\n" report_info = self.kbr.create_extended_report({ "message": report_message, "objects_created": [{"ref": metagenome_ref, "description": "Annotated Metagenome Assembly"}], "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()), "workspace_name": output_workspace }) return { "output_metagenome_ref": metagenome_ref, "report_name": report_info["name"], "report_ref": report_info["ref"] } def annotate_assembly(self, params, assembly_info): """ Annotate an assembly with Prokka. The steps include to download the assembly as a fasta file, rename the contigs, run prokka against the contigs, parse the results, and finally, create and upload a genome object. 
:param params: object reference, output_genome_name and output_workspace :param assembly_info: Information used to determine if the assembly is too big :return: Report with newly annotated assembly as a genome, and stats about it """ self.download_seed_data() output_workspace = params["output_workspace"] if params.get('metagenome'): save_type = "Annotated Metagenome Assembly" output_field_name = 'output_metagenome_ref' output_name = self._get_input_value(params, "output_metagenome_name") else: save_type = "Annotated Genome" output_field_name = "output_genome_ref" output_name = self._get_input_value(params, "output_genome_name") assembly_ref = self._get_input_value(params, "object_ref") output_workspace = self._get_input_value(params, "output_workspace") # for now, don't do this check if we are using a metagenome if not params.get('metagenome'): assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref) orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"] # Rename Assembly and Keep Track of Old Contigs renamed_assembly = self.create_renamed_assembly(orig_fasta_file) # Run Prokka with the modified, renamed fasta file output_dir = self.run_prokka(params, renamed_assembly.filepath) # Prokka_results if params.get('metagenome'): gff_file, fasta_file = self._rename_and_separate_gff(output_dir + "/mygenome.gff", renamed_assembly.new_ids_to_old) genome_ref = self.save_metagenome(params, gff_file, fasta_file) report_message = "" else: prokka_results = self.retrieve_prokka_results(output_dir) genome_ref, report_message = self.save_genome(params, prokka_results, renamed_assembly, assembly_ref) report_message = f"{save_type} saved to: " + output_workspace + "/" + \ output_name + "\n" + report_message report_info = self.kbr.create_extended_report({ "message": report_message, "objects_created": [{"ref": genome_ref, "description": save_type}], "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()), "workspace_name": output_workspace }) 
return { output_field_name: genome_ref, "report_name": report_info["name"], "report_ref": report_info["ref"] }
def test_KBaseDataObjectToFileUtils_AnnotatedMetagenomeAssemblyToFASTA_01(self):
    """
    Upload a test AnnotatedMetagenomeAssembly, then check that
    AnnotatedMetagenomeAssemblyToFASTA extracts all of its protein features.
    """
    # Upload test data.
    ama_name = "ama_test.AMA"
    ama_feature_cnt = 888  # expected CDS count in the test GFF
    ama_contigs_file_src = "data/AnnotatedMetagenomeAssembly/ama_contigs.fasta"
    ama_genes_file_src = "data/AnnotatedMetagenomeAssembly/ama_genes.gff"
    shared_dir = "/kb/module/work/tmp"
    ama_contigs_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_contigs_file_src))
    ama_genes_file_upload = os.path.join(
        shared_dir, os.path.basename(ama_genes_file_src))
    shutil.copy(ama_contigs_file_src, ama_contigs_file_upload)
    shutil.copy(ama_genes_file_src, ama_genes_file_upload)

    ama_upload_params = {
        "workspace_name": self.getWsName(),
        "genome_name": ama_name,
        "fasta_file": {
            "path": ama_contigs_file_upload
        },
        "gff_file": {
            "path": ama_genes_file_upload
        },
        "source": "GFF",
        "scientific_name": "TEST AMA",
        "generate_missing_genes": "True"
    }
    try:
        SERVICE_VER = 'dev'
        GFU = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=self.getContext()['token'],
                             service_ver=SERVICE_VER)
        print("UPLOADING AMA: " + ama_name + " to WORKSPACE " +
              self.getWsName() + " ...")
        ama_upload_result = GFU.fasta_gff_to_metagenome(ama_upload_params)
    except Exception as err:
        # FIX: bare `except:` swallowed the real cause; chain it instead.
        raise ValueError("unable to upload test AMA data object") from err
    pprint(ama_upload_result)
    ama_ref = ama_upload_result['metagenome_ref']

    # Get protein fastas.
    output_dir = os.path.join(self.scratch, 'fasta_out.' + str(uuid.uuid4()))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    parameters = {
        'ama_ref': ama_ref,
        'file': 'test_ama_proteins.fasta',
        'dir': output_dir,
        'console': [],
        'invalid_msgs': [],
        'residue_type': 'protein',
        'feature_type': 'CDS',
        'record_id_pattern': '%%feature_id%%',
        'record_desc_pattern': '[%%genome_id%%]',
        'case': 'upper',
        'linewrap': 50
    }
    ret = self.getImpl().AnnotatedMetagenomeAssemblyToFASTA(
        self.getContext(), parameters)[0]
    self.assertIsNotNone(ret['fasta_file_path'])
    self.assertIsNotNone(ret['feature_ids'])
    self.assertNotEqual(len(ret['feature_ids']), 0)
    self.assertEqual(len(ret['feature_ids']), ama_feature_cnt)
    pass