class NarrativeManager:

    KB_CELL = 'kb-cell'
    KB_TYPE = 'type'
    KB_APP_CELL = 'kb_app'
    KB_FUNCTION_CELL = 'function_input'
    KB_OUTPUT_CELL = 'function_output'
    KB_ERROR_CELL = 'kb_error'
    KB_CODE_CELL = 'kb_code'
    KB_STATE = 'widget_state'

    DEBUG = False

    DATA_PALETTES_TYPES = DataPaletteTypes(False)

    def __init__(self, config, ctx, set_api_client, data_palette_client):
        self.narrativeMethodStoreURL = config['narrative-method-store']
        self.set_api_cache = set_api_client  # DynamicServiceCache type
        self.data_palette_client = data_palette_client  # DynamicServiceCache type
        self.token = ctx["token"]
        self.user_id = ctx["user_id"]
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.intro_md_file = config['intro-markdown-file']
        # We switch DPs on only for the internal Continuous Integration environment for now:
        if config['kbase-endpoint'].startswith("https://ci.kbase.us/") or \
                'USE_DP' in os.environ:
            self.DATA_PALETTES_TYPES = DataPaletteTypes(True)

    def list_objects_with_sets(self, ws_id=None, ws_name=None, workspaces=None,
                               types=None, include_metadata=0,
                               include_data_palettes=0):
        if not workspaces:
            if not ws_id and not ws_name:
                raise ValueError(
                    "One and only one of 'ws_id', 'ws_name', 'workspaces' "
                    "parameters should be set")
            workspaces = [self._get_workspace_name_or_id(ws_id, ws_name)]
        return self._list_objects_with_sets(workspaces, types, include_metadata,
                                            include_data_palettes)

    def _list_objects_with_sets(self, workspaces, types, include_metadata,
                                include_data_palettes):
        type_map = None
        if types is not None:
            type_map = {key: True for key in types}

        processed_refs = {}
        data = []
        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: processing sets")
        t1 = time.time()
        set_ret = self.set_api_cache.call_method(
            "list_sets",
            [{'workspaces': workspaces,
              'include_set_item_info': 1,
              'include_metadata': include_metadata}],
            self.token)
        sets = set_ret['sets']
        for set_info in sets:
            # Process
            target_set_items = []
            for set_item in set_info['items']:
                target_set_items.append(set_item['info'])
            if self._check_info_type(set_info['info'], type_map):
                data_item = {'object_info': set_info['info'],
                             'set_items': {'set_items_info': target_set_items}}
                data.append(data_item)
                processed_refs[set_info['ref']] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t1) + ")")

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: loading ws_info")
        t2 = time.time()
        ws_info_list = []
        # for ws in workspaces:
        if len(workspaces) == 1:
            ws = workspaces[0]
            ws_id = None
            ws_name = None
            if str(ws).isdigit():
                ws_id = int(ws)
            else:
                ws_name = str(ws)
            ws_info_list.append(
                self.ws.get_workspace_info({"id": ws_id, "workspace": ws_name}))
        else:
            ws_map = {key: True for key in workspaces}
            for ws_info in self.ws.list_workspace_info({'perm': 'r'}):
                if ws_info[1] in ws_map or str(ws_info[0]) in ws_map:
                    ws_info_list.append(ws_info)
        if self.DEBUG:
            print("    (time=" + str(time.time() - t2) + ")")

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: loading workspace objects")
        t3 = time.time()
        for info in WorkspaceListObjectsIterator(
                self.ws,
                ws_info_list=ws_info_list,
                list_objects_params={'includeMetadata': include_metadata}):
            item_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            if item_ref not in processed_refs and self._check_info_type(info, type_map):
                data_item = {'object_info': info}
                data.append(data_item)
                processed_refs[item_ref] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t3) + ")")

        return_data = {"data": data}

        if include_data_palettes == 1:
            if self.DEBUG:
                print("NarrativeManager._list_objects_with_sets: processing DataPalettes")
            t5 = time.time()
            dp_ret = self.data_palette_client.call_method(
                "list_data",
                [{'workspaces': workspaces, 'include_metadata': include_metadata}],
                self.token)
            for item in dp_ret['data']:
                ref = item['ref']
                if self._check_info_type(item['info'], type_map):
                    data_item = None
                    if ref in processed_refs:
                        data_item = processed_refs[ref]
                    else:
                        data_item = {'object_info': item['info']}
                        processed_refs[ref] = data_item
                        data.append(data_item)
                    dp_info = {}
                    if 'dp_ref' in item:
                        dp_info['ref'] = item['dp_ref']
                    if 'dp_refs' in item:
                        dp_info['refs'] = item['dp_refs']
                    data_item['dp_info'] = dp_info
            return_data["data_palette_refs"] = dp_ret['data_palette_refs']
            if self.DEBUG:
                print("    (time=" + str(time.time() - t5) + ")")

        return return_data

    def _check_info_type(self, info, type_map):
        if type_map is None:
            return True
        obj_type = info[2].split('-')[0]
        return type_map.get(obj_type, False)
    def copy_narrative(self, newName, workspaceRef, workspaceId):
        time_ms = int(round(time.time() * 1000))
        newWsName = self.user_id + ':narrative_' + str(time_ms)
        # add the 'narrative' field to newWsMeta later.
        newWsMeta = {"narrative_nice_name": newName, "searchtags": "narrative"}

        # start with getting the existing narrative object.
        currentNarrative = self.ws.get_objects([{'ref': workspaceRef}])[0]
        if not workspaceId:
            workspaceId = currentNarrative['info'][6]

        # Let's prepare exceptions for cloning the workspace.
        # 1) currentNarrative object:
        excluded_list = [{'objid': currentNarrative['info'][0]}]
        # 2) let's exclude objects of types under DataPalette handling:
        ## DP CODE
        # data_palette_type = "DataPalette.DataPalette"
        # excluded_types = [data_palette_type]
        # excluded_types.extend(self.DATA_PALETTES_TYPES.keys())
        # add_to_palette_list = []
        # dp_detected = False
        ## END DP CODE
        # for obj_type in excluded_types:
        #     list_objects_params = {'type': obj_type}
        ## DP CODE
        #     if obj_type == data_palette_type:
        #         list_objects_params['showHidden'] = 1
        ## END DP CODE
        #     for info in WorkspaceListObjectsIterator(self.ws,
        #                                              ws_id=workspaceId,
        #                                              list_objects_params=list_objects_params):
        ## DP CODE
        #         if obj_type == data_palette_type:
        #             dp_detected = True
        #         else:
        #             add_to_palette_list.append({
        #                 'ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        #             })
        ## END DP CODE
        #         excluded_list.append({'objid': info[0]})

        # clone the workspace EXCEPT for the currentNarrative object
        newWsId = self.ws.clone_workspace({'wsi': {'id': workspaceId},
                                           'workspace': newWsName,
                                           'meta': newWsMeta,
                                           'exclude': excluded_list})[0]
        try:
            ## DP CODE
            # if dp_detected:
            #     self.data_palette_client.call_method(
            #         "copy_palette",
            #         [{'from_workspace': str(workspaceId), 'to_workspace': str(newWsId)}],
            #         self.token)
            # if len(add_to_palette_list) > 0:
            #     # There are objects in the source workspace that have types under
            #     # DataPalette handling but are physically stored in the source
            #     # workspace rather than saved in a DataPalette object, so they
            #     # weren't copied by "dps.copy_palette".
            #     self.data_palette_client.call_method(
            #         "add_to_palette",
            #         [{'workspace': str(newWsId), 'new_refs': add_to_palette_list}],
            #         self.token)
            ## END DP CODE

            # update the ref inside the narrative object and the new workspace metadata.
            newNarMetadata = currentNarrative['info'][10]
            newNarMetadata['name'] = newName
            newNarMetadata['ws_name'] = newWsName
            newNarMetadata['job_info'] = json.dumps({'queue_time': 0,
                                                     'running': 0,
                                                     'completed': 0,
                                                     'run_time': 0,
                                                     'error': 0})

            is_temporary = newNarMetadata.get('is_temporary', 'false')
            if 'is_temporary' not in newNarMetadata:
                if newNarMetadata['name'] == 'Untitled' or newNarMetadata['name'] is None:
                    is_temporary = 'true'
                newNarMetadata['is_temporary'] = is_temporary

            currentNarrative['data']['metadata']['name'] = newName
            currentNarrative['data']['metadata']['ws_name'] = newWsName
            currentNarrative['data']['metadata']['job_ids'] = {
                'apps': [],
                'methods': [],
                'job_usage': {'queue_time': 0, 'run_time': 0}
            }

            # save the shiny new Narrative so it's at version 1
            newNarInfo = self.ws.save_objects({
                'id': newWsId,
                'objects': [{'type': currentNarrative['info'][2],
                             'data': currentNarrative['data'],
                             'provenance': currentNarrative['provenance'],
                             'name': currentNarrative['info'][1],
                             'meta': newNarMetadata}]})
            # now, just update the workspace metadata to point
            # to the new narrative object
            if 'worksheets' in currentNarrative['data']:  # handle legacy.
                num_cells = len(currentNarrative['data']['worksheets'][0]['cells'])
            else:
                num_cells = len(currentNarrative['data']['cells'])
            newNarId = newNarInfo[0][0]
            self.ws.alter_workspace_metadata({
                'wsi': {'id': newWsId},
                'new': {'narrative': str(newNarId),
                        'is_temporary': is_temporary,
                        'cell_count': str(num_cells)}})
            return {'newWsId': newWsId, 'newNarId': newNarId}
        except Exception:
            # delete the copy of the workspace so it's out of the way - it's broken
            self.ws.delete_workspace({'id': newWsId})
            raise

    def create_new_narrative(self, app, method, appparam, appData, markdown,
                             copydata, importData, includeIntroCell, title):
        if app and method:
            raise ValueError("Must provide no more than one of the app or method params")

        if not importData and copydata:
            importData = copydata.split(';')

        if not appData and appparam:
            appData = []
            for tmp_item in appparam.split(';'):
                tmp_tuple = tmp_item.split(',')
                step_pos = None
                if tmp_tuple[0]:
                    try:
                        step_pos = int(tmp_tuple[0])
                    except ValueError:
                        pass
                appData.append([step_pos, tmp_tuple[1], tmp_tuple[2]])

        cells = None
        if app:
            cells = [{"app": app}]
        elif method:
            cells = [{"method": method}]
        elif markdown:
            cells = [{"markdown": markdown}]
        narr_info = self._create_temp_narrative(cells, appData, importData,
                                                includeIntroCell, title)
        if title is not None:
            # update workspace info so it's not temporary
            pass
        return narr_info

    def _get_intro_markdown(self):
        """
        Reads and returns the introductory markdown text.
        """
        with open(self.intro_md_file) as intro_file:
            intro_md = intro_file.read()
        return intro_md

    def _create_temp_narrative(self, cells, parameters, importData,
                               includeIntroCell, title):
        # Migration to Python of the JavaScript class from
        # https://github.com/kbase/kbase-ui/blob/4d31151d13de0278765a69b2b09f3bcf0e832409/src/client/modules/plugins/narrativemanager/modules/narrativeManager.js#L414
        narr_id = int(round(time.time() * 1000))
        workspaceName = self.user_id + ':narrative_' + str(narr_id)
        narrativeName = "Narrative." + str(narr_id)

        ws = self.ws
        ws_info = ws.create_workspace({'workspace': workspaceName,
                                       'description': ''})
        [narrativeObject, metadataExternal] = self._fetchNarrativeObjects(
            workspaceName, cells, parameters, includeIntroCell, title)
        is_temporary = 'true'
        if title is not None and title != 'Untitled':
            is_temporary = 'false'
        metadataExternal['is_temporary'] = is_temporary
        objectInfo = ws.save_objects({
            'workspace': workspaceName,
            'objects': [{'type': 'KBaseNarrative.Narrative',
                         'data': narrativeObject,
                         'name': narrativeName,
                         'meta': metadataExternal,
                         'provenance': [{'script': 'NarrativeManager.py',
                                         'description': 'Created new '
                                                        'Workspace/Narrative bundle.'}],
                         'hidden': 0}]})[0]
        objectInfo = ServiceUtils.object_info_to_object(objectInfo)
        ws_info = self._completeNewNarrative(ws_info[0], objectInfo['id'],
                                             importData, is_temporary, title,
                                             len(narrativeObject['cells']))
        return {'workspaceInfo': ServiceUtils.workspace_info_to_object(ws_info),
                'narrativeInfo': objectInfo}

    def _fetchNarrativeObjects(self, workspaceName, cells, parameters,
                               includeIntroCell, title):
        if not cells:
            cells = []
        if not title:
            title = 'Untitled'

        # fetch specs
        appSpecIds = []
        methodSpecIds = []
        specMapping = {'apps': {}, 'methods': {}}
        for cell in cells:
            if 'app' in cell:
                appSpecIds.append(cell['app'])
            elif 'method' in cell:
                methodSpecIds.append(cell['method'])
        nms = NarrativeMethodStore(self.narrativeMethodStoreURL, token=self.token)
        if len(appSpecIds) > 0:
            appSpecs = nms.get_app_spec({'ids': appSpecIds})
            for spec in appSpecs:
                spec_id = spec['info']['id']
                specMapping['apps'][spec_id] = spec
        if len(methodSpecIds) > 0:
            methodSpecs = nms.get_method_spec({'ids': methodSpecIds})
            for spec in methodSpecs:
                spec_id = spec['info']['id']
                specMapping['methods'][spec_id] = spec
        # end of fetching specs

        metadata = {'job_ids': {'methods': [],
                                'apps': [],
                                'job_usage': {'queue_time': 0, 'run_time': 0}},
                    'format': 'ipynb',
                    'creator': self.user_id,
                    'ws_name': workspaceName,
                    'name': title,
                    'type': 'KBaseNarrative.Narrative',
                    'description': '',
                    'data_dependencies': []}
        cellData = self._gatherCellData(cells, specMapping, parameters,
                                        includeIntroCell)
        narrativeObject = {'nbformat_minor': 0,
                           'cells': cellData,
                           'metadata': metadata,
                           'nbformat': 4}
        metadataExternal = {}
        for key in metadata:
            value = metadata[key]
            if isinstance(value, str):
                metadataExternal[key] = value
            else:
                metadataExternal[key] = json.dumps(value)
        return [narrativeObject, metadataExternal]

    def _gatherCellData(self, cells, specMapping, parameters, includeIntroCell):
        cell_data = []
        if includeIntroCell == 1:
            cell_data.append({'cell_type': 'markdown',
                              'source': self._get_intro_markdown(),
                              'metadata': {}})
        for cell_pos, cell in enumerate(cells):
            if 'app' in cell:
                cell_data.append(self._buildAppCell(
                    len(cell_data), specMapping['apps'][cell['app']], parameters))
            elif 'method' in cell:
                cell_data.append(self._buildMethodCell(
                    len(cell_data), specMapping['methods'][cell['method']],
                    parameters))
            elif 'markdown' in cell:
                cell_data.append({'cell_type': 'markdown',
                                  'source': cell['markdown'],
                                  'metadata': {}})
            else:
                raise ValueError("cannot add cell #" + str(cell_pos) +
                                 ", unrecognized cell content")
        return cell_data

    def _buildAppCell(self, pos, spec, params):
        cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4())
        cell = {"cell_type": "markdown",
                "source": "<div id='" + cellId + "'></div>" +
                          "\n<script>" +
                          "$('#" + cellId + "').kbaseNarrativeAppCell({'appSpec' : '" +
                          self._safeJSONStringify(spec) + "', 'cellId' : '" +
                          cellId + "'});" +
                          "</script>",
                "metadata": {}}
        cellInfo = {}
        widgetState = []
        cellInfo[self.KB_TYPE] = self.KB_APP_CELL
        cellInfo['app'] = spec
        if params:
            steps = {}
            for param in params:
                stepid = 'step_' + str(param[0])
                if stepid not in steps:
                    steps[stepid] = {}
                    steps[stepid]['inputState'] = {}
                steps[stepid]['inputState'][param[1]] = param[2]
            state = {'state': {'step': steps}}
            widgetState.append(state)
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

    def _buildMethodCell(self, pos, spec, params):
        cellId = "kb-cell-" + str(pos) + "-" + str(uuid.uuid4())
        cell = {"cell_type": "markdown",
                "source": "<div id='" + cellId + "'></div>" +
                          "\n<script>" +
                          "$('#" + cellId + "').kbaseNarrativeMethodCell({'method' : '" +
                          self._safeJSONStringify(spec) + "'});" +
                          "</script>",
                "metadata": {}}
        cellInfo = {"method": spec,
                    "widget": spec["widgets"]["input"]}
        cellInfo[self.KB_TYPE] = self.KB_FUNCTION_CELL
        widgetState = []
        if params:
            wparams = {}
            for param in params:
                wparams[param[1]] = param[2]
            widgetState.append({"state": wparams})
        cellInfo[self.KB_STATE] = widgetState
        cell["metadata"][self.KB_CELL] = cellInfo
        return cell

    def _completeNewNarrative(self, workspaceId, objectId, importData,
                              is_temporary, title, num_cells):
        """
        'Completes' the new narrative by updating the workspace metadata with the
        required fields and copying in data from the importData list of references.
        """
        new_meta = {'narrative': str(objectId),
                    'is_temporary': is_temporary,
                    'searchtags': 'narrative',
                    'cell_count': str(num_cells)}
        if is_temporary == 'false' and title is not None:
            new_meta['narrative_nice_name'] = title

        self.ws.alter_workspace_metadata({'wsi': {'id': workspaceId},
                                          'new': new_meta})
        # copy_to_narrative:
        if importData:
            objectsToCopy = [{'ref': x} for x in importData]
            infoList = self.ws.get_object_info_new({'objects': objectsToCopy,
                                                    'includeMetadata': 0})
            for item in infoList:
                objectInfo = ServiceUtils.object_info_to_object(item)
                self.copy_object(objectInfo['ref'], workspaceId, None, None,
                                 objectInfo)
        return self.ws.get_workspace_info({'id': workspaceId})

    def _safeJSONStringify(self, obj):
        return json.dumps(self._safeJSONStringifyPrepare(obj))

    def _safeJSONStringifyPrepare(self, obj):
        if isinstance(obj, str):
            # escape quotes as HTML entities so the JSON can be embedded in markup
            return obj.replace("'", "&apos;").replace('"', "&quot;")
        elif isinstance(obj, list):
            for pos in range(len(obj)):
                obj[pos] = self._safeJSONStringifyPrepare(obj[pos])
        elif isinstance(obj, dict):
            obj_keys = list(obj.keys())
            for key in obj_keys:
                obj[key] = self._safeJSONStringifyPrepare(obj[key])
        else:
            pass  # it's boolean/int/float/None
        return obj

    def _get_workspace_name_or_id(self, ws_id, ws_name):
        ret = ws_name
        if not ret:
            ret = str(ws_id)
        return ret

    def copy_object(self, ref, target_ws_id, target_ws_name, target_name, src_info):
        """
        Copies an object from one workspace to another.
        """
        if not target_ws_id and not target_ws_name:
            raise ValueError("Neither target workspace id nor name is defined")
        if not src_info:
            src_info_tuple = self.ws.get_object_info_new({'objects': [{'ref': ref}],
                                                          'includeMetadata': 0})[0]
            src_info = ServiceUtils.object_info_to_object(src_info_tuple)
        if not target_name:
            target_name = src_info['name']
        obj_info_tuple = self.ws.copy_object({
            'from': {'ref': ref},
            'to': {'wsid': target_ws_id,
                   'workspace': target_ws_name,
                   'name': target_name}})
        obj_info = ServiceUtils.object_info_to_object(obj_info_tuple)
        return {'info': obj_info}

    def list_available_types(self, workspaces):
        data = self.list_objects_with_sets(workspaces=workspaces)['data']
        type_stat = {}
        for item in data:
            info = item['object_info']
            obj_type = info[2].split('-')[0]
            if obj_type in type_stat:
                type_stat[obj_type] += 1
            else:
                type_stat[obj_type] = 1
        return {'type_stat': type_stat}
class FunctionalEnrichmentUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for the given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_fe1_params(self, params):
        """
        _validate_run_fe1_params: validates params passed to the run_fe1 method
        """
        log('start validating run_fe1 params')
        # check for required parameters
        for p in ['feature_set_ref', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _generate_report(self, enrichment_map, result_directory, workspace_name,
                         feature_id_go_id_list_map, feature_set_ids, genome_ref,
                         go_id_parent_ids_map, feature_ids):
        """
        _generate_report: generate summary report
        """
        log('start creating report')
        output_files = self._generate_output_file_list(
            result_directory, enrichment_map, feature_id_go_id_list_map,
            feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids)
        output_html_files = self._generate_html_report(result_directory,
                                                       enrichment_map)
        report_object_name = 'kb_functional_enrichment_1_report_' + str(uuid.uuid4())
        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': report_object_name}
        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)
        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output
    def _generate_supporting_files(self, result_directory, enrichment_map,
                                   feature_id_go_id_list_map, feature_set_ids,
                                   genome_ref, go_id_parent_ids_map, feature_ids):
        """
        _generate_supporting_files: generate various debug files
        """
        supporting_files = list()

        feature_id_go_ids_map_file = os.path.join(result_directory,
                                                  'feature_id_go_ids_map.txt')
        go_id_genome_feature_ids_map_file = os.path.join(
            result_directory, 'go_id_genome_feature_ids_map.txt')
        go_id_set_feature_ids_map_file = os.path.join(
            result_directory, 'go_id_feature_set_feature_ids_map.txt')
        feature_ids_file = os.path.join(result_directory, 'feature_ids.txt')
        feature_set_ids_file = os.path.join(result_directory, 'feature_set_ids.txt')
        fisher_variables_file = os.path.join(result_directory, 'fisher_variables.txt')
        genome_info_file = os.path.join(result_directory, 'genome_info.txt')
        go_id_parent_ids_map_file = os.path.join(result_directory,
                                                 'go_id_parent_ids_map.txt')

        supporting_files.append(feature_id_go_ids_map_file)
        supporting_files.append(go_id_genome_feature_ids_map_file)
        supporting_files.append(feature_ids_file)
        supporting_files.append(feature_set_ids_file)
        supporting_files.append(fisher_variables_file)
        supporting_files.append(genome_info_file)
        supporting_files.append(go_id_parent_ids_map_file)
        supporting_files.append(go_id_set_feature_ids_map_file)

        total_feature_ids = list(feature_id_go_id_list_map.keys())
        feature_ids_with_feature = []
        for feature_id, go_ids in feature_id_go_id_list_map.items():
            if isinstance(go_ids, list):
                feature_ids_with_feature.append(feature_id)
        genome_name = self.ws.get_object_info3(
            {'objects': [{'ref': genome_ref}]})['infos'][0][1]

        with open(go_id_parent_ids_map_file, 'w') as go_id_parent_ids_map_file:
            for go_id, parent_ids in go_id_parent_ids_map.items():
                go_id_parent_ids_map_file.write(f'{go_id}: {", ".join(parent_ids)}\n')

        with open(genome_info_file, 'w') as genome_info_file:
            genome_info_file.write(f'genome_name: {genome_name}\n')
            genome_info_file.write(f'features: {len(total_feature_ids)}\n')
            genome_info_file.write(f'features with term: {len(feature_ids_with_feature)}')

        with open(feature_set_ids_file, 'w') as feature_set_ids_file:
            feature_set_ids_file.write('\n'.join(feature_set_ids))

        with open(feature_id_go_ids_map_file, 'w') as feature_id_go_ids_map_file:
            with open(feature_ids_file, 'w') as feature_ids_file:
                for feature_id, go_ids in feature_id_go_id_list_map.items():
                    feature_ids_file.write(
                        f'{feature_id} {feature_id in feature_set_ids}\n')
                    if isinstance(go_ids, str):
                        feature_id_go_ids_map_file.write(f'{feature_id} {go_ids}\n')
                    else:
                        feature_id_go_ids_map_file.write(
                            f'{feature_id} {", ".join(go_ids)}\n')

        with open(go_id_genome_feature_ids_map_file, 'w') as go_id_genome_feature_ids_map_file:
            with open(go_id_set_feature_ids_map_file, 'w') as go_id_set_feature_ids_map_file:
                with open(fisher_variables_file, 'w') as fisher_variables_file:
                    for go_id, go_info in enrichment_map.items():
                        mapped_features = go_info.get('mapped_features')
                        fs_mapped_features = list(
                            set(mapped_features).intersection(feature_set_ids))
                        mapped_features_line = f'{go_id}: {", ".join(mapped_features)}\n'
                        go_id_genome_feature_ids_map_file.write(mapped_features_line)
                        set_mapped_features_line = (
                            f'{go_id}: {", ".join(fs_mapped_features)}\n')
                        go_id_set_feature_ids_map_file.write(set_mapped_features_line)
                        a_value = go_info.get('num_in_subset_feature_set')
                        b_value = len(feature_set_ids) - a_value
                        c_value = len(mapped_features) - a_value
                        d_value = len(feature_ids) - len(feature_set_ids) - c_value
                        p_value = go_info.get('raw_p_value')
                        fisher_variables_file.write(
                            f'{go_id} a:{a_value} b:{b_value} c:{c_value} d:{d_value} ')
                        fisher_variables_file.write(f'p_value:{p_value}\n')

        result_file = os.path.join(result_directory, 'supporting_files.zip')
        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for supporting_file in supporting_files:
                zip_file.write(supporting_file, os.path.basename(supporting_file))

        return [{'path': result_file,
                 'name': os.path.basename(result_file),
                 'label': os.path.basename(result_file),
                 'description': 'GO term functional enrichment supporting files'}]

    def _generate_output_file_list(self, result_directory, enrichment_map,
                                   feature_id_go_id_list_map, feature_set_ids,
                                   genome_ref, go_id_parent_ids_map, feature_ids):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')
        output_files = list()

        result_file = os.path.join(result_directory, 'functional_enrichment.csv')
        with open(result_file, 'w') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['term_id', 'term', 'ontology', 'num_in_feature_set',
                             'num_in_ref_genome', 'raw_p_value', 'adjusted_p_value'])
            for key, value in enrichment_map.items():
                writer.writerow([key, value['go_term'], value['namespace'],
                                 value['num_in_subset_feature_set'],
                                 value['num_in_ref_genome'],
                                 value['raw_p_value'],
                                 value['adjusted_p_value']])
        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'GO term functional enrichment'})

        supporting_files = self._generate_supporting_files(
            result_directory, enrichment_map, feature_id_go_id_list_map,
            feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids)
        output_files += supporting_files

        return output_files
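    # The CSV column names written above double as the DictReader keys that
    # _generate_html_report reads back below, so the two methods must stay in
    # sync if columns are added or renamed.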
    def _generate_html_report(self, result_directory, enrichment_map):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        enrichment_table = ''
        data = csv.DictReader(
            open(os.path.join(result_directory, 'functional_enrichment.csv')),
            delimiter=',')
        sortedlist = sorted(data,
                            key=lambda row: (float(row['adjusted_p_value']),
                                             float(row['raw_p_value']),
                                             float(row['num_in_ref_genome'])),
                            reverse=False)

        for row in sortedlist:
            # if row['num_in_feature_set'] != '0':
            enrichment_table += f'<tr><td>{row["term_id"]}</td>'
            enrichment_table += f'<td>{row["term"]}</td>'
            enrichment_table += f'<td>{row["ontology"]}</td>'
            enrichment_table += f'<td>{row["num_in_feature_set"]}</td>'
            enrichment_table += f'<td>{row["num_in_ref_genome"]}</td>'
            enrichment_table += f'<td>{float(row["raw_p_value"]):.3g}</td>'
            enrichment_table += f'<td>{float(row["adjusted_p_value"]):.3g}</td></tr>'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template.html'), 'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr>Enrichment_Table</tr>',
                                                          enrichment_table)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']
        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Functional '
                                           'Enrichment App'})
        return html_report

    def _get_go_maps_from_genome(self, genome_ref):
        """
        _get_go_maps_from_genome: parse GO term mappings out of genome data
        """
        log('start parsing GO terms from genome')

        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
            'sort_by': [['feature_id', True]]})['features']

        feature_id_go_id_list_map = {}
        go_id_feature_id_list_map = {}
        go_id_go_term_map = {}
        feature_id_feature_info_map = {}

        for genome_feature in genome_features:
            feature_id = genome_feature.get('feature_id')
            feature_func = genome_feature.get('function')
            feature_type = genome_feature.get('feature_type')
            ontology_terms = genome_feature.get('ontology_terms')

            feature_id_feature_info_map.update(
                {feature_id: {'function': feature_func,
                              'feature_type': feature_type}})

            go_id_list = []
            if ontology_terms:
                for ontology_id, ontology_term in ontology_terms.items():
                    if re.match(r'[gG][oO]:.*', ontology_id):
                        go_id_go_term_map.update({ontology_id: ontology_term})
                        go_id_list.append(ontology_id)

            if go_id_list:
                feature_id_go_id_list_map.update({feature_id: go_id_list})
                for go_id in go_id_list:
                    if go_id in go_id_feature_id_list_map:
                        feature_ids = go_id_feature_id_list_map.get(go_id)
                        feature_ids.append(feature_id)
                        go_id_feature_id_list_map.update({go_id: feature_ids})
                    else:
                        go_id_feature_id_list_map.update({go_id: [feature_id]})
            else:
                feature_id_go_id_list_map.update({feature_id: 'Unlabeled'})

        return (feature_id_go_id_list_map, go_id_feature_id_list_map,
                go_id_go_term_map, feature_id_feature_info_map)

    def _process_feature_set(self, feature_set_ref):
        """
        _process_feature_set: process the FeatureSet object

        return:
        feature_set_ids: FeatureSet feature ids
        genome_ref: reference Genome object ref
        """
        log('start processing FeatureSet object')

        feature_set_data = self.ws.get_objects2(
            {'objects': [{'ref': feature_set_ref}]})['data'][0]['data']
        feature_elements = feature_set_data['elements']
        feature_set_ids = []
        genome_ref_array = []
        for feature_id, genome_refs in feature_elements.items():
            feature_set_ids.append(feature_id)
            genome_ref_array += genome_refs

        if len(set(genome_ref_array)) > 1:
            error_msg = 'FeatureSet has multiple reference Genomes: {}'.format(
                genome_ref_array)
            raise ValueError(error_msg)

        return feature_set_ids, genome_ref_array[0]

    def _get_immediate_parents(self, ontology_hash, go_id, is_a_relationship,
                               regulates_relationship, part_of_relationship):
        """
        _get_immediate_parents: get the immediate parent go_ids for a given go_id
        """
        parent_ids = []
        ontology_info = ontology_hash.get(go_id, {})

        if is_a_relationship:
            is_a_parents = ontology_info.get('is_a')
            if is_a_parents:
                for parent_string in is_a_parents:
                    # each entry looks like 'GO:0000000 ! parent term name'
                    is_a_parent_id = parent_string.split('!')[0][:-1]
                    parent_ids.append(is_a_parent_id)

        if regulates_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'regulates':
                        parent_ids.append(relationship_string.split(' ')[1])

        if part_of_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'part_of':
                        parent_ids.append(relationship_string.split(' ')[1])

        return parent_ids

    def _fetch_all_parents_go_ids(self, ontology_hash, go_id, is_a_relationship,
                                  regulates_relationship, part_of_relationship):
        """
        _fetch_all_parents_go_ids: recursively fetch all parent go_ids
        """
        parent_ids = self._get_immediate_parents(ontology_hash, go_id,
                                                 is_a_relationship,
                                                 regulates_relationship,
                                                 part_of_relationship)
        if parent_ids:
            # copy the list so we don't mutate it while iterating over it below;
            # the recursion already collects each parent's full ancestor set
            grand_parent_ids = list(parent_ids)
            for parent_id in parent_ids:
                grand_parent_ids += self._fetch_all_parents_go_ids(
                    ontology_hash, parent_id, is_a_relationship,
                    regulates_relationship, part_of_relationship)[parent_id]
            return {go_id: list(set(grand_parent_ids))}
        else:
            return {go_id: []}

    def _generate_parent_child_map(self, ontology_hash, go_ids,
                                   is_a_relationship=True,
                                   regulates_relationship=True,
                                   part_of_relationship=False):
        """
        _generate_parent_child_map: fetch parent go_ids for the given go_ids
        """
        log('start fetching parent go_ids')
        start = time.time()

        go_id_parent_ids_map = {}
        for go_id in go_ids:
            fetch_result = self._fetch_all_parents_go_ids(ontology_hash, go_id,
                                                          is_a_relationship,
                                                          regulates_relationship,
                                                          part_of_relationship)
            go_id_parent_ids_map.update(fetch_result)

        end = time.time()
        print(f'used {end - start:.2f} s')

        return go_id_parent_ids_map

    def _round(self, number, digits=3):
        """
        _round: format the number to the given number of significant digits
        (returned as a string)
        """
        round_number = format(number, f'.{digits}g')
        return round_number

    def __init__(self, config):
        self.ws_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

    def run_fe1(self, params):
        """
        run_fe1: Functional Enrichment One

        required params:
        feature_set_ref: FeatureSet object reference
        workspace_name: the name of the workspace it gets saved to

        optional params:
        propagation: includes is_a relationship to all go terms (default is 1)
        filter_ref_features: filter reference genome features with no go terms
                             (default is 0)
        statistical_significance: parameter for statistical significance.
                                  Select one of left_tailed, right_tailed or
                                  two_tailed (default is left_tailed)
        ignore_go_term_not_in_feature_set: ignore GO term analysis if the term
                                           is not associated with the FeatureSet
                                           (default is 1)

        return:
        result_directory: folder path that holds all files generated by run_fe1
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning FunctionalEnrichmentUtil.run_fe1\n' +
            f'params:\n{json.dumps(params, indent=1)}')

        self._validate_run_fe1_params(params)
        propagation = params.get('propagation', True)
        filter_ref_features = params.get('filter_ref_features', False)
        statistical_significance = params.get('statistical_significance',
                                              'left_tailed')
        ignore_go_term_not_in_feature_set = params.get(
            'ignore_go_term_not_in_feature_set', True)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        feature_set_ids, genome_ref = self._process_feature_set(
            params.get('feature_set_ref'))

        (feature_id_go_id_list_map, go_id_feature_id_list_map,
         go_id_go_term_map,
         feature_id_feature_info_map) = self._get_go_maps_from_genome(genome_ref)

        if not len(feature_id_go_id_list_map):
            raise ValueError(
                "No features in the referenced genome ({}) contain ontology mappings"
                .format(genome_ref))
        unknown_feature_ids = set(feature_set_ids) - set(
            feature_id_feature_info_map.keys())
        if unknown_feature_ids:
            raise ValueError(
                "The specified feature set contains {} feature ids which are not "
                "present in the referenced genome ({})".format(
                    len(unknown_feature_ids), genome_ref))

        if filter_ref_features:
            log('start filtering features with no term')
            feature_ids = []
            for feature_id, go_ids in feature_id_go_id_list_map.items():
                if isinstance(go_ids, list):
                    feature_ids.append(feature_id)
        else:
            feature_ids = list(feature_id_go_id_list_map.keys())

        ontology_hash = dict()
        ontologies = self.ws.get_objects([{'workspace': 'KBaseOntology',
                                           'name': 'gene_ontology'},
                                          {'workspace': 'KBaseOntology',
                                           'name': 'plant_ontology'}])
        ontology_hash.update(ontologies[0]['data']['term_hash'])
        ontology_hash.update(ontologies[1]['data']['term_hash'])

        if propagation:
            go_id_parent_ids_map = self._generate_parent_child_map(
                ontology_hash,
                list(go_id_go_term_map.keys()),
                regulates_relationship=False)
        else:
            go_id_parent_ids_map = {}
            for go_id in go_id_go_term_map.keys():
                go_id_parent_ids_map.update({go_id: []})

        log('including parents to feature id map')
        for go_id, parent_ids in go_id_parent_ids_map.items():
            mapped_features = go_id_feature_id_list_map.get(go_id)
            for parent_id in parent_ids:
                parent_mapped_features = go_id_feature_id_list_map.get(parent_id)
                if not parent_mapped_features:
                    parent_mapped_features = []
                if mapped_features:
                    parent_mapped_features += mapped_features
                go_id_feature_id_list_map.update(
                    {parent_id: list(set(parent_mapped_features))})

        log('start calculating p-values')
        enrichment_map = {}
        go_info_map = {}
        all_raw_p_value = []
        pos = 0
        for go_id, go_term in go_id_go_term_map.items():
            mapped_features = go_id_feature_id_list_map.get(go_id)
            # in feature_set and matches go_id
            a = len(set(mapped_features).intersection(feature_set_ids))
            # ignore GO term analysis if the term is not associated with the FeatureSet
            if ignore_go_term_not_in_feature_set and a == 0:
                continue
            # in feature_set and doesn't match go_id
            b = len(feature_set_ids) - a
            # not in feature_set and matches go_id
            c = len(mapped_features) - a
            # not in feature_set and doesn't match go_id
            d = len(feature_ids) - len(feature_set_ids) - c

            fisher_value = fisher.pvalue(a, b, c, d)
            if statistical_significance == 'left_tailed':
                raw_p_value = self._round(fisher_value.left_tail)
            elif statistical_significance == 'right_tailed':
                raw_p_value = self._round(fisher_value.right_tail)
            elif statistical_significance == 'two_tailed':
                raw_p_value = self._round(fisher_value.two_tail)
            else:
                raise ValueError('Improper statistical_significance value')

            all_raw_p_value.append(raw_p_value)
            go_info_map.update({go_id: {'raw_p_value': raw_p_value,
                                        'num_in_ref_genome': len(mapped_features),
                                        'num_in_subset_feature_set': a,
                                        'pos': pos,
                                        'mapped_features': mapped_features}})
            pos += 1

        stats = importr('stats')
        adjusted_p_values = stats.p_adjust(FloatVector(all_raw_p_value),
                                           method='fdr')

        for go_id, go_info in go_info_map.items():
            if go_id not in ontology_hash:
                continue
            adjusted_p_value = self._round(adjusted_p_values[go_info.get('pos')])
            namespace = ontology_hash[go_id]['namespace']
            enrichment_map.update(
                {go_id: {'raw_p_value': go_info.get('raw_p_value'),
                         'adjusted_p_value': adjusted_p_value,
                         'num_in_ref_genome': go_info.get('num_in_ref_genome'),
                         'num_in_subset_feature_set':
                             go_info.get('num_in_subset_feature_set'),
                         'go_term': go_id_go_term_map.get(go_id),
                         'namespace': namespace.split("_")[1][0].upper(),
                         'mapped_features': go_info.get('mapped_features')}})

        returnVal = {'result_directory': result_directory}
        report_output = self._generate_report(enrichment_map, result_directory,
                                              params.get('workspace_name'),
                                              feature_id_go_id_list_map,
                                              feature_set_ids, genome_ref,
                                              go_id_parent_ids_map, feature_ids)
        returnVal.update(report_output)

        return returnVal
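
# A minimal, illustrative sketch (not part of the service code) of the 2x2
# contingency table that run_fe1 assembles for each GO term before calling
# fisher.pvalue. All names and values here are hypothetical examples.
def _example_fisher_table():
    """Rebuild the (a, b, c, d) counts for one GO term, as run_fe1 does."""
    feature_set_ids = ['f1', 'f2', 'f3']          # features in the FeatureSet
    mapped_features = ['f1', 'f2', 'f4']          # genome features carrying the GO term
    feature_ids = ['f1', 'f2', 'f3', 'f4', 'f5']  # all (optionally filtered) features

    a = len(set(mapped_features).intersection(feature_set_ids))  # in set, has term
    b = len(feature_set_ids) - a                                 # in set, lacks term
    c = len(mapped_features) - a                                 # outside set, has term
    d = len(feature_ids) - len(feature_set_ids) - c              # outside set, lacks term
    return a, b, c, d  # here: (2, 1, 1, 1)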