class ImportPhenotypeSetUtil:

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_phenotype_set_from_staging(self, params):
        '''
        import_phenotype_set_from_staging: wrapper method for
        fba_tools.tsv_file_to_phenotype_set

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        phenotype_set_name: output PhenotypeSet object name
        workspace_name: workspace name/ID of the object
        genome: Genome object that contains features referenced by the Phenotype Set

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportPhenotypeSetUtil.import_phenotype_set_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_phenotype_set_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        file = {'path': scratch_file_path}
        import_phenotype_set_params = params.copy()
        import_phenotype_set_params['phenotype_set_file'] = file

        ref = self.fba.tsv_file_to_phenotype_set(import_phenotype_set_params)

        # update the staging-area metadata for the imported file
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'),
                                                   ref.get('ref'))

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def validate_import_phenotype_set_from_staging_params(self, params):
        """
        validate_import_phenotype_set_from_staging_params:
            validates params passed to import_phenotype_set_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name',
                  'phenotype_set_name', 'genome']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference
                 (return of import_phenotype_set_from_staging)
        params:
            staging_file_subdir_path: subdirectory file path
              e.g.
              for file: /data/bulk/user_name/file_name
              staging_file_subdir_path is file_name
              for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
              staging_file_subdir_path is subdir_1/subdir_2/file_name
            workspace_name: workspace name/ID that the object will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }
        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Phenotype Set Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(params.get('staging_file_subdir_path'))

        report_params = {'message': upload_message,
                         'objects_created': [{'ref': obj_ref,
                                              'description': 'Imported Phenotype Set'}],
                         'workspace_name': params.get('workspace_name'),
                         'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
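# Illustrative usage sketch only (not part of the module). It assumes the same
# config keys that __init__ above reads (SDK_CALLBACK_URL, KB_AUTH_TOKEN, plus
# whatever UploaderUtil requires); the staging path and object names are hypothetical.
def _example_import_phenotype_set(config):
    """Import a phenotype TSV from the staging area and build the summary report."""
    importer = ImportPhenotypeSetUtil(config)
    params = {
        'staging_file_subdir_path': 'subdir_1/phenotypes.tsv',  # hypothetical staging file
        'phenotype_set_name': 'my_phenotype_set',
        'workspace_name': 'my_workspace',
        'genome': 'my_genome',
    }
    result = importer.import_phenotype_set_from_staging(params)
    return importer.generate_report(result['obj_ref'], params)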
class CompoundSetUtils: ''' Module Name: CompoundSetUtils Module Description: A KBase module: CompoundSetUtils Contains tools for import & export of compound sets ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "2.1.2" GIT_URL = "https://github.com/Tianhao-Gu/CompoundSetUtils.git" GIT_COMMIT_HASH = "12e1f23022354f475d7ceb3631913956eb5831a7" #BEGIN_CLASS_HEADER @staticmethod def _check_param(in_params, req_param, opt_param=list()): """ Check if each of the params in the list are in the input params """ for param in req_param: if param not in in_params: raise ValueError('{} parameter is required'.format(param)) defined_param = set(req_param + opt_param) for param in in_params: if param not in defined_param: logging.warning( "Received unexpected parameter {}".format(param)) def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None): """Save compound set to the workspace and make report""" info = self.dfu.save_objects({ 'id': ws_id, "objects": [{ "type": "KBaseBiochem.CompoundSet", "data": compoundset, "name": compoundset['name'] }] })[0] compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4]) if not message: message = 'Imported %s as %s' % (source, info[1]) report_params = { 'objects_created': [{ 'ref': compoundset_ref, 'description': 'Compound Set' }], 'message': message, 'workspace_name': info[7], 'report_object_name': 'compound_set_creation_report' } # Construct the output to send back report_client = KBaseReport(self.callback_url) report_info = report_client.create_extended_report(report_params) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], 'compoundset_ref': compoundset_ref } return output def _export_compound_set(self, ref, file_type): logging.info("Exporting {} as {}".format(ref, file_type)) compoundset = self.dfu.get_objects({'object_refs': [ref]})['data'][0]['data'] temp_dir = "{}/{}".format(self.scratch, uuid.uuid4()) os.mkdir(temp_dir) out_dir = "{}/{}".format(temp_dir, compoundset['name']) os.mkdir(out_dir) target = "{}/{}.{}".format(out_dir, compoundset['name'], file_type) if file_type == 'tsv': parse.write_tsv(compoundset, target) elif file_type == 'sdf': parse.write_sdf(compoundset, target) else: raise ValueError("Bad file_type: {}".format(file_type)) handle = self.dfu.package_for_download({ 'file_path': out_dir, 'ws_refs': [ref] }) output = {'shock_id': handle['shock_id']} return output def _fetch_mol2_files(self, ref): compoundset_obj = self.dfu.get_objects({'object_refs': [ref]})['data'][0] compoundset_info = compoundset_obj['info'] compoundset = compoundset_obj['data'] temp_dir = "{}/{}".format(self.scratch, uuid.uuid4()) os.mkdir(temp_dir) compounds = compoundset.get('compounds') mol2_files = [] comp_id_mol2_file_name_map = {} for compound in compounds: mol2_handle_ref = compound.get('mol2_handle_ref') if mol2_handle_ref: mol2_file_path = self.dfu.shock_to_file({ 'handle_id': mol2_handle_ref, 'file_path': temp_dir }).get('file_path') mol2_files.append(mol2_file_path) comp_id_mol2_file_name_map[compound['id']] = os.path.basename( mol2_file_path) packed_mol2_files_path = None if mol2_files: packed_mol2_files_path = os.path.join( temp_dir, compoundset_info[1] + '_mol2_files.zip') with zipfile.ZipFile(packed_mol2_files_path, 
'w') as zipMe: for file in mol2_files: zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED) return packed_mol2_files_path, comp_id_mol2_file_name_map def _covert_mol2_files_to_pdbqt(self, ref): compoundset_obj = self.dfu.get_objects({'object_refs': [ref]})['data'][0] compoundset_info = compoundset_obj['info'] compoundset = compoundset_obj['data'] mol2_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4()) os.mkdir(mol2_temp_dir) pdbqt_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4()) os.mkdir(pdbqt_temp_dir) compounds = compoundset.get('compounds') pdbqt_files = [] comp_id_pdbqt_file_name_map = {} for compound in compounds: mol2_handle_ref = compound.get('mol2_handle_ref') if mol2_handle_ref: mol2_file_path = self.dfu.shock_to_file({ 'handle_id': mol2_handle_ref, 'file_path': mol2_temp_dir }).get('file_path') pdbqt_file_path = os.path.join(pdbqt_temp_dir, compound['id'] + '.pdbqt') command = [ 'obabel', '-i', 'mol2', mol2_file_path, '-o', 'pdbqt', '-O', pdbqt_file_path ] process = Popen(command, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() if 'converted' in str(stderr) and 'molecule' in str(stderr): logging.info( 'Successfully converted Mol2 to pdbqt format: {}'. format(os.path.basename(mol2_file_path))) pdbqt_files.append(pdbqt_file_path) comp_id_pdbqt_file_name_map[ compound['id']] = os.path.basename(pdbqt_file_path) else: logging.warning( 'Cannot convert Mol2 file to pdbqt format: {}'.format( os.path.basename(mol2_file_path))) logging.warning(stderr) packed_pdbqt_files_path = None if pdbqt_files: packed_pdbqt_files_path = os.path.join( pdbqt_temp_dir, compoundset_info[1] + '_pdbqt_files.zip') with zipfile.ZipFile(packed_pdbqt_files_path, 'w') as zipMe: for file in pdbqt_files: zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED) return packed_pdbqt_files_path, comp_id_pdbqt_file_name_map #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) #END_CONSTRUCTOR pass def compound_set_from_file(self, ctx, params): """ CompoundSetFromFile string staging_file_path :param params: instance of type "compoundset_upload_params" -> structure: parameter "workspace_id" of String, parameter "staging_file_path" of String, parameter "compound_set_name" of String, parameter "mol2_staging_file_path" of String :returns: instance of type "compoundset_upload_results" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "compoundset_ref" of type "obj_ref" """ # ctx is the context object # return variables are: output #BEGIN compound_set_from_file self._check_param( params, ['workspace_id', 'staging_file_path', 'compound_set_name'], opt_param=['mol2_staging_file_path']) scratch_file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params['staging_file_path'] }).get('copy_file_path') # I probably should be uploading the raw files to shock mol2_staging_file_path = params.get('mol2_staging_file_path') mol2_file_dir = None if mol2_staging_file_path: mol2_scratch_file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': mol2_staging_file_path }).get('copy_file_path') try: logging.info("start unpacking mol2 file") mol2_file_path_out = self.dfu.unpack_file( {'file_path': mol2_scratch_file_path})['file_path'] mol2_file_dir = os.path.dirname(mol2_file_path_out) except Exception: 
raise ValueError('Cannot unpack mol2 file: {}'.format( os.path.basename(mol2_file_path_out))) ext = os.path.splitext(scratch_file_path)[1] file_name = os.path.basename(scratch_file_path) if ext == '.sdf': compounds = parse.read_sdf(scratch_file_path, mol2_file_dir=mol2_file_dir, callback_url=self.callback_url) elif ext == '.tsv': compounds = parse.read_tsv(scratch_file_path, mol2_file_dir=mol2_file_dir, callback_url=self.callback_url) else: raise ValueError('Invalid input file type. Expects .tsv or .sdf') compoundset = { 'id': params['compound_set_name'], 'name': params['compound_set_name'], 'description': 'Compound Set produced from %s' % file_name, 'compounds': compounds, } output = self._save_to_ws_and_report(params['workspace_id'], params['staging_file_path'], compoundset) #END compound_set_from_file # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method compound_set_from_file return value ' + 'output is not type dict as required.') # return the results return [output] def compound_set_to_file(self, ctx, params): """ CompoundSetToFile string compound_set_name string output_format :param params: instance of type "compoundset_download_params" -> structure: parameter "compound_set_ref" of String, parameter "output_format" of String :returns: instance of type "compoundset_download_results" -> structure: parameter "file_path" of String, parameter "packed_mol2_files_path" of String, parameter "comp_id_mol2_file_name_map" of mapping from String to String """ # ctx is the context object # return variables are: output #BEGIN compound_set_to_file self._check_param(params, ['compound_set_ref', 'output_format']) ret = self.dfu.get_objects( {'object_refs': [params['compound_set_ref']]})['data'][0] compoundset = ret['data'] ext = params['output_format'] out = f"{self.scratch}/{uuid.uuid4()}" os.mkdir(out) out += f"/{compoundset['name']}" if ext == 'sdf': outfile_path = parse.write_sdf(compoundset, out) elif ext == 'tsv': outfile_path = parse.write_tsv(compoundset, out) else: outfile_path = parse.write_mol_dir(compoundset, out, ext) packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files( params['compound_set_ref']) output = { 'file_path': outfile_path, 'packed_mol2_files_path': packed_mol2_files_path, 'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map } #END compound_set_to_file # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method compound_set_to_file return value ' + 'output is not type dict as required.') # return the results return [output] def compound_set_from_model(self, ctx, params): """ CompoundSetFromModel required: string workspace_id string model_ref string compound_set_name :param params: instance of type "compoundset_from_model_params" -> structure: parameter "workspace_id" of String, parameter "model_ref" of String, parameter "compound_set_name" of String :returns: instance of type "compoundset_upload_results" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "compoundset_ref" of type "obj_ref" """ # ctx is the context object # return variables are: output #BEGIN compound_set_from_model self._check_param(params, ['workspace_id', 'model_ref', 'compound_set_name']) model = self.dfu.get_objects({'object_refs': [params['model_ref']] })['data'][0]['data'] compounds, undef = parse.parse_model(model) compoundset = { 'id': params['compound_set_name'], 'name': params['compound_set_name'], 'description': 'Compound Set produced from %s, a metabolic model' % model['id'], 'compounds': compounds, } output = self._save_to_ws_and_report(params['workspace_id'], model['name'], compoundset) #END compound_set_from_model # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method compound_set_from_model return value ' + 'output is not type dict as required.') # return the results return [output] def export_compoundset_as_tsv(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_compoundset_as_tsv output = self._export_compound_set(params['input_ref'], 'tsv') #END export_compoundset_as_tsv # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_compoundset_as_tsv return value ' + 'output is not type dict as required.') # return the results return [output] def export_compoundset_as_sdf(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_compoundset_as_sdf output = self._export_compound_set(params['input_ref'], 'sdf') #END export_compoundset_as_sdf # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method export_compoundset_as_sdf return value ' + 'output is not type dict as required.') # return the results return [output] def export_compoundset_mol2_files(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "export_mol2_files_results" -> structure: parameter "packed_mol2_files_path" of String, parameter "comp_id_mol2_file_name_map" of mapping from String to String """ # ctx is the context object # return variables are: output #BEGIN export_compoundset_mol2_files self._check_param(params, ['input_ref']) packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files( params['input_ref']) output = { 'packed_mol2_files_path': packed_mol2_files_path, 'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map } #END export_compoundset_mol2_files # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method export_compoundset_mol2_files return value ' + 'output is not type dict as required.') # return the results return [output] def convert_compoundset_mol2_files_to_pdbqt(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "convert_mol2_files_results" -> structure: parameter "packed_pdbqt_files_path" of String, parameter "comp_id_pdbqt_file_name_map" of mapping from String to String """ # ctx is the context object # return variables are: output #BEGIN convert_compoundset_mol2_files_to_pdbqt self._check_param(params, ['input_ref']) packed_pdbqt_files_path, comp_id_pdbqt_file_name_map = self._covert_mol2_files_to_pdbqt( params['input_ref']) output = { 'packed_pdbqt_files_path': packed_pdbqt_files_path, 'comp_id_pdbqt_file_name_map': comp_id_pdbqt_file_name_map } #END convert_compoundset_mol2_files_to_pdbqt # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError( 'Method convert_compoundset_mol2_files_to_pdbqt return value ' + 'output is not type dict as required.') # return the results return [output] def fetch_mol2_files_from_zinc(self, ctx, params): """ :param params: instance of type "FetchZINCMol2Params" -> structure: parameter "workspace_id" of String, parameter "compoundset_ref" of type "obj_ref", parameter "over_write" of Long :returns: instance of type "compoundset_upload_results" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "compoundset_ref" of type "obj_ref" """ # ctx is the context object # return variables are: output #BEGIN fetch_mol2_files_from_zinc self._check_param(params, ['workspace_id', 'compoundset_ref'], opt_param=['over_write']) over_write = params.get('over_write', False) compoundset = self.dfu.get_objects( {'object_refs': [params['compoundset_ref']]})['data'][0]['data'] compoundset_copy = copy.deepcopy(compoundset) count = 0 for compound in compoundset_copy.get('compounds'): if not compound.get('mol2_handle_ref') or over_write: temp_dir = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(temp_dir) mol2_file_path = os.path.join(temp_dir, compound.get('id')) inchikey = compound.get('inchikey') if zinc_db_util.inchikey_to_mol2(inchikey, mol2_file_path): handle_id = self.dfu.file_to_shock({ 'file_path': mol2_file_path, 'make_handle': True })['handle']['hid'] compound['mol2_handle_ref'] = handle_id compound['mol2_source'] = 'ZINC15' count += 1 else: logging.warning( 'Cannot find Mol2 file from ZINC for {}'.format( inchikey)) if count: message = 'Successfully fetched {} Mol2 files from ZINC database'.format( count) else: message = 'Fetched 0 Mol2 files from ZINC database. The CompoundSet object remains unchanged.' output = self._save_to_ws_and_report(params['workspace_id'], '', compoundset_copy, message=message) #END fetch_mol2_files_from_zinc # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method fetch_mol2_files_from_zinc return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
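# Illustrative driver sketch only (not part of the module): `impl` is a
# CompoundSetUtils instance and `ctx` the usual SDK context object; the workspace
# id, staging file name, and output format are hypothetical placeholders.
def _example_compound_set_round_trip(impl, ctx, workspace_id):
    """Upload a staging TSV as a CompoundSet, then export it back out as SDF."""
    upload = impl.compound_set_from_file(ctx, {
        'workspace_id': workspace_id,
        'staging_file_path': 'compounds.tsv',        # hypothetical staging file
        'compound_set_name': 'example_compound_set',
    })[0]
    download = impl.compound_set_to_file(ctx, {
        'compound_set_ref': upload['compoundset_ref'],
        'output_format': 'sdf',
    })[0]
    return upload, download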
class AttributesUtil: def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.srv_wiz_url = config['srv-wiz-url'] self.scratch = config['scratch'] self.dfu = DataFileUtil(self.callback_url) self.kbse = KBaseSearchEngine(config['search-url']) self.data_util = DataUtil(config) self.wsClient = workspaceService(self.ws_url, token=self.token) self.DEFAULT_ONTOLOGY_ID = "Custom:Term" self.DEFAULT_UNIT_ID = "Custom:Unit" self.ONT_LABEL_DEL = " - " self.ONT_TERM_DEL = ":" @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def file_to_attribute_mapping(self, params): """Convert a user supplied file to a compound set""" if 'input_file_path' in params: scratch_file_path = params['input_file_path'] elif 'input_shock_id' in params: scratch_file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') else: raise ValueError( "Must supply either a input_shock_id or input_file_path") attr_mapping = self._file_to_am_obj(scratch_file_path) info = self.dfu.save_objects({ "id": params['output_ws_id'], "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": attr_mapping, "name": params['output_obj_name'] }] })[0] return { "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4]) } def append_file_to_attribute_mapping(self, staging_file_subdir_path, old_am_ref, output_ws_id, new_am_name=None): """append an attribute mapping file to existing attribute mapping object """ download_staging_file_params = { 'staging_file_subdir_path': staging_file_subdir_path } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') append_am_data = self._file_to_am_obj(scratch_file_path) old_am_obj = self.dfu.get_objects({'object_refs': [old_am_ref]})['data'][0] old_am_info = old_am_obj['info'] old_am_name = old_am_info[1] old_am_data = old_am_obj['data'] new_am_data = self._check_and_append_am_data(old_am_data, append_am_data) if not new_am_name: current_time = time.localtime() new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d', current_time) info = self.dfu.save_objects({ "id": output_ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": new_am_data, "name": new_am_name }] })[0] return { "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4]) } def update_matrix_attribute_mapping(self, params): dimension = params.get('dimension') if dimension not in ['col', 'row']: raise ValueError('Please use "col" or "row" for input dimension') workspace_name = params.get('workspace_name') old_matrix_ref = params.get('input_matrix_ref') old_matrix_obj = self.dfu.get_objects( {'object_refs': [old_matrix_ref]})['data'][0] old_matrix_info = old_matrix_obj['info'] old_matrix_data = old_matrix_obj['data'] old_am_ref = old_matrix_data.get( '{}_attributemapping_ref'.format(dimension)) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = 
workspace_name if not old_am_ref: raise ValueError( 'Matrix object does not have {} attribute mapping'.format( dimension)) new_am_ref = self.append_file_to_attribute_mapping( params['staging_file_subdir_path'], old_am_ref, workspace_id, params['output_am_obj_name'])['attribute_mapping_ref'] old_matrix_data['{}_attributemapping_ref'.format( dimension)] = new_am_ref info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "type": old_matrix_info[2], "data": old_matrix_data, "name": params['output_matrix_obj_name'] }] })[0] new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) objects_created = [{ 'ref': new_am_ref, 'description': 'Updated Attribute Mapping' }, { 'ref': new_matrix_obj_ref, 'description': 'Updated Matrix' }] report_params = { 'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) return { 'new_matrix_obj_ref': new_matrix_obj_ref, 'new_attribute_mapping_ref': new_am_ref, 'report_name': output['name'], 'report_ref': output['ref'] } def _check_and_append_am_data(self, old_am_data, append_am_data): exclude_keys = {'attributes', 'instances'} new_am_data = { k: old_am_data[k] for k in set(list(old_am_data.keys())) - exclude_keys } old_attrs = old_am_data.get('attributes') old_insts = old_am_data.get('instances') append_attrs = append_am_data.get('attributes') append_insts = append_am_data.get('instances') # checking duplicate attributes old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs] append_attrs_names = [ append_attr.get('attribute') for append_attr in append_attrs ] duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names) if duplicate_attrs: error_msg = 'Duplicate attribute mappings: [{}]'.format( duplicate_attrs) raise ValueError(error_msg) # checking missing instances missing_inst = old_insts.keys() - append_insts.keys() if missing_inst: error_msg = 'Appended attribute mapping misses [{}] instances'.format( missing_inst) raise ValueError(error_msg) new_attrs = old_attrs + append_attrs new_am_data['attributes'] = new_attrs new_insts = deepcopy(old_insts) for inst_name, val in new_insts.items(): append_val = append_insts.get(inst_name) val.extend(append_val) new_am_data['instances'] = new_insts return new_am_data def _am_data_to_df(self, data): """ Converts a compound set object data to a dataframe """ attributes = pd.DataFrame(data['attributes']) attributes.rename(columns=lambda x: x.replace("ont", "ontology"). 
                              capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data(
            {'obj_ref': original_matrix_ref}).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [list(cluster.get('id_to_data_position').keys())
                        for cluster in clusters]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]
        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = '[{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            # raise with the message that was just built, not the literal string "err_msg"
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to an attribute mapping object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError("Unable to find any instance columns in supplied file")

        attribute_df.rename(columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
                            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError("Unable to find an 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id', 'unit_ont_id', 'source')
        attributes = attribute_df.filter(items=attribute_fields).to_dict('records')
        print(attributes)
        self._validate_attribute_values(am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {'ontology_mapping_method': "User Curation",
                             'attributes': [self._add_ontology_info(f) for f in attributes],
                             'instances': instance_df.to_dict('list')}

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {'Raw Data File', 'Derived Data File', 'Array Data File', 'Image File'}
        if 'Sample Name' in isa_df.columns and not any(isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Name', 'Assay Name' and {isa_df.columns[0]}")

        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {'ontology_mapping_method': "User Curation - ISA format"}
        attribute_mapping['attributes'], new_skip_cols = self._get_attributes_from_isa(
            isa_df, skip_columns)

        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                # prepend the header to the joined error list instead of using it
                # as the join separator
                logging.error(f'Attribute {attr} had the following validation errors:\n'
                              + "\n".join(attr_errors) + '\n')
            raise ValueError(f'The following attributes failed validation: '
                             f'{", ".join(errors)}\nSee the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term accession is a web link, only grab the last bit.
                # Similarly, sometimes the number is prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError("More than one unit column is supplied for attribute {}"
                                     .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({"attribute": split_col[0],
                                       "attribute_ont_id": split_col[1],
                                       "source": "upload"})
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[attribute['attribute'],
                                      attribute.pop('_val_ont'),
                                      attribute.pop('_val_accession')]].drop_duplicates()
                category_df['attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                    category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[
                    ['value', 'attribute_ont_id']].to_dict('index')
                attribute['categories'] = {k: self._add_ontology_info(v)
                                           for k, v in cats.items()}

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError("More than one unit type is supplied for attribute {}: {}"
                                     .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
            if '_unit_ont' in attribute:
                unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                    isa_df[attribute.pop('_unit_accession')], ":").unique()
                # check the unit ontology values themselves, not the unit labels
                if len(unit_ont) > 1:
                    raise ValueError("More than one unit ontology is supplied for attribute "
                                     "{}: {}".format(attribute['attribute'], unit_ont))
attribute['unit_ont_id'] = unit_ont[0] attributes[i] = self._add_ontology_info(attribute) return attributes, skip_columns def _search_ontologies(self, term, closest=False): """ Match to an existing KBase ontology term :param term: Test to match :param closest: if false, term must exactly match an ontology ID :return: dict(ontology_ref, id) """ params = { "object_types": ["OntologyTerm"], "match_filter": { "lookup_in_keys": { "id": { "value": term } } }, "access_filter": { "with_private": 0, "with_public": 1 }, "pagination": { "count": 1 }, "post_processing": { "skip_data": 1 } } if closest: params['match_filter'] = {"full_text_in_all": term} res = self.kbse.search_objects(params) if not res['objects']: return None term = res['objects'][0] return { "ontology_ref": term['guid'].split(":")[1], "id": term['key_props']['id'] } def _add_ontology_info(self, attribute): """Searches KBASE ontologies for terms matching the user supplied attributes and units. Add the references if found""" optionals = { "unit", "unit_ont_id", "unit_ont_ref", } attribute = { k: v for k, v in attribute.items() if k not in optionals or v != "" } ont_info = self._search_ontologies( attribute.get('attribute_ont_id', "").replace("_", ":")) if ont_info: attribute['attribute_ont_ref'] = ont_info['ontology_ref'] attribute['attribute_ont_id'] = ont_info['id'] elif not attribute.get( 'attribute_ont_id') or attribute['attribute_ont_id'] == ":": attribute.pop('attribute_ont_id', None) if attribute.get('unit'): ont_info = self._search_ontologies( attribute.get('unit_ont_id', '').replace("_", ":")) if ont_info: attribute['unit_ont_ref'] = ont_info['ontology_ref'] attribute['unit_ont_id'] = ont_info['id'] elif not attribute.get( 'attribute_ont_id') or attribute['unit_ont_id'] == ":": attribute.pop('unit_ont_id', None) return attribute def to_tsv(self, params): """Convert an compound set to TSV file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".tsv") df.to_csv(files['file_path'], sep="\t", index=False) return _id, files def to_excel(self, params): """Convert an compound set to Excel file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".xlsx") writer = pd.ExcelWriter(files['file_path']) if "KBaseExperiments.AttributeMapping" in obj_type: df.to_excel(writer, "Attributes", index=False) elif "KBaseExperiments.ClusterSet" in obj_type: df.to_excel(writer, "ClusterSet", index=True) # else is checked in `_ws_obj_to_df` writer.save() return _id, files def export(self, file, name, input_ref): """Saves a set of files to SHOCK for export""" export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4())) os.makedirs(export_package_dir) shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [input_ref] }) return {'shock_id': package_details['shock_id']}
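# Illustrative sketch only (not part of the module): `attr_util` is an
# AttributesUtil instance; the input file, workspace id, object name, and
# destination directory are hypothetical placeholders.
def _example_attribute_mapping_round_trip(attr_util, ws_id, destination_dir):
    """Create an AttributeMapping from a local TSV/Excel file, then dump it to TSV."""
    created = attr_util.file_to_attribute_mapping({
        'input_file_path': '/kb/module/test/data/attributes.xlsx',  # hypothetical path
        'output_ws_id': ws_id,
        'output_obj_name': 'example_attribute_mapping',
    })
    return attr_util.to_tsv({
        'input_ref': created['attribute_mapping_ref'],
        'destination_dir': destination_dir,
    })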
class CompMolNWChem: ''' Module Name: CompMolNWChem Module Description: A KBase module: CompMolNWChem ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/nkkchem/CompMolNWChem.git" GIT_COMMIT_HASH = "0e157ea3f395c544a04c1542473be59ec39129ef" #BEGIN_CLASS_HEADER def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ #log('start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'DESeq2_result.zip') plot_file = os.path.join(output_directory, 'DESeq2_plot.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.zip') or file.endswith('.png') or file.endswith('.DS_Store')): zip_file.write(os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by DESeq2 App'}) with zipfile.ZipFile(plot_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if file.endswith('.png'): zip_file.write(os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({'path': plot_file, 'name': os.path.basename(plot_file), 'label': os.path.basename(plot_file), 'description': 'Visualization plots by DESeq2 App'}) return output_files def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None): """Save compound set to the workspace and make report""" info = self.dfu.save_objects( {'id': ws_id, "objects": [{ "type": "KBaseBiochem.CompoundSet", "data": compoundset, "name": compoundset['name'] }]})[0] compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4]) if not message: message = 'Imported %s as %s' % (source, info[1]) report_params = { 'objects_created': [{'ref': compoundset_ref, 'description': 'Compound Set'}], 'message': message, 'workspace_name': info[7], 'report_object_name': 'compound_set_creation_report' } # Construct the output to send back report_client = KBaseReport(self.callback_url) report_info = report_client.create_extended_report(report_params) output = {'report_name': report_info['name'], 'report_ref': report_info['ref'], 'compoundset_ref': compoundset_ref} return output #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.comp = CompoundSetUtils(self.callback_url) self.scratch = config['scratch'] logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #self.scratch = 
        #END_CONSTRUCTOR
        pass

    def run_CompMolNWChem(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_CompMolNWChem

        # Initial tests to check for proper inputs
        for name in ['Input_File', 'calculation_type', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['Input_File'], str):
            raise ValueError('Input_File must be a string')

        # Load the tsv file into a compound set using DataFileUtil methods
        scratch_file_path = self.dfu.download_staging_file(
            {'staging_file_subdir_path': params['Input_File']}).get('copy_file_path')
        #print('Scratch File Path: ', scratch_file_path)

        mol2_file_dir = None
        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        #DEBUG::
        #print('Compounds:', compounds)

        compoundset = {
            'id': params['Input_File'],
            'name': params['Input_File'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        # Finish reading in the compound set

        # Read ids and smiles from compound set for nwchem input
        ids = []
        smiles = []

        for d in compounds:
            ids.append(d['id'])
            smiles.append(d['smiles'])

        #print(ids)
        #print(smiles)

        # Read the ids and structures of the compounds
        its.inchi_to_dft(ids, smiles)

        #DEBUG::
        #os.system('pwd')
        #os.system('ls')

        length = len(ids)
        for i in range(length):
            os.chdir('./' + ids[i] + '/dft')
            x = ids[i] + '_nwchem.out'
            #print('x:', x)
            file1 = open(x, 'r')

            nAtoms = mul.getNumberOfAtoms(file1)
            energy = mul.getInternalEnergy0K(file1)
            charge = mul.getMullikenCharge(file1, nAtoms)
            file1.close()

            mul.nAtoms = nAtoms
            mul.E0K = energy

            mul.calculate(ids[i])

        # Build KBase output. Should output the entire /simulation directory and
        # build a CompoundSet with Mol2 files
        result_directory = '/simulation/'

        # Build CompoundSet with Mol2 files, similarly to
        # fetch_mol2_files_from_zinc (CompoundSetUtils)
compoundset_copy = copy.deepcopy(compoundset) count = 0 for compound in compoundset_copy.get('compounds'): if not compound.get('mol2_handle_ref'): mol2_file_path = result_directory+compound.get('id') SMILES = compound.get('smiles') shutil.move(mol2_file_path,self.scratch) os.chdir(self.scratch) mol2_file_path = self.scratch + '/'+ compound.get('id')+'/dft/' + compound.get('id')+'_Mulliken.mol2' handle_id = self.dfu.file_to_shock({'file_path': mol2_file_path, 'make_handle': True})['handle']['hid'] print('Handle ID:',handle_id) compound['mol2_handle_ref'] = handle_id count += 1 if count: message = 'Successfully fetched {} Mol2 files from Staging Path'.format(count) ## Create Extended Report output_files = self._generate_output_file_list(self.scratch) report_params = {'message': message, 'workspace_id': params['workspace_id'], 'objects_created': [], 'file_links': output_files, 'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4())} report = KBaseReport(self.callback_url) report_info = report.create_extended_report(report_params) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } output2 = self._save_to_ws_and_report( params['workspace_id'],'', compoundset_copy, message=message) return [output,output2] #END run_CompMolNWChem # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_CompMolNWChem return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = {'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH} #END_STATUS return [returnVal]
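# Illustrative sketch only (not part of the module): run_CompMolNWChem is normally
# invoked by the KBase SDK runner, so `impl` and `ctx` stand in for that machinery
# and every parameter value below is hypothetical. Note that the method validates
# 'workspace_name' but reads 'workspace_id' when building its report, so both keys
# are supplied here.
def _example_run_comp_mol_nwchem(impl, ctx):
    """Run the NWChem workflow on a staging .tsv (or .sdf) compound file."""
    params = {
        'Input_File': 'compounds.tsv',      # hypothetical staging file
        'calculation_type': 'energy',       # hypothetical calculation type
        'workspace_name': 'my_workspace',
        'workspace_id': 12345,
    }
    return impl.run_CompMolNWChem(ctx, params)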
class ImportFBAModelUtil: def _call_sbml_tools(self, params): try: # calling SBMLTools.sbml_importer without genome sbml_importer_params = dict() sbml_importer_params['sbml_local_path'] = params.get( 'model_file').get('path') sbml_importer_params['automatically_integrate'] = 1 sbml_importer_params['remove_boundary'] = 1 for param_name in ['workspace_name', 'model_name', 'biomass']: sbml_importer_params[param_name] = params.get(param_name) log('start executing SBMLTools.sbml_importer with {}'.format( sbml_importer_params)) sbml_importer_ret = self.SBMLTools.sbml_importer( sbml_importer_params) log('SBMLTools.sbml_importer returned {}'.format( sbml_importer_ret)) except Exception: raise ValueError( 'Unexpected error calling SBMLTools.sbml_importer') try: # calling SBMLTools.integrate_model integrate_model_params = dict() integrate_model_params['biomass_reactions'] = '' integrate_model_params['compartment_translation'] = list() integrate_model_params['compound_mappings'] = '' integrate_model_params['create_extracellular'] = 0 integrate_model_params['fill_metadata'] = 1 integrate_model_params['gene_mappings'] = '' integrate_model_params['remove_boundary'] = 1 integrate_model_params['template_id'] = 'gramneg' integrate_model_params['translate_database'] = 'modelseed' integrate_model_params['workspace_name'] = params.get( 'workspace_name') integrate_model_params['model_name'] = params.get('model_name') integrate_model_params['output_model_name'] = params.get( 'model_name') integrate_model_params['output_media_name'] = params.get( 'model_name') + '.media' integrate_model_params['genome_id'] = params.get('genome') log('start executing SBMLTools.integrate_model with {}'.format( integrate_model_params)) sbml_integrate_model_ret = self.SBMLTools.integrate_model( integrate_model_params) log('SBMLTools.integrate_model returned {}'.format( sbml_integrate_model_ret)) fbamodel_id = sbml_integrate_model_ret['fbamodel_id'] report_name = sbml_integrate_model_ret['report_name'] report_ref = sbml_integrate_model_ret['report_ref'] except Exception: raise ValueError( 'Unexpected error calling SBMLTools.integrate_model') return fbamodel_id, report_name, report_ref def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.fba = fba_tools(self.callback_url) self.SBMLTools = SBMLTools(self.callback_url) self.uploader_utils = UploaderUtil(config) def import_fbamodel_from_staging(self, params): log('--->\nrunning {}.{}\n params:\n{}'.format( self.__class__.__name__, sys._getframe().f_code.co_name, json.dumps(params, indent=1))) self._check_param(params, [ 'model_file', 'file_type', 'workspace_name', 'model_name', 'biomass' ], ['genome', 'compounds_file']) if params['file_type'] == 'tsv' and not params.get( 'compounds_file', None): raise ValueError('A compound file is required for tsv upload.') fba_tools_params = params.copy() for infile in ['model_file', 'compounds_file']: if not params.get(infile, None): continue download_staging_file_params = { 'staging_file_subdir_path': params[infile] } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') fba_tools_params[infile] = {'path': scratch_file_path} report_name = None report_ref = None if params['file_type'] == 'sbml': fbamodel_id, report_name, report_ref = self._call_sbml_tools( fba_tools_params) res = dict() res['ref'] = fbamodel_id # res = self.fba.sbml_file_to_model(fba_tools_params) elif params['file_type'] == 
'excel': res = self.fba.excel_file_to_model(fba_tools_params) elif params['file_type'] == 'tsv': res = self.fba.tsv_file_to_model(fba_tools_params) else: raise ValueError('"{}" is not a valid import file_type'.format( params['file_type'])) return { 'obj_ref': res['ref'], 'report_name': report_name, 'report_ref': report_ref } @staticmethod def _check_param(in_params, req_param, opt_param=list()): """ Check if each of the params in the list are in the input params """ for param in req_param: if param not in in_params: raise ValueError( 'Required parameter "{}" is missing'.format(param)) defined_param = set(req_param + opt_param) for param in in_params: if param not in defined_param: print(('WARNING: received unexpected parameter "{}"'.format( param))) def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. (return of import_excel(tsv)_as_media_from_staging) params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_name: workspace name/ID that reads will be stored to """ uuid_string = str(uuid.uuid4()) upload_message = 'Import Finished\n' upload_message += "FBAModel Object Name: " upload_message += params['model_name'] + '\n' upload_message += 'Imported File: {}\n'.format( params.get('model_file')) report_params = { 'message': upload_message, 'objects_created': [{ 'ref': obj_ref, 'description': 'Imported FBAModel' }], 'workspace_name': params.get('workspace_name'), 'report_object_name': 'kb_upload_methods_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
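# Illustrative sketch only (not part of the module): imports an SBML model from the
# staging area via SBMLTools and builds the summary report. The staging path,
# workspace, model name, and biomass value are hypothetical placeholders; `config`
# carries the same SDK_CALLBACK_URL / KB_AUTH_TOKEN keys that __init__ above expects.
def _example_import_sbml_model(config):
    """Import a staged SBML file as an FBAModel and generate the report."""
    importer = ImportFBAModelUtil(config)
    params = {
        'model_file': 'models/e_coli_core.xml',  # hypothetical staging file
        'file_type': 'sbml',
        'workspace_name': 'my_workspace',
        'model_name': 'e_coli_core_model',
        'biomass': 'bio1',                       # hypothetical biomass reaction id
    }
    result = importer.import_fbamodel_from_staging(params)
    return importer.generate_report(result['obj_ref'], params)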
class ImportExpressionMatrixUtil:

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
        KBaseFeatureValues.tsv_file_to_matrix

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        matrix_name: output ExpressionMatrix object name
        workspace_name: workspace name/ID of the object

        optional params:
        genome_ref: optional reference to a Genome object that will be used for
                    mapping feature IDs
        fill_missing_values: optional flag for filling in missing values in matrix
                             (default value is false)
        data_type: optional field, value is one of 'untransformed', 'log2_level',
                   'log10_level', 'log2_ratio', 'log10_ratio' or 'unknown'
                   (last one is default value)
        data_scale: optional parameter (default value is '1.0')

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportExpressionMatrixUtil.import_tsv_as_expression_matrix_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_matrix_params = params
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)

        # update the staging-area metadata for the imported file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'),
            ref.get('output_matrix_ref'))

        returnVal = {'obj_ref': ref.get('output_matrix_ref')}

        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
            validates params passed to import_tsv_as_expression_matrix_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference
                 (return of import_tsv_as_expression_matrix_from_staging)
        params:
            staging_file_subdir_path: subdirectory file path
              e.g.
              for file: /data/bulk/user_name/file_name
              staging_file_subdir_path is file_name
              for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
              staging_file_subdir_path is subdir_1/subdir_2/file_name
            workspace_name: workspace name/ID that the object will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
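# Illustrative sketch only (not part of the module): imports a TSV expression matrix
# from the staging area and builds the summary report. The staging path and object
# names are hypothetical placeholders.
def _example_import_expression_matrix(config):
    """Import a staged TSV expression matrix and generate the report."""
    importer = ImportExpressionMatrixUtil(config)
    params = {
        'staging_file_subdir_path': 'expression/matrix.tsv',  # hypothetical staging file
        'matrix_name': 'example_expression_matrix',
        'workspace_name': 'my_workspace',
        'fill_missing_values': 0,
    }
    result = importer.import_tsv_as_expression_matrix_from_staging(params)
    return importer.generate_report(result['obj_ref'], params)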
class BiomUtil: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.data_util = DataUtil(config) self.attr_util = AttributesUtil(config) self.matrix_util = MatrixUtil(config) self.matrix_types = [x.split(".")[1].split('-')[0] for x in self.data_util.list_generic_types()] self.taxon_wsname = config['taxon-workspace-name'] self.kbse = KBaseSearchEngine(config['search-url']) def import_matrix_from_biom(self, params): """ arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_attributemapping_ref: column AttributeMapping reference row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ #exit(params) {'obj_type': 'AmpliconMatrix', 'matrix_name': 'test_AmpliconMatrix', 'workspace_name': 'man4ish_gupta:narrative_1568644342277', 'biom_fasta': {'biom_file_biom_fasta': 'data/phyloseq_test.biom', 'fasta_file_biom_fasta': 'data/phyloseq_test.fa'}, 'scale': 'raw', 'description': 'OTU data', 'amplicon_set_name': 'test_AmpliconSet', 'col_attributemapping_ref': '44071/33/54'} (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params) workspace_name = params.get('workspace_name') matrix_name = params.get('matrix_name') amplicon_set_name = params.get('amplicon_set_name') obj_type = params.get('obj_type') scale = params.get('scale') description = params.get('description') refs = {k: v for k, v in params.items() if "_ref" in k} if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys) new_row_attr_ref = None if not params.get('row_attributemapping_ref'): new_row_attr_ref = amplicon_data.get('row_attributemapping_ref') new_col_attr_ref = None if not params.get('col_attributemapping_ref'): new_col_attr_ref = amplicon_data.get('col_attributemapping_ref') logging.info('start saving Matrix object: {}'.format(matrix_name)) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_name': workspace_id})['obj_ref'] amplicon_set_data = self._file_to_amplicon_set_data(biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref) logging.info('start saving AmpliconSet object: {}'.format(amplicon_set_name)) amplicon_set_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseExperiments.AmpliconSet', 'obj_name': amplicon_set_name, 'data': amplicon_set_data, 'workspace_name': workspace_id})['obj_ref'] logging.info('start resaving Matrix object with amplicon set: {}'.format(matrix_name)) amplicon_data['amplicon_set_ref'] = '{}/{}'.format(workspace_id, amplicon_set_name) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': 
amplicon_data, 'workspace_name': workspace_id})['obj_ref'] returnVal = {'matrix_obj_ref': matrix_obj_ref, 'amplicon_set_obj_ref': amplicon_set_obj_ref} report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name) returnVal.update(report_output) return returnVal def _process_params(self, params): logging.info('start validating import_matrix_from_biom params') # check for required parameters for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in self.matrix_types: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) scale = params.get('scale') if scale not in SCALE_TYPES: raise ValueError('Unknown scale type: {}'.format(scale)) biom_file = None tsv_file = None fasta_file = None metadata_keys = DEFAULT_META_KEYS if params.get('biom_tsv'): biom_tsv = params.get('biom_tsv') biom_file = biom_tsv.get('biom_file_biom_tsv') tsv_file = biom_tsv.get('tsv_file_biom_tsv') if not (biom_file and tsv_file): raise ValueError('missing BIOM or TSV file') biom_file = self.dfu.download_staging_file( {'staging_file_subdir_path': biom_file}).get('copy_file_path') tsv_file = self.dfu.download_staging_file( {'staging_file_subdir_path': tsv_file}).get('copy_file_path') mode = 'biom_tsv' elif params.get('biom_fasta'): biom_fasta = params.get('biom_fasta') biom_file = biom_fasta.get('biom_file_biom_fasta') fasta_file = biom_fasta.get('fasta_file_biom_fasta') if not (biom_file and fasta_file): raise ValueError('missing BIOM or FASTA file') biom_file = self.dfu.download_staging_file( {'staging_file_subdir_path': biom_file}).get('copy_file_path') fasta_file = self.dfu.download_staging_file( {'staging_file_subdir_path': fasta_file}).get('copy_file_path') mode = 'biom_fasta' elif params.get('tsv_fasta'): tsv_fasta = params.get('tsv_fasta') tsv_file = tsv_fasta.get('tsv_file_tsv_fasta') fasta_file = tsv_fasta.get('fasta_file_tsv_fasta') if not (tsv_file and fasta_file): raise ValueError('missing TSV or FASTA file') tsv_file = self.dfu.download_staging_file( {'staging_file_subdir_path': tsv_file}).get('copy_file_path') fasta_file = self.dfu.download_staging_file( {'staging_file_subdir_path': fasta_file}).get('copy_file_path') metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta') if metadata_keys_str: metadata_keys += [x.strip() for x in metadata_keys_str.split(',')] mode = 'tsv_fasta' elif params.get('tsv'): tsv = params.get('tsv') tsv_file = tsv.get('tsv_file_tsv') if not tsv_file: raise ValueError('missing TSV file') tsv_file = self.dfu.download_staging_file( {'staging_file_subdir_path': tsv_file}).get('copy_file_path') metadata_keys_str = tsv.get('metadata_keys_tsv') if metadata_keys_str: metadata_keys += [x.strip() for x in metadata_keys_str.split(',')] mode = 'tsv' else: raise ValueError('missing valide file group type in parameters') return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys))) def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False): #exit(tsv_metadata_df) defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fdb3037f378>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}) #exit(key) taxonomy #exit(biom_metadata_dict) none if key in biom_metadata_dict: return {k.lower(): 
v for k, v in biom_metadata_dict.items()}.get(key) elif key in tsv_metadata_df: return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key) elif required: raise ValueError('missing necessary [{}] from file'.format(key)) else: return None def _search_taxon(self, scientific_name): """ logic borrowed from: GFU.GenomeInterface https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216 """ taxon_id = None search_params = { "object_types": ["taxon"], "match_filter": { "lookup_in_keys": { "scientific_name": {"value": scientific_name}}, "exclude_subobjects": 1 }, "access_filter": { "with_private": 0, "with_public": 1 }, "sorting_rules": [{ "is_object_property": 0, "property": "timestamp", "ascending": 0 }] } objects = self.kbse.search_objects(search_params)['objects'] if not objects: search_params['match_filter']['lookup_in_keys'] = { "aliases": {"value": scientific_name} } objects = self.kbse.search_objects(search_params)['objects'] if objects: taxon_id = objects[0].get('object_name') #exit(taxon_id) 561_taxon return taxon_id def _fetch_taxon_level(self, taxon_char): taxon_level_mapping = {'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum', 'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus', 's': 'Species'} return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown') def _fetch_taxonomy(self, datarow): #exit(datarow) defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7f7ca8e8d950>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}) lineage = self._retrieve_value([], datarow, 'taxonomy') #exit(lineage) ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'] if isinstance(lineage, str): delimiter = csv.Sniffer().sniff(lineage).delimiter lineage = [x.strip() for x in lineage.split(delimiter)] #exit(lineage) ['k__Bacteria', 'k__Bacteria'] taxonomy = {'lineage': lineage} for key in ['score', 'taxonomy_source', 'species_name']: val = self._retrieve_value([], datarow, key) if val: taxonomy[key] = val #exit(key) species_name for item in lineage[::-1]: scientific_name = item.split('_')[-1] taxon_level_char = item.split('_')[0] if scientific_name: taxon_id = self._search_taxon(scientific_name) if taxon_id: taxon_ref = f"{self.taxon_wsname}/{taxon_id}" taxon_level = self._fetch_taxon_level(taxon_level_char) taxonomy.update({'taxon_ref': taxon_ref, 'taxon_id': taxon_id, 'scientific_name': scientific_name, 'taxon_level': taxon_level}) break #exit(taxonomy) {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'} return taxonomy def _retrieve_tsv_amplicon_set_data(self, tsv_file): #tsv file is data/amplicon_test.tsv amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError('Cannot parse file. 
Please provide valide TSV file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start processing each row in TSV') for observation_id in df.index: taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy} amplicons.update({observation_id: amplicon}) logging.info('finished parsing TSV file') return amplicons ''' {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}} ''' def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file): #tsvfile = data/amplicon_test.tsv amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") #{'GG_OTU_1' : SeqRecord(...), ...} except Exception: raise ValueError('Cannot parse file. Please provide valide FASTA file') try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError('Cannot parse file. 
Please provide valide TSV file') logging.info('start processing files') for observation_id in df.index: if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy} amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons ''' {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}} ''' def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file): #exit(biom_file) data/phyloseq_test.biom amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") except Exception: raise ValueError('Cannot parse file. 
Please provide valide FASTA file') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id)) taxonomy = self._fetch_taxonomy(observation_metadata[index]) amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy} amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons ''' {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}} ''' def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file): amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError('Cannot parse file. 
Please provide valide tsv file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in df.index: raise ValueError('TSV file does not have [{}] OTU id'.format(observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy} amplicons.update({observation_id: amplicon}) logging.info('finished processing files') ''' {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}} ''' return amplicons def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref): logging.info('start parsing amplicon_set_data') amplicon_set_data = dict() if mode == 'biom_tsv': amplicons = self._retrieve_biom_tsv_amplicon_set_data(biom_file, tsv_file) elif mode == 'biom_fasta': amplicons = self._retrieve_biom_fasta_amplicon_set_data(biom_file, fasta_file) elif mode == 'tsv_fasta': amplicons = self._retrieve_tsv_fasta_amplicon_set_data(tsv_file, fasta_file) elif mode == 'tsv': amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file) else: raise ValueError('error parsing _file_to_amplicon_set_data, mode: {}'.format(mode)) amplicon_set_data.update({'amplicons': amplicons}) if 'reads_set_ref' in refs: amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref') if description: amplicon_set_data['description'] = description matrix_obj_ref_array = matrix_obj_ref.split('/') amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(matrix_obj_ref_array[0], matrix_obj_ref_array[1]) ''' {'amplicons': {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': 
['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}}, 'description': 'OTU data', 'amplicon_matrix_ref': '44071/21'} ''' return amplicon_set_data def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys=None): amplicon_data = refs if mode.startswith('biom'): logging.info('start parsing BIOM file for matrix data') table = biom.load_table(biom_file) observation_metadata = table._observation_metadata sample_metadata = table._sample_metadata matrix_data = {'row_ids': table._observation_ids.tolist(), 'col_ids': table._sample_ids.tolist(), 'values': table.matrix_data.toarray().tolist()} logging.info('start building attribute mapping object') amplicon_data.update(self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data.update(self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} for k in ('create_date', 'generated_by'): val = getattr(table, k) if not val: continue if isinstance(val, bytes): amplicon_data['attributes'][k] = val.decode('utf-8') else: amplicon_data['attributes'][k] = str(val) elif mode.startswith('tsv'): observation_metadata = None sample_metadata = None try: logging.info('start parsing TSV file for matrix data') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError('Cannot parse file. Please provide valide tsv file') else: metadata_df = None if metadata_keys: shared_metadata_keys = list(set(metadata_keys) & set(df.columns)) if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys: raise ValueError('TSV file does not include consensus_sequence') if shared_metadata_keys: metadata_df = df[shared_metadata_keys] df.drop(columns=shared_metadata_keys, inplace=True) try: df = df.astype(float) except ValueError: err_msg = 'Found some non-float values. 
Matrix contains only numeric values\n' err_msg += 'Please list any non-numeric column names in Metadata Keys field' raise ValueError(err_msg) df.fillna(0, inplace=True) matrix_data = {'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist()} logging.info('start building attribute mapping object') amplicon_data.update(self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df)) amplicon_data.update(self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} else: raise ValueError('error parsing _file_to_amplicon_data, mode: {}'.format(mode)) amplicon_data.update({'data': matrix_data}) amplicon_data['search_attributes'] = [f'{k}|{v}' for k, v in amplicon_data['attributes'].items()] amplicon_data['scale'] = scale if description: amplicon_data['description'] = description ''' {'col_attributemapping_ref': '44071/33/24', 'row_attributemapping_ref': '44071/19/119', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5'}, 'col_mapping': {'Sample1': 'Sample1', 'Sample2': 'Sample2', 'Sample3': 'Sample3', 'Sample4': 'Sample4', 'Sample5': 'Sample5', 'Sample6': 'Sample6'}, 'attributes': {'generated_by': 'QIIME revision XYZ'}, 'data': {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}, 'search_attributes': ['generated_by|QIIME revision XYZ'], 'scale': 'raw', 'description': 'OTU data'} ''' return amplicon_data def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df=None): ''' getting mapping data based on refs or metadata or metadata_df ''' #exit(metadata) ''' (defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf730>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf9d8>, {'taxonomy': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf6a8>, {'taxonomy': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafd08>, {'taxonomy': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafea0>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']})) ''' #exit(matrix_data) {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 
3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]} #exit(matrix_name) test_AmpliconMatrix #exit(refs) {'col_attributemapping_ref': '44071/33/51'} mapping_data = {} axis_ids = matrix_data[f'{axis}_ids'] #exit(axis_ids) ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'] if refs.get(f'{axis}_attributemapping_ref'): am_data = self.dfu.get_objects( {'object_refs': [refs[f'{axis}_attributemapping_ref']]} )['data'][0]['data'] unmatched_ids = set(axis_ids) - set(am_data['instances'].keys()) if unmatched_ids: name = "Column" if axis == 'col' else "Row" raise ValueError(f"The following {name} IDs from the uploaded matrix do not match " f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}" f"\nPlease verify the input data or upload an excel file with a" f"{name} mapping tab.") else: mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata: name = matrix_name + "_{}_attributes".format(axis) mapping_data[f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping( axis_ids, metadata, name, workspace_id) # if coming from biom file, metadata and axis IDs are guaranteed to match mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata_df is not None: name = matrix_name + "_{}_attributes".format(axis) mapping_data[f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping( axis_ids, metadata_df, name, workspace_id) mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} ''' {'row_attributemapping_ref': '44071/19/122', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5', 'GG_OTU_6': 'GG_OTU_6'}} ''' return mapping_data def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id): data = {'ontology_mapping_method': "TSV file", 'instances': {}} attribute_keys = metadata_df.columns.tolist() data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in attribute_keys] for axis_id in axis_ids: data['instances'][axis_id] = metadata_df.loc[axis_id].tolist() logging.info('start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] # 44071/19/128 return f'{info[6]}/{info[0]}/{info[4]}' def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id): data = {'ontology_mapping_method': "BIOM file", 'instances': {}} sample_set = metadata[0:min(len(metadata), 25)] metadata_keys = sorted(set((k for m_dict in sample_set for k in m_dict))) data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in metadata_keys] for inst, meta in zip(instances, metadata): data['instances'][inst] = [str(meta[attr]) for attr in metadata_keys] logging.info('start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] # 44071/19/134 return f'{info[6]}/{info[0]}/{info[4]}' def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name): """ _generate_report: generate summary report """ objects_created = [{'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix'}, {'ref': amplicon_set_obj_ref, 'description': 'Imported Amplicon Set'}] if new_row_attr_ref: objects_created.append({'ref': new_row_attr_ref, 'description': 'Imported 
Amplicons(Row) Attribute Mapping'}) if new_col_attr_ref: objects_created.append({'ref': new_col_attr_ref, 'description': 'Imported Samples(Column) Attribute Mapping'}) report_params = {'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} #{'report_name': 'import_matrix_from_biom_db306341-c03a-4e60-b8a4-2bd7f6a48925', 'report_ref': '44071/200/1'} return report_output def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref): #not going to be used anywhere logging.info('writting amplicon set data frame to tsv file') amplicon_set_obj = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0] amplicon_set_info = amplicon_set_obj['info'] amplicon_set_name = amplicon_set_info[1] file_path = os.path.join(result_dir, amplicon_set_name + ".tsv") amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True) return file_path def _amplicon_set_to_df(self, amplicon_set_ref): #not going to be used anywhere logging.info('converting amplicon set to data frame') am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]['data'] amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref') matrix_data = self.dfu.get_objects({'object_refs': [amplicon_matrix_ref]})['data'][0]['data'] matrix_value_data = matrix_data.get('data') index = matrix_value_data.get('row_ids') columns = matrix_value_data.get('col_ids') values = matrix_value_data.get('values') df = pd.DataFrame(values, index=index, columns=columns) amplicons = am_set_data.get('amplicons') meta_index = list() meta_columns = ['taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score', 'taxonomy_source', 'species_name', 'consensus_sequence'] meta_values = list() for otu_id, amplicon in amplicons.items(): meta_index.append(otu_id) taxonomy_data = amplicon.get('taxonomy') taxonomy = taxonomy_data.get('lineage') taxon_id = taxonomy_data.get('taxon_id') taxon_ref = taxonomy_data.get('taxon_ref') taxon_level = taxonomy_data.get('taxon_level') score = taxonomy_data.get('score') taxonomy_source = taxonomy_data.get('taxonomy_source') species_name = taxonomy_data.get('species_name') consensus_sequence = amplicon.get('consensus_sequence') meta_values.append([taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source, species_name, consensus_sequence]) meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns) merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left', validate='one_to_one') return merged_df def export_amplicon_set_tsv(self, params): # not goign to be called anywhere """ export AmpliconSet as TSV """ logging.info('start exporting amplicon set object') amplicon_set_ref = params.get('input_ref') amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref) result_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_dir) self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref) package_details = self.dfu.package_for_download({ 'file_path': result_dir, 'ws_refs': [amplicon_set_ref] }) return {'shock_id': package_details['shock_id']}
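# Usage sketch for BiomUtil.import_matrix_from_biom, assuming the SDK runtime
# supplies the config dict used by the constructor above. The parameter values
# mirror the example captured in the debug comment inside
# import_matrix_from_biom; the workspace name and file paths are placeholders.
def _example_import_matrix_from_biom(config):
    biom_util = BiomUtil(config)
    params = {
        'obj_type': 'AmpliconMatrix',
        'matrix_name': 'test_AmpliconMatrix',
        'amplicon_set_name': 'test_AmpliconSet',
        'workspace_name': 'my_workspace',   # placeholder
        'scale': 'raw',
        'description': 'OTU data',
        # exactly one file-group key is expected by _process_params:
        # biom_tsv, biom_fasta, tsv_fasta or tsv; biom_fasta is shown here
        'biom_fasta': {
            'biom_file_biom_fasta': 'data/phyloseq_test.biom',
            'fasta_file_biom_fasta': 'data/phyloseq_test.fa',
        },
    }
    # returns matrix_obj_ref and amplicon_set_obj_ref plus the report name/ref
    return biom_util.import_matrix_from_biom(params)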
class ImportEscherMapUtil:

    @staticmethod
    def validate_eschermap_params(params, expected, opt_param=set()):
        """
        Validates that required parameters are present.
        Warns if unexpected parameters appear.
        """
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _save_escher_map(self, escher_data, workspace_id, escher_map_name):
        """
        save KBaseFBA.EscherMap to workspace
        """
        logging.info('start saving KBaseFBA.EscherMap')
        if not isinstance(workspace_id, int):
            logging.warning('Workspace ID {} is not an integer; '
                            'attempting to resolve it as a workspace name'.format(workspace_id))
            try:
                workspace_id = self.dfu.ws_name_to_id(workspace_id)
            except Exception:
                raise ValueError('Cannot convert {} to a valid workspace id'.format(workspace_id))

        info = self.dfu.save_objects({'id': workspace_id,
                                      'objects': [{'type': 'KBaseFBA.EscherMap',
                                                   'data': escher_data,
                                                   'name': escher_map_name}]})[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _refactor_escher_data(self, escher_data):
        """
        refactor escher data to better fit the KBaseFBA.EscherMap object
        """
        logging.info('start refactoring escher data')
        refactored_escher_data = copy.deepcopy(escher_data)
        refactored_escher_data[0]['map_name'] = "custom map"
        if 'authors' not in refactored_escher_data[0]:
            refactored_escher_data[0]['authors'] = []

        for rxn_uid in refactored_escher_data[1]['reactions']:
            rxn_node = refactored_escher_data[1]['reactions'][rxn_uid]
            rxn_node['reversibility'] = 1 if rxn_node['reversibility'] else 0
            for seg_uid in rxn_node['segments']:
                seg = rxn_node['segments'][seg_uid]
                if seg['b1'] is None:
                    del seg['b1']
                if seg['b2'] is None:
                    del seg['b2']

        for node_uid in refactored_escher_data[1]['nodes']:
            node = refactored_escher_data[1]['nodes'][node_uid]
            if 'node_is_primary' in node:
                node['node_is_primary'] = 1 if node['node_is_primary'] else 0

        refactored_escher_data = {
            "metadata": refactored_escher_data[0],
            "layout": refactored_escher_data[1]
        }
        if refactored_escher_data == escher_data:
            logging.warning('No changes in escher data')

        return refactored_escher_data

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def import_eschermap_from_staging(self, params):
        """
        import_eschermap_from_staging: import a JSON file as KBaseFBA.EscherMap

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name escher_map_name: output KBaseFBA.EscherMap object name workspace_id: workspace ID return: obj_ref: return object reference """ self.validate_eschermap_params(params, ['staging_file_subdir_path', 'escher_map_name', 'workspace_id']) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') try: with open(scratch_file_path) as f: escher_data = json.load(f) except Exception: raise ValueError('Failed to parse JSON file.') escher_data = self._refactor_escher_data(escher_data) obj_ref = self._save_escher_map(escher_data, params['workspace_id'], params['escher_map_name']) returnVal = {'obj_ref': obj_ref} return returnVal def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. """ logging.info('start generating report') upload_message = 'Import Finished\n' get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False} object_data = self.dfu.get_objects(get_objects_params) upload_message += "Imported Escher Map Name: " upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n' upload_message += 'Imported File: {}\n'.format(params['staging_file_subdir_path']) report_params = {'message': upload_message, 'objects_created': [{'ref': obj_ref, 'description': 'Imported Escher Map'}], 'workspace_id': params['workspace_id'], 'report_object_name': 'kb_upload_methods_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output
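# Usage sketch for ImportEscherMapUtil. The staging path, map name, and
# workspace id are placeholders; config is assumed to carry SDK_CALLBACK_URL
# and KB_AUTH_TOKEN as in the constructor above.
def _example_import_escher_map(config):
    util = ImportEscherMapUtil(config)
    params = {
        'staging_file_subdir_path': 'subdir_1/escher_map.json',  # placeholder path
        'escher_map_name': 'my_escher_map',
        'workspace_id': 12345,                                   # placeholder workspace id
    }
    result = util.import_eschermap_from_staging(params)
    # build the summary KBaseReport for the imported EscherMap object
    return util.generate_report(result['obj_ref'], params)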
class ImportFBAModelUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.fba = fba_tools(self.callback_url) self.uploader_utils = UploaderUtil(config) def import_fbamodel_from_staging(self, params): log('--->\nrunning {}.{}\n params:\n{}'.format( self.__class__.__name__, sys._getframe().f_code.co_name, json.dumps(params, indent=1))) self._check_param(params, [ 'model_file', 'file_type', 'workspace_name', 'model_name', 'biomass' ], ['genome', 'compounds_file']) if params['file_type'] == 'tsv' and not params.get( 'compounds_file', None): raise ValueError('A compound file is required for tsv upload.') fba_tools_params = params.copy() for infile in ['model_file', 'compounds_file']: if not params.get(infile, None): continue download_staging_file_params = { 'staging_file_subdir_path': params[infile] } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') fba_tools_params[infile] = {'path': scratch_file_path} if params['file_type'] == 'sbml': res = self.fba.sbml_file_to_model(fba_tools_params) elif params['file_type'] == 'excel': res = self.fba.excel_file_to_model(fba_tools_params) elif params['file_type'] == 'tsv': res = self.fba.tsv_file_to_model(fba_tools_params) else: raise ValueError('"{}" is not a valid import file_type'.format( params['file_type'])) """ Update the workspace object related meta-data for staged file """ self.uploader_utils.update_staging_service( download_staging_file_params.get('staging_file_subdir_path'), res['ref']) return {'obj_ref': res['ref']} @staticmethod def _check_param(in_params, req_param, opt_param=list()): """ Check if each of the params in the list are in the input params """ for param in req_param: if param not in in_params: raise ValueError( 'Required parameter "{}" is missing'.format(param)) defined_param = set(req_param + opt_param) for param in in_params: if param not in defined_param: print(('WARNING: received unexpected parameter "{}"'.format( param))) def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. (return of import_excel(tsv)_as_media_from_staging) params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_name: workspace name/ID that reads will be stored to """ uuid_string = str(uuid.uuid4()) upload_message = 'Import Finished\n' upload_message += "FBAModel Object Name: " upload_message += params['model_name'] + '\n' upload_message += 'Imported File: {}\n'.format( params.get('model_file')) report_params = { 'message': upload_message, 'objects_created': [{ 'ref': obj_ref, 'description': 'Imported FBAModel' }], 'workspace_name': params.get('workspace_name'), 'report_object_name': 'kb_upload_methods_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
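# Usage sketch for ImportFBAModelUtil.import_fbamodel_from_staging. Paths,
# object names, and the biomass identifier are placeholders; the exact format
# fba_tools expects for 'biomass' is assumed here rather than taken from this
# excerpt. As enforced above, a compounds_file is required for tsv uploads.
def _example_import_fbamodel(config):
    util = ImportFBAModelUtil(config)
    params = {
        'model_file': 'subdir_1/model_reactions.tsv',      # staging path (placeholder)
        'compounds_file': 'subdir_1/model_compounds.tsv',  # required when file_type is 'tsv'
        'file_type': 'tsv',                                # 'sbml', 'excel' or 'tsv'
        'model_name': 'my_fba_model',
        'biomass': ['bio1'],                               # assumed biomass id format
        'workspace_name': 'my_workspace',
    }
    result = util.import_fbamodel_from_staging(params)
    return util.generate_report(result['obj_ref'], params)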
class ImportMetagenomeGFFFastaUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev') self.uploader_utils = UploaderUtil(config) self.scratch = os.path.join(config['scratch'], 'import_Metagenome_' + str(uuid.uuid4())) handler_utils._mkdir_p(self.scratch) def import_metagenome_gff_fasta_from_staging(self, params): """ import_gff_fasta_from_staging: wrapper method for GenomeFileUtil.fasta_gff_to_genome required params: fasta_file: fasta file from user's staging area gff_file: gff file from user's staging area genome_name: output genome object name workspace_name: workspace name that genome will be stored to file paths for both fasta and gff files must be subdirectory file path in staging area e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name optional params: release: Release Or Version Of The Source Data genetic_code: Genetic Code For The Organism type: 'Reference', 'User upload', 'Representative' return: genome_ref: return object reference report_name: name of generated report (if any) report_ref: report reference (if any) """ # logging.info('--->\nrunning ImportMetagenomeGFFFastaUtil.import_metagenome_gff_fasta_from_staging\n' + # f'params:\n{json.dumps(params, indent=1)}') self.validate_import_metagenome_gff_fasta_from_staging_params(params) for key in ('fasta_file', 'gff_file'): file_path = params[key] download_staging_file_params = { 'staging_file_subdir_path': file_path } dfu_returnVal = self.dfu.download_staging_file( download_staging_file_params) params[key] = {'path': dfu_returnVal['copy_file_path']} returnVal = self.gfu.fasta_gff_to_metagenome(params) """ Update the workspace object related meta-data for staged file """ # self.uploader_utils.update_staging_service(download_staging_file_params.get('staging_file_subdir_path'), # returnVal['genome_ref']) return returnVal def validate_import_metagenome_gff_fasta_from_staging_params(self, params): """ validate_import_metagenome_gff_fasta_from_staging_params: validates params passed to import_gff_fasta_from_staging method """ # check for required parameters for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError('"' + p + '" parameter is required, but missing') # for now must use workspace name, but no ws_id_to_name() function available if str(params["workspace_name"]).isdigit(): error_msg = '"{}" parameter is a workspace id and workspace name is required'.format( params["workspace_name"]) raise ValueError(error_msg) def generate_html_report(self, genome_ref, params): """ _generate_html_report: generate html summary report """ logging.info('start generating html report') genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]}) html_report = list() tmp_dir = os.path.join(self.scratch, str(uuid.uuid4())) handler_utils._mkdir_p(tmp_dir) result_file_path = os.path.join(tmp_dir, 'report.html') genome_name = str(genome_obj.get('data')[0].get('info')[1]) genome_file = params.get('staging_file_subdir_path') genome_data = genome_obj.get('data')[0].get('data') genome_info = genome_obj.get('data')[0].get('info') genome_metadata = genome_info[10] source = genome_metadata.get('Source') num_contigs = genome_metadata.get('Number contigs') size = genome_metadata.get('Size') 
gc_content = genome_metadata.get('GC content') warnings = genome_data.get('warnings', []) feature_counts = sorted( list(genome_data.get('feature_counts', {}).items())) genome_overview_data = collections.OrderedDict() genome_overview_data['Name'] = '{} ({})'.format( genome_name, genome_ref) #genome_overview_data['Uploaded File'] = genome_file genome_overview_data['Date Uploaded'] = time.strftime("%c") genome_overview_data['Source'] = source genome_overview_data['Number of Contigs'] = num_contigs genome_overview_data['Size'] = size genome_overview_data['GC Content'] = gc_content genome_overview_data['Warnings'] = "\n".join(warnings) genome_overview_data.update(feature_counts) overview_content = '<br/><table>\n' for key, val in genome_overview_data.items(): overview_content += '<tr><td><b>{}</b></td>'.format(key) overview_content += '<td>{}</td></tr>\n'.format(val) overview_content += '</table>' feature_content = str( [[str(k), v] for k, v in list(genome_data.get('feature_counts', {}).items()) if k != 'gene']) contig_content = str( [[str(c), l] for c, l in zip(genome_data.get('contig_ids', []), genome_data.get('contig_lengths', []))]) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template', 'report_template_genome.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', overview_content) report_template = report_template.replace( '*FEATURE_DATA*', feature_content) report_template = report_template.replace( '*CONTIG_DATA*', contig_content) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': tmp_dir, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for imported Annotated Metagenome Assembly' }) return html_report def generate_report(self, genome_ref, params): """ :param genome_ref: Return Val from GenomeFileUtil for Uploaded metagenome Need to get report warnings and message from it. :return: """ uuid_string = str(uuid.uuid4()) objects_created = [{ 'ref': genome_ref, 'description': 'Imported Annotated Metagenome Assembly' }] output_html_files = self.generate_html_report(genome_ref, params) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 300, 'report_object_name': 'kb_metagenome_upload_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
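# Usage sketch for ImportMetagenomeGFFFastaUtil. Staging paths and names are
# placeholders; per the validator above, workspace_name must be a workspace
# name rather than a numeric id.
def _example_import_metagenome(config):
    util = ImportMetagenomeGFFFastaUtil(config)
    params = {
        'fasta_file': 'subdir_1/assembly.fa',    # staging-area path (placeholder)
        'gff_file': 'subdir_1/annotations.gff',  # staging-area path (placeholder)
        'genome_name': 'my_metagenome',
        'workspace_name': 'my_workspace',
    }
    result = util.import_metagenome_gff_fasta_from_staging(params)
    # the object reference in the GenomeFileUtil return value can then be
    # passed to generate_report to build the HTML summary report
    return result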
class UnpackFileUtil: def _staging_service_host(self): deployment_path = os.environ["KB_DEPLOYMENT_CONFIG"] parser = SafeConfigParser() parser.read(deployment_path) endpoint = parser.get('kb_uploadmethods', 'kbase-endpoint') staging_service_host = endpoint + '/staging_service' return staging_service_host def _file_to_staging(self, file_path_list, subdir_folder=None): """ _file_to_staging: upload file(s) to staging area """ subdir_folder_str = '/' if not subdir_folder else '/{}'.format( subdir_folder) staging_service_host = self._staging_service_host() end_point = staging_service_host + '/upload' headers = {'Authorization': self.token} files = {'destPath': subdir_folder_str} for file_path in file_path_list: files.update({ 'uploads': (os.path.basename(file_path), open(file_path, 'rb')) }) resp = _requests.post(end_point, headers=headers, files=files) if resp.status_code != 200: raise ValueError( 'Upload file {} failed.\nError Code: {}\n{}\n'.format( file_path, resp.status_code, resp.text)) else: log("return message from server:\n{}\n".format(resp.text)) def _remove_irrelevant_files(self, file_path): """ _remove_irrelevant_files: remove irrelevant files """ target_name = os.path.basename(file_path) file_dir = os.path.dirname(file_path) for dirpath, dirnames, filenames in os.walk(file_dir): for filename in filenames: if filename != target_name: irrelevant_file_path = os.sep.join([dirpath, filename]) os.remove(irrelevant_file_path) log('removing irrelevant file: {}'.format( irrelevant_file_path)) def _r_unpack(self, file_path, count): """ _r_unpack: recursively unpack file_path """ if count == 0: self._remove_irrelevant_files(file_path) count += 1 if os.path.isfile(file_path): log('processing: {}{}'.format('-' * count, file_path)) t = magic.from_file(file_path, mime=True) if os.path.basename(file_path).endswith('.DS_Store'): os.remove(file_path) log('removing file: {}{}'.format('-' * count, file_path)) elif t in [ 'application/' + x for x in ('x-gzip', 'gzip', 'x-bzip', 'x-bzip2', 'bzip', 'bzip2', 'x-tar', 'tar', 'x-gtar', 'zip', 'x-zip-compressed') ]: file_dir = os.path.dirname(file_path) files_before_unpack = os.listdir(file_dir) self.dfu.unpack_file({'file_path': file_path}).get('file_path') files_after_unpack = os.listdir(file_dir) new_files = [ item for item in files_after_unpack if item not in files_before_unpack ] for new_file in new_files: self._r_unpack(os.sep.join([file_dir, new_file]), count) os.remove(file_path) log('removing file: {}{}'.format('-' * count, file_path)) else: return file_path else: if os.path.basename(file_path).startswith('_'): shutil.rmtree(file_path, ignore_errors=True) log('removing folder: {}{}'.format('-' * count, file_path)) else: for dirpath, dirnames, filenames in os.walk(file_path): for filename in filenames: self._r_unpack(os.sep.join([dirpath, filename]), count) def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.user_id = config['USER_ID'] self.scratch = config['scratch'] self.dfu = DataFileUtil(self.callback_url) def unpack_staging_file(self, params): """ Unpack a staging area file params: staging_file_subdir_path: subdirectory file path e.g. 
for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name result: unpacked_file_path: unpacked file path(s) in staging area """ log('--->\nrunning UnpackFileUtil.unpack_staging_file\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) scratch_file_path = self.dfu.download_staging_file(params).get( 'copy_file_path') self._r_unpack(scratch_file_path, 0) unpacked_file_path_list = [] for dirpath, dirnames, filenames in os.walk( os.path.dirname(scratch_file_path)): for filename in filenames: unpacked_file_path_list.append(os.sep.join([dirpath, filename])) log("Unpacked files:\n {}".format( '\n '.join(unpacked_file_path_list))) self._file_to_staging( unpacked_file_path_list, os.path.dirname(params.get('staging_file_subdir_path'))) unpacked_file_path = ','.join(unpacked_file_path_list) returnVal = {'unpacked_file_path': unpacked_file_path} return returnVal def unpack_web_file(self, params): """ Download and unpack a web file to staging area params: file_url: file URL download_type: one of ['Direct Download', 'FTP', 'DropBox', 'Google Drive'] result: unpacked_file_path: unpacked file path(s) in staging area """ log('--->\nrunning UnpackFileUtil.unpack_web_file\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) scratch_file_path = self.dfu.download_web_file(params).get( 'copy_file_path') self._r_unpack(scratch_file_path, 0) unpacked_file_path_list = [] for dirpath, dirnames, filenames in os.walk( os.path.dirname(scratch_file_path)): for filename in filenames: unpacked_file_path_list.append(os.sep.join([dirpath, filename])) log("Unpacked files:\n {}".format( '\n '.join(unpacked_file_path_list))) self._file_to_staging(unpacked_file_path_list) unpacked_file_path = ','.join(unpacked_file_path_list) returnVal = {'unpacked_file_path': unpacked_file_path} return returnVal def generate_report(self, unpacked_file_path, params): """ generate_report: generate summary report unpacked_file_path: generated unpacked file path(s) in staging area. (return of unpack_staging_file or unpack_web_file) """ log("generating report") uuid_string = str(uuid.uuid4()) unpacked_file_path_list = unpacked_file_path.split(',') subdir = os.path.dirname( params.get('staging_file_subdir_path')) + '/' if params.get( 'staging_file_subdir_path') else '/' upload_message = 'Uploaded Files: {}\n'.format( len(unpacked_file_path_list)) for file_path in unpacked_file_path_list: upload_message += subdir + os.path.basename(file_path) + '\n' report_params = { 'message': upload_message, 'workspace_name': params.get('workspace_name'), 'report_object_name': 'kb_upload_mothods_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
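# Usage sketch for UnpackFileUtil. Both entry points recursively unpack
# archives and copy the unpacked files back to the staging area; the path,
# URL, and workspace name below are placeholders.
def _example_unpack(config):
    util = UnpackFileUtil(config)
    staged = util.unpack_staging_file({
        'staging_file_subdir_path': 'subdir_1/reads_bundle.tar.gz',  # placeholder
        'workspace_name': 'my_workspace',
    })
    downloaded = util.unpack_web_file({
        'file_url': 'https://example.org/reads_bundle.zip',          # placeholder URL
        'download_type': 'Direct Download',
        'workspace_name': 'my_workspace',
    })
    return staged, downloaded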
class CompMolNWChem_Thermo: ''' Module Name: CompMolNWChem_Thermo Module Description: A KBase module: CompMolNWChem_Thermo ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/nkkchem/CompMolNWChem_Thermo.git" GIT_COMMIT_HASH = "7e7b026e26614c14a308141a82acbb2ee913c1e3" #BEGIN_CLASS_HEADER def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ #log('start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'Thermo_Result.zip') plot_file = os.path.join(output_directory, 'Thermo_Plot.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.zip') or file.endswith('.png') or file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by CompMolNWChem_Thermo App' }) with zipfile.ZipFile(plot_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if file.endswith('.png'): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': plot_file, 'name': os.path.basename(plot_file), 'label': os.path.basename(plot_file), 'description': 'Plot(s) generated by CompMolNWChem_Thermo App' }) return output_files def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None): """Save compound set to the workspace and make report""" info = self.dfu.save_objects({ 'id': ws_id, "objects": [{ "type": "KBaseBiochem.CompoundSet", "data": compoundset, "name": compoundset['name'] }] })[0] compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4]) if not message: message = 'Imported %s as %s' % (source, info[1]) report_params = { 'objects_created': [{ 'ref': compoundset_ref, 'description': 'Compound Set' }], 'message': message, 'workspace_name': info[7], 'report_object_name': 'compound_set_creation_report' } # Construct the output to send back report_client = KBaseReport(self.callback_url) report_info = report_client.create_extended_report(report_params) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], 'compoundset_ref': compoundset_ref } return output #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.comp = CompoundSetUtils(self.callback_url) self.scratch = config['scratch'] logging.basicConfig(format='%(created)s %(levelname)s: 
%(message)s', level=logging.INFO) #self.scratch = config['scratch'] #END_CONSTRUCTOR pass def run_CompMolNWChem_Thermo(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_CompMolNWChem_Thermo #-------------------------------------------------------------------------------------------------------------------- # Initial Tests to Check for Proper Inputs for name in ['Input_File', 'Input_Method', 'workspace_name']: if name not in params: raise ValueError('Parameter "' + name + '"is required but missing') if not isinstance(params['Input_File'], str): raise ValueError('Input_File must be a string') #-------------------------------------------------------------------------------------------------------------------- # Check the Input Method and set Equation_Input as the necessary if params['Input_Method'] == "file": Equation_Input = self.dfu.download_staging_file({ 'staging_file_subdir_path': params['Input_File'] }).get('copy_file_path') else: Reaction = params['Input_File'] with open('nwchem-scripts/EC_modelseed_ID_reactions.csv' ) as csv_input: Reaction_Frame = pd.read_csv(csv_input) length = len(Reaction_Frame.index) for itr in range(0, length): if Reaction == Reaction_Frame.EC.iloc[itr]: Equation_Input = Reaction_Frame.reaction.iloc[itr] elif Reaction == Reaction_Frame.rxn_ID.iloc[itr]: Equation_Input = Reaction_Frame.reaction.iloc[itr] Eq_List = [Equation_Input] with open('Reaction.csv', 'w', newline='') as file: writer = csv.writer(file) writer.writerow(Eq_List) Equation_Input = 'Reaction.csv' mol2_file_dir = None ext = os.path.splitext(Equation_Input)[1] file_name = os.path.basename(Equation_Input) #-------------------------------------------------------------------------------------------------------------------- # SNAKEMAKE PIPELINE START, READ FILES IN AND CHECK EXISTING REACTIONS from snakemake import snakemake modelseedID_to_deltaG = {} calculated = [] Done = False # fill with modelseed ID and correspodning delta_G of metabolites # This is the dictionary of all Modelseed cpds with their delta_G values with open('nwchem-scripts/modelSeed_ID_delta_G_calculated.csv', 'r') as dfile: lines = dfile.readlines() for line in lines: key = line.split(',')[0] G = line.split(',')[1] modelseedID_to_deltaG[key] = float(G) calculated.append(key) modelseedID_to_deltaG['cpd00001'] += 2.38 modelseedID_to_deltaG['cpd00067'] = -268.61 keggID_to_charge = {} with open('Ph7_charge_modelseedID.csv', 'r') as dfile: lines = dfile.readlines() for i in lines: key = i.split(',')[0] charge = i.split(',')[1] keggID_to_charge[key] = charge # map metabolites to the reactions with stoichmetry reactionlist = Equation_Input with open(reactionlist, 'r') as f: reactions = f.readlines()[0].rstrip().replace(' ', '') reactants = reactions.split('<=>')[0].split('+') products = reactions.split('<=>')[1].split('+') charge_reactants = 0 G_reactants = 0 left = [] stoich_left = [] for reactant in reactants: stoich_left.append( int((re.findall('(\(\d+\))cpd', reactant)[0]).replace('(', '').replace(')', ''))) reactant = re.search( re.findall('[a-zA-Z]{3}\d{5}', reactant)[0], reactant).group(0) left.append(reactant.strip()) charge_products = 0 G_products = 0 right = [] stoich_right = [] for product in 
products: stoich_right.append( int((re.findall('(\(\d+\))cpd', product)[0]).replace('(', '').replace(')', ''))) product = re.search( re.findall('[a-zA-Z]{3}\d{5}', product)[0], product).group(0) right.append(product.strip()) print(stoich_right, right) print(stoich_left, left) # Extract delta_g for reactions for which we have performed calculations if (set(left).issubset(set(calculated))) and set(right).issubset( set(calculated)): for jj in range(len(left)): G_reactants += stoich_left[jj] * float( modelseedID_to_deltaG[left[jj]]) charge_reactants += stoich_left[jj] * float( keggID_to_charge[left[jj]]) for kk in range(len(right)): G_products += stoich_right[kk] * float( modelseedID_to_deltaG[right[kk]]) charge_products += stoich_right[kk] * float( keggID_to_charge[right[kk]]) charge = float(charge_products - charge_reactants) G = float(G_products - G_reactants) if charge == -1: G = G - 268.61 if charge == -2: G = G - 268.61 * 2 if charge == -3: G = G - 268.61 * 3 if charge == +1: G = G + 268.61 * 1 if charge == +2: G = G + 268.61 * 2 if charge == +3: G = G + 268.61 * 3 if charge == +4: G = G + 268.61 * 4 if charge == -4: G = G - 268.61 * 4 if charge == 0: G = G print("Reaction free energy for given reaction is: ", G) Done = True else: print('Calculations is not finished for one of the metabolotes') #If the delta_g for the reaction is not there, we need to run our snakemake pipeline to calculate the reaction free energy #for the reaction. if not Done: id_to_smiles = {} data = open('/kb/module/modelSeed_ID_SMILES.csv', 'r') for lines in data.readlines(): id = lines.split(',')[0] smiles = lines.split(',')[1].rstrip() id_to_smiles[id] = smiles data.close() metabolites = [] for each in left: each = each.strip() metabolites.append(each) for each in right: each = each.strip() metabolites.append(each) # Check to see if cache data already exists for these compounds for molecule in metabolites: moldir = molecule cache_data = {'Metabolites': moldir} cache_id = caching.get_cache_id(ctx['token'], cache_data) result = caching.download_cache_string(ctx['token'], cache_id) if not result or not result.strip(): print('None') else: print('Some') for molecule in metabolites: moldir = molecule if not os.path.exists(moldir): os.mkdir(moldir) initial_structure_dir = moldir + '/initial_structure' if not os.path.exists(initial_structure_dir): os.mkdir(initial_structure_dir) md_structure_dir = moldir + '/md' if not os.path.exists(md_structure_dir): os.mkdir(md_structure_dir) dft_structure_dir = moldir + '/dft' if not os.path.exists(dft_structure_dir): os.mkdir(dft_structure_dir) inchifile_str = initial_structure_dir + '/' + moldir + '.smiles' with open(inchifile_str, 'w+') as f: f.write(id_to_smiles[moldir]) os.system( 'snakemake -p --cores 3 --snakefile snakemake-scripts/final_pipeline.snakemake -w 12000' ) else: print('Calculation already known, moving on') # Build KBase Output. 
Should output entire /simulation directory and build a CompoundSet with Mol2 Files #result_directory = '/kb/module/snakemake-scripts' result_directory = '/kb/module/dft' #----------------------------------------------------------------------------------------------------------------- # Extract the information from the results file df = pd.DataFrame() G = pd.DataFrame() for molecule in metabolites: moldir = molecule result_file = moldir + '_properties.dat' # Grab the third last line of the properties file with open('dft/' + result_file, 'r') as f: all_lines = f.readlines() length = len(all_lines) d = all_lines[(length - 3)] d = d.split() df = df.append(d) # Save the last number of the line to a new file. with open('dft/' + moldir + '_extracted_properties.dat', 'w') as f: string = str(df.iloc[-1].values) string = string.replace("['", "") string = string.replace("']", "") f.write(string) G_calc = float(string) # Cache The Results cache_data = {'Metabolite': moldir} result = {"Metabolite": moldir, "Free_Energy": G_calc} cache_id = CachingUtils.get_cache_id(ctx['token'], cache_data) CachingUtils.upload_to_cache(ctx['token'], cache_id, result) #----------------------------------------------------------------------------------------------------------------- ## Create Extended Report output_files = self._generate_output_file_list(result_directory) message = "Reaction free energy for given reaction is " + str(G) report_params = { 'message': message, 'workspace_id': params['workspace_id'], 'objects_created': [], 'file_links': output_files, 'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4()) } report = KBaseReport(self.callback_url) report_info = report.create_extended_report(report_params) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } return [output] #END run_CompMolNWChem_Thermo # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_CompMolNWChem_Thermo return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
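Once the reaction string is parsed, the free-energy bookkeeping in run_CompMolNWChem_Thermo reduces to simple arithmetic: sum the stoichiometry-weighted compound free energies on each side, take the difference, and shift by 268.61 per unit of net charge. Below is a minimal standalone sketch of that calculation, assuming the same ModelSEED-style '(n)cpdXXXXX+...<=>(n)cpdXXXXX+...' equation format used above; the delta-G and charge numbers in the demo dictionaries are placeholders, not calculated values.

import re

def reaction_free_energy(reaction, delta_g, charge):
    """Charge-corrected free energy for a '(n)cpdXXXXX+...<=>(n)cpdXXXXX+...' equation."""
    def parse(side):
        # each term looks like '(2)cpd00001'
        return [(int(re.findall(r'\((\d+)\)cpd', term)[0]),
                 re.findall(r'[a-zA-Z]{3}\d{5}', term)[0])
                for term in side.split('+')]

    left_side, right_side = reaction.replace(' ', '').split('<=>')
    left, right = parse(left_side), parse(right_side)
    g = (sum(n * delta_g[c] for n, c in right)
         - sum(n * delta_g[c] for n, c in left))
    net_charge = (sum(n * charge[c] for n, c in right)
                  - sum(n * charge[c] for n, c in left))
    # same correction applied in run_CompMolNWChem_Thermo: 268.61 per unit of net charge
    return g + 268.61 * net_charge

# placeholder numbers, for illustration only
demo_g = {'cpd00001': -56.7, 'cpd00002': -673.9, 'cpd00008': -404.8, 'cpd00009': -261.1}
demo_q = {'cpd00001': 0, 'cpd00002': -4, 'cpd00008': -3, 'cpd00009': -2}
print(reaction_free_energy('(1)cpd00002+(1)cpd00001<=>(1)cpd00008+(1)cpd00009', demo_g, demo_q))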
class ImportAttributeMappingUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.genapi = GenericsAPI(self.callback_url) def import_attribute_mapping_from_staging(self, params): """ import_attribute_mapping_from_staging: wrapper method for GenericsAPI.file_to_fbamodel_attribute_mapping required params: staging_file_subdir_path - subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name attribute_mapping_name: output AttributeMapping object name workspace_id: workspace ID of the object return: obj_ref: return object reference """ log('--->\nrunning ImportAttributeMappingUtil.import_attribute_mapping_from_staging\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_attribute_mapping_from_staging_params(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') ws_id = params['workspace_id'] import_attribute_mapping_params = { 'output_obj_name': params['attribute_mapping_name'], 'output_ws_id': ws_id, 'input_file_path': scratch_file_path } ref = self.genapi.file_to_fbamodel_attribute_mapping( import_attribute_mapping_params) returnVal = {'obj_ref': ref.get('attribute_mapping_ref')} return returnVal @staticmethod def validate_import_attribute_mapping_from_staging_params(params): """ validate_import_attribute_mapping_from_staging_params: validates params passed to import_attribute_mapping_from_staging method """ # check for required parameters for p in [ 'staging_file_subdir_path', 'workspace_id', 'attribute_mapping_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. (return of import_attribute_mapping_from_staging) params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_id: workspace ID that the object will be stored to """ uuid_string = str(uuid.uuid4()) upload_message = 'Import Finished\n' get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False} object_data = self.dfu.get_objects(get_objects_params) upload_message += "FBAModelSet Name: " upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n' upload_message += 'Imported File: {}\n'.format( params.get('staging_file_subdir_path')) report_params = { 'message': upload_message, 'objects_created': [{ 'ref': obj_ref, 'description': 'Imported FBAModelSet' }], 'workspace_id': params['workspace_id'], 'report_object_name': 'import_model_attri_mapping_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
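For orientation, a hypothetical driver for ImportAttributeMappingUtil is sketched below; the callback URL, token, workspace ID, and staging path are placeholders that a real KBase job would supply through its configuration.

# Hypothetical usage sketch; values are placeholders, not a working configuration.
config = {
    'SDK_CALLBACK_URL': 'http://localhost:9999',  # placeholder
    'KB_AUTH_TOKEN': 'FAKE_TOKEN',                # placeholder
}
importer = ImportAttributeMappingUtil(config)
params = {
    'staging_file_subdir_path': 'subdir_1/attribute_mapping.tsv',  # placeholder staging path
    'workspace_id': 12345,                                         # placeholder workspace ID
    'attribute_mapping_name': 'my_attribute_mapping',
}
result = importer.import_attribute_mapping_from_staging(params)
report = importer.generate_report(result['obj_ref'], params)
print(report['report_name'], report['report_ref'])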
class ImportSRAUtil: SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump' def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _check_fastq_dump_result(self, tmp_dir, sra_name): """ _check_fastq_dump_result: check fastq_dump result is PE or SE """ return os.path.exists(tmp_dir + '/' + sra_name + '/1') def _sra_to_fastq(self, scratch_sra_file_path, params): """ _sra_to_fastq: convert SRA file to FASTQ file(s) """ tmp_dir = os.path.join(self.scratch, str(uuid.uuid4())) handler_utils._mkdir_p(tmp_dir) command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O ' command += tmp_dir + ' ' + scratch_sra_file_path self._run_command(command) sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0] paired_end = self._check_fastq_dump_result(tmp_dir, sra_name) if paired_end: self._validate_paired_end_advanced_params(params) fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq') os.rename(fwd_file, fwd_file + '.fastq') fwd_file = fwd_file + '.fastq' rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq') os.rename(rev_file, rev_file + '.fastq') rev_file = rev_file + '.fastq' else: self._validate_single_end_advanced_params(params) fwd_file = os.path.join(tmp_dir, sra_name, 'fastq') os.rename(fwd_file, fwd_file + '.fastq') fwd_file = fwd_file + '.fastq' rev_file = None fastq_file_path = { 'fwd_file': fwd_file, 'rev_file': rev_file } return fastq_file_path def _validate_single_end_advanced_params(self, params): """ _validate_single_end_advanced_params: validate advanced params for single end reads """ if (params.get('insert_size_mean') or params.get('insert_size_std_dev') or params.get('read_orientation_outward')): error_msg = 'Advanced params "Mean Insert Size", "St. Dev. 
of Insert Size" or ' error_msg += '"Reads Orientation Outward" are Paired End Reads specific' raise ValueError(error_msg) if 'interleaved' in params: del params['interleaved'] def _validate_paired_end_advanced_params(self, params): """ _validate_paired_end_advanced_params: validate advanced params for paired end reads """ sequencing_tech = params.get('sequencing_tech') if sequencing_tech in ['PacBio CCS', 'PacBio CLR']: error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" ' error_msg += 'is Single End Reads specific' raise ValueError(error_msg) def _validate_upload_staging_file_availability(self, staging_file_subdir_path): """ _validate_upload_staging_file_availability: validates file availability in user's staging area """ pass # TODO ftp_server needs to be fixed for subdir # list = ftp_service(self.callback_url).list_files() # if staging_file_subdir_path not in list: # error_msg = 'Target file: {} is NOT available.\n'.format( # staging_file_subdir_path.rpartition('/')[-1]) # error_msg += 'Available files:\n {}'.format("\n".join(list)) # raise ValueError(error_msg) def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4())) handler_utils._mkdir_p(self.scratch) self.dfu = DataFileUtil(self.callback_url) self.ru = ReadsUtils(self.callback_url) self.uploader_utils = UploaderUtil(config) def import_sra_from_staging(self, params): ''' import_sra_from_staging: wrapper method for ReadsUtils.upload_reads required params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name sequencing_tech: sequencing technology name: output reads file name workspace_name: workspace name/ID of the object Optional Params: single_genome: whether the reads are from a single genome or a metagenome.
insert_size_mean: mean (average) insert length insert_size_std_dev: standard deviation of insert lengths read_orientation_outward: whether reads in a pair point outward return: obj_ref: return object reference ''' log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_sra_from_staging_params(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_sra_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') log('Downloaded staging file to: {}'.format(scratch_sra_file_path)) fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params) import_sra_reads_params = params import_sra_reads_params.update(fastq_file_path) workspace_name_or_id = params.get('workspace_name') if str(workspace_name_or_id).isdigit(): import_sra_reads_params['wsid'] = int(workspace_name_or_id) else: import_sra_reads_params['wsname'] = str(workspace_name_or_id) log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format( json.dumps(import_sra_reads_params, indent=1))) returnVal = self.ru.upload_reads(import_sra_reads_params) """ Update the workspace object related meta-data for staged file """ self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), returnVal['obj_ref']) return returnVal def import_sra_from_web(self, params): ''' import_sra_from_web: wrapper method for GenomeFileUtil.genbank_to_genome required params: download_type: download type for web source fastq file ('Direct Download', 'FTP', 'DropBox', 'Google Drive') workspace_name: workspace name/ID of the object sra_urls_to_add: dict of SRA file URLs required params: file_url: SRA file URL sequencing_tech: sequencing technology name: output reads file name Optional Params: single_genome: whether the reads are from a single genome or a metagenome. 
insert_size_mean: mean (average) insert length insert_size_std_dev: standard deviation of insert lengths read_orientation_outward: whether reads in a pair point outward return: obj_ref: return object reference ''' log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_sra_from_web_params(params) download_type = params.get('download_type') workspace_name = params.get('workspace_name') obj_refs = [] uploaded_files = [] for sra_url_to_add in params.get('sra_urls_to_add'): download_web_file_params = { 'download_type': download_type, 'file_url': sra_url_to_add.get('file_url') } scratch_sra_file_path = self.dfu.download_web_file( download_web_file_params).get('copy_file_path') log('Downloaded web file to: {}'.format(scratch_sra_file_path)) fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add) import_sra_reads_params = sra_url_to_add import_sra_reads_params.update(fastq_file_path) workspace_name_or_id = workspace_name if str(workspace_name_or_id).isdigit(): import_sra_reads_params['wsid'] = int(workspace_name_or_id) else: import_sra_reads_params['wsname'] = str(workspace_name_or_id) log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format( json.dumps(import_sra_reads_params, indent=1))) obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref') obj_refs.append(obj_ref) uploaded_files.append(sra_url_to_add.get('file_url')) return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files} def validate_import_sra_from_staging_params(self, params): """ validate_import_genbank_from_staging_params: validates params passed to import_genbank_from_staging method """ # check for required parameters for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']: if p not in params: raise ValueError('"' + p + '" parameter is required, but missing') self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path')) def validate_import_sra_from_web_params(self, params): """ validate_import_genbank_from_staging_params: validates params passed to import_genbank_from_staging method """ # check for required parameters for p in ['download_type', 'workspace_name', 'sra_urls_to_add']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) if not isinstance(params.get('sra_urls_to_add'), list): raise ValueError('sra_urls_to_add is not type list as required') for sra_url_to_add in params.get('sra_urls_to_add'): for p in ['file_url', 'sequencing_tech', 'name']: if p not in sra_url_to_add: raise ValueError('"{}" parameter is required, but missing'.format(p)) def generate_report(self, obj_refs_list, params): """ generate_report: generate summary report obj_refs: generated workspace object references. (return of import_sra_from_staging/web) params: staging_file_subdir_path: subdirectory file path e.g. 
for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_name: workspace name/ID that reads will be stored to """ uuid_string = str(uuid.uuid4()) objects_created = list() objects_data = list() for obj_ref in obj_refs_list: get_objects_params = { 'object_refs': [obj_ref], 'ignore_errors': False } objects_data.append(self.dfu.get_objects(get_objects_params)) objects_created.append({'ref': obj_ref, 'description': 'Imported Reads'}) output_html_files = self.generate_html_report(objects_data, params, uuid_string) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 460, 'report_object_name': 'kb_sra_upload_report_' + uuid_string} kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def generate_html_report(self, reads_objs, params, uuid_string): """ _generate_html_report: generate html summary report """ log('Start generating html report') pprint(params) tmp_dir = os.path.join(self.scratch, uuid_string) handler_utils._mkdir_p(tmp_dir) result_file_path = os.path.join(tmp_dir, 'report.html') html_report = list() objects_content = '' for index, reads_obj in enumerate(reads_objs): idx = str(index) reads_data = reads_obj.get('data')[0].get('data') reads_info = reads_obj.get('data')[0].get('info') reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4]) reads_obj_name = str(reads_info[1]) with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/table_panel.html'), 'r') as object_content_file: report_template = object_content_file.read() report_template = report_template.replace('_NUM', str(idx)) report_template = report_template.replace('OBJECT_NAME', reads_obj_name) if index == 0: report_template = report_template.replace('panel-collapse collapse', 'panel-collapse collapse in') objects_content += report_template base_percentages = '' for key, val in reads_data.get('base_percentages').items(): base_percentages += '{}({}%) '.format(key, val) reads_overview_data = collections.OrderedDict() reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref) reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index] reads_overview_data['Date Uploaded'] = time.strftime("%c") reads_overview_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count')) reads_type = reads_info[2].lower() if 'single' in reads_type: reads_overview_data['Type'] = 'Single End' elif 'paired' in reads_type: reads_overview_data['Type'] = 'Paired End' else: reads_overview_data['Type'] = 'Unknown' reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown') reads_single_genome = str(reads_data.get('single_genome', 'Unknown')) if '0' in reads_single_genome: reads_overview_data['Single Genome'] = 'No' elif '1' in reads_single_genome: reads_overview_data['Single Genome'] = 'Yes' else: reads_overview_data['Single Genome'] = 'Unknown' insert_size_mean = params.get('insert_size_mean', 'Not Specified') if insert_size_mean is not None: reads_overview_data['Insert Size Mean'] = str(insert_size_mean) else: reads_overview_data['Insert Size Mean'] = 'Not Specified' insert_size_std_dev = 
params.get('insert_size_std_dev', 'Not Specified') if insert_size_std_dev is not None: reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev) else: reads_overview_data['Insert Size Std Dev'] = 'Not Specified' reads_outward_orientation = str(reads_data.get('read_orientation_outward', 'Unknown')) if '0' in reads_outward_orientation: reads_overview_data['Outward Read Orientation'] = 'No' elif '1' in reads_outward_orientation: reads_overview_data['Outward Read Orientation'] = 'Yes' else: reads_overview_data['Outward Read Orientation'] = 'Unknown' reads_stats_data = collections.OrderedDict() reads_stats_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count')) reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases')) reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean')) reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev')) dup_reads_percent = '{:.2f}'.format(float(reads_data.get('number_of_duplicates') * 100) / \ reads_data.get('read_count')) reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)' \ .format(str(reads_data.get('number_of_duplicates')), dup_reads_percent) reads_stats_data['Phred Type'] = str(reads_data.get('phred_type')) reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean')) reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(str(reads_data.get('qual_min')), str(reads_data.get('qual_max'))) reads_stats_data['GC Percentage'] = str(round(reads_data.get('gc_content') * 100, 2)) + '%' reads_stats_data['Base Percentages'] = base_percentages overview_content = '' for key, val in reads_overview_data.items(): overview_content += '<tr><td><b>{}</b></td>'.format(key) overview_content += '<td>{}</td>'.format(val) overview_content += '</tr>' stats_content = '' for key, val in reads_stats_data.items(): stats_content += '<tr><td><b>{}</b></td>'.format(key) stats_content += '<td>{}</td>'.format(val) stats_content += '</tr>' objects_content = objects_content.replace('###OVERVIEW_CONTENT###', overview_content) objects_content = objects_content.replace('###STATS_CONTENT###', stats_content) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/report_head.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('###TABLE_PANELS_CONTENT###', objects_content) result_file.write(report_template) result_file.close() shutil.copytree(os.path.join(os.path.dirname(__file__), 'report_template_sra/bootstrap-3.3.7'), os.path.join(tmp_dir, 'bootstrap-3.3.7')) shutil.copy(os.path.join(os.path.dirname(__file__), 'report_template_sra/jquery-3.2.1.min.js'), os.path.join(tmp_dir, 'jquery-3.2.1.min.js')) matched_files = [] for root, dirnames, filenames in os.walk(tmp_dir): for filename in fnmatch.filter(filenames, '*.gz'): matched_files.append(os.path.join(root, filename)) for gz_file in matched_files: print(('Removing ' + gz_file)) os.remove(gz_file) report_shock_id = self.dfu.file_to_shock({'file_path': tmp_dir, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Imported Assembly'}) return html_report
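The paired/single-end branch in _sra_to_fastq depends only on the directory layout that fastq-dump --split-3 -T writes: a '1' subdirectory appears under the accession directory for paired-end runs. A small test sketch of _check_fastq_dump_result, using hypothetical accession names and no KBase services, could look like this.

import os
import tempfile

def test_check_fastq_dump_result_layout():
    util = ImportSRAUtil.__new__(ImportSRAUtil)  # bypass __init__; no services needed here
    with tempfile.TemporaryDirectory() as tmp_dir:
        # single-end layout: <tmp_dir>/<sra_name>/fastq (no '1' subdirectory)
        os.makedirs(os.path.join(tmp_dir, 'SRR000001'))
        assert util._check_fastq_dump_result(tmp_dir, 'SRR000001') is False
        # paired-end layout: <tmp_dir>/<sra_name>/1/fastq and .../2/fastq
        os.makedirs(os.path.join(tmp_dir, 'SRR000002', '1'))
        assert util._check_fastq_dump_result(tmp_dir, 'SRR000002') is True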
class ImportGFFFastaUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url, service_ver='beta') self.uploader_utils = UploaderUtil(config) def import_gff_fasta_from_staging(self, params): """ import_gff_fasta_from_staging: wrapper method for GenomeFileUtil.fasta_gff_to_genome required params: fasta_file: fasta file from user's staging area gff_file: gff file from user's staging area genome_name: output genome object name workspace_name: workspace name that genome will be stored to file paths for both fasta and gff files must be subdirectory file path in staging area e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name optional params: scientific_name: proper name for species, key for taxonomy lookup.Default to 'unknown_taxon' source: Source Of The GenBank File. Default to 'User' taxon_wsname - where the reference taxons are. Default to 'ReferenceTaxons' taxon_reference - if defined, will try to link the Genome to the specified taxonomy object release: Release Or Version Of The Source Data genetic_code: Genetic Code For The Organism type: 'Reference', 'User upload', 'Representative' return: genome_ref: return object reference report_name: name of generated report (if any) report_ref: report reference (if any) """ logging.info( '--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n' + f'params:\n{json.dumps(params, indent=1)}') self.validate_import_gff_fasta_from_staging_params(params) for key in ('fasta_file', 'gff_file'): file_path = params[key] download_staging_file_params = { 'staging_file_subdir_path': file_path } dfu_returnVal = self.dfu.download_staging_file( download_staging_file_params) params[key] = {'path': dfu_returnVal['copy_file_path']} returnVal = self.gfu.fasta_gff_to_genome(params) """ Update the workspace object related meta-data for staged file """ # self.uploader_utils.update_staging_service(download_staging_file_params.get('staging_file_subdir_path'), # returnVal['genome_ref']) return returnVal def validate_import_gff_fasta_from_staging_params(self, params): """ validate_import_gff_fasta_from_staging_params: validates params passed to import_gff_fasta_from_staging method """ # check for required parameters for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError('"' + p + '" parameter is required, but missing') # for now must use workspace name, but no ws_id_to_name() function available if str(params["workspace_name"]).isdigit(): error_msg = '"{}" parameter is a workspace id and workspace name is required'.format( params["workspace_name"]) raise ValueError(error_msg)
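A hypothetical call into ImportGFFFastaUtil, showing the expected parameter shape (workspace_name must be a name, since the validator rejects numeric IDs). All values are placeholders, and the constructor may require additional config keys for UploaderUtil beyond the two shown.

# Hypothetical usage sketch; placeholder values only.
config = {
    'SDK_CALLBACK_URL': 'http://localhost:9999',  # placeholder
    'KB_AUTH_TOKEN': 'FAKE_TOKEN',                # placeholder; UploaderUtil may need more keys
}
gff_importer = ImportGFFFastaUtil(config)
params = {
    'fasta_file': 'subdir_1/my_organism.fasta',  # staging-area relative path (placeholder)
    'gff_file': 'subdir_1/my_organism.gff',      # staging-area relative path (placeholder)
    'genome_name': 'my_organism_genome',
    'workspace_name': 'my_workspace',            # must be a workspace name, not a numeric ID
    'scientific_name': 'unknown_taxon',          # optional; defaults documented above
}
result = gff_importer.import_gff_fasta_from_staging(params)
print(result['genome_ref'])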
class ImportAssemblyUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = os.path.join(config['scratch'], 'import_assembly_' + str(uuid.uuid4())) handler_utils._mkdir_p(self.scratch) self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.uploader_utils = UploaderUtil(config) self.max_contigs_for_report = 200 def import_fasta_as_assembly_from_staging(self, params): """ import_fasta_as_assembly_from_staging: wrapper method for AssemblyUtil.save_assembly_from_fasta required params: staging_file_subdir_path - subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name assembly_name - output Assembly file name workspace_name - the name of the workspace it gets saved to. return: obj_ref: return object reference """ logging.info( '--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n' f'params:\n{json.dumps(params, indent=1)}') self.validate_import_fasta_as_assembly_from_staging(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') file = {'path': scratch_file_path} import_assembly_params = params import_assembly_params['file'] = file ref = self.au.save_assembly_from_fasta(import_assembly_params) """ Update the workspace object related meta-data for staged file """ # self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref) returnVal = {'obj_ref': ref} return returnVal def validate_import_fasta_as_assembly_from_staging(self, params): """ validate_import_fasta_as_assembly_from_staging: validates params passed to import_fasta_as_assembly_from_staging method """ # check for required parameters for p in [ 'staging_file_subdir_path', 'workspace_name', 'assembly_name' ]: if p not in params: raise ValueError(f'"{p}" parameter is required, but missing') def generate_html_report(self, assembly_ref, assembly_object, params): """ generate_html_report: generate html summary report """ logging.info('start generating html report') html_report = list() assembly_data = assembly_object.get('data')[0].get('data') assembly_info = assembly_object.get('data')[0].get('info') tmp_dir = os.path.join(self.scratch, str(uuid.uuid4())) handler_utils._mkdir_p(tmp_dir) result_file_path = os.path.join(tmp_dir, 'report.html') assembly_name = str(assembly_info[1]) assembly_file = params.get('staging_file_subdir_path') dna_size = assembly_data.get('dna_size') num_contigs = assembly_data.get('num_contigs') assembly_overview_data = collections.OrderedDict() assembly_overview_data['Name'] = '{} ({})'.format( assembly_name, assembly_ref) assembly_overview_data['Uploaded File'] = assembly_file assembly_overview_data['Date Uploaded'] = time.strftime("%c") assembly_overview_data['DNA Size'] = dna_size assembly_overview_data['Number of Contigs'] = num_contigs overview_content = ['<br/><table>\n'] for key, val in assembly_overview_data.items(): overview_content.append(f'<tr><td><b>{key}</b></td>') overview_content.append(f'<td>{val}</td></tr>\n') overview_content.append('</table>') contig_data = assembly_data.get('contigs').values() contig_content = str([[str(e['contig_id']), e['length']] for e in contig_data]) with open(result_file_path, 'w') as result_file: with
open( os.path.join(os.path.dirname(__file__), 'report_template', 'report_template_assembly.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>*Overview_Content*</p>', ''.join(overview_content)) report_template = report_template.replace( '*CONTIG_DATA*', contig_content) result_file.write(report_template) result_file.close() report_shock_id = self.dfu.file_to_shock({ 'file_path': tmp_dir, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Imported Assembly' }) return html_report def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. (return of import_fasta_as_assembly_from_staging) params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_name: workspace name/ID that reads will be stored to """ object_data = self.dfu.get_objects({'object_refs': [obj_ref]}) report_params = { 'workspace_name': params.get('workspace_name'), 'objects_created': [{ 'ref': obj_ref, 'description': 'Imported Assembly' }], 'report_object_name': f'kb_upload_assembly_report_{uuid.uuid4()}' } num_contigs = object_data['data'][0]['data']['num_contigs'] if num_contigs > self.max_contigs_for_report: report_params['message'] = ( "The uploaded assembly has too many contigs to display " "here. Click on the object for a dedicated viewer") else: output_html_files = self.generate_html_report( obj_ref, object_data, params) report_params.update({ 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 375, }) kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
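The overview table in ImportAssemblyUtil.generate_html_report is produced by flattening an OrderedDict into <tr>/<td> rows and splicing the result into the report template. The standalone sketch below reproduces that rendering pattern with made-up assembly values.

import collections

def render_overview_rows(overview_data):
    # mirrors the overview_content construction in generate_html_report
    rows = ['<br/><table>\n']
    for key, val in overview_data.items():
        rows.append(f'<tr><td><b>{key}</b></td>')
        rows.append(f'<td>{val}</td></tr>\n')
    rows.append('</table>')
    return ''.join(rows)

demo = collections.OrderedDict([
    ('Name', 'my_assembly (12345/6/1)'),  # placeholder object name and reference
    ('DNA Size', 4641652),                # placeholder value
    ('Number of Contigs', 1),             # placeholder value
])
html = render_overview_rows(demo)
assert html.startswith('<br/><table>') and html.endswith('</table>')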
class ImportGenbankUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4())) handler_utils._mkdir_p(self.scratch) self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url, service_ver='beta') self.uploader_utils = UploaderUtil(config) def import_genbank_from_staging(self, params): ''' import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome required params: staging_file_subdir_path - subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name genome_name - becomes the name of the object workspace_name - the name of the workspace it gets saved to. source - Source of the file typically something like RefSeq or Ensembl optional params: release - Release or version number of the data per example Ensembl has numbered releases of all their data: Release 31 generate_ids_if_needed - If field used for feature id is not there, generate ids (default behavior is raising an exception) genetic_code - Genetic code of organism. Overwrites determined GC from taxon object type - Reference, Representative or User upload return: genome_ref: return object reference ''' logging.info( '--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' + f'params:\n{json.dumps(params, indent=1)}') self.validate_import_genbank_from_staging_params(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') file = {'path': scratch_file_path} import_genbank_params = params import_genbank_params['file'] = file del import_genbank_params['staging_file_subdir_path'] returnVal = self.gfu.genbank_to_genome(import_genbank_params) """ Update the workspace object related meta-data for staged file """ #self.uploader_utils.update_staging_service( # download_staging_file_params.get('staging_file_subdir_path'), # returnVal['genome_ref']) return returnVal def validate_import_genbank_from_staging_params(self, params): """ validate_import_genbank_from_staging_params: validates params passed to import_genbank_from_staging method """ # check for required parameters for p in [ 'staging_file_subdir_path', 'genome_name', 'workspace_name', 'source' ]: if p not in params: raise ValueError('"' + p + '" parameter is required, but missing') def generate_html_report(self, genome_ref, params): """ _generate_html_report: generate html summary report """ logging.info('start generating html report') genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]}) html_report = list() tmp_dir = os.path.join(self.scratch, str(uuid.uuid4())) handler_utils._mkdir_p(tmp_dir) result_file_path = os.path.join(tmp_dir, 'report.html') genome_name = str(genome_obj.get('data')[0].get('info')[1]) genome_file = params.get('staging_file_subdir_path') genome_data = genome_obj.get('data')[0].get('data') genome_info = genome_obj.get('data')[0].get('info') source = genome_info[10].get('Source') num_contigs = genome_info[10].get('Number contigs') size = genome_info[10].get('Size') gc_content = genome_info[10].get('GC content') warnings = genome_data.get('warnings', []) feature_counts = sorted( list(genome_data.get('feature_counts', {}).items())) 
genome_overview_data = collections.OrderedDict() genome_overview_data['Name'] = '{} ({})'.format( genome_name, genome_ref) #genome_overview_data['Uploaded File'] = genome_file genome_overview_data['Date Uploaded'] = time.strftime("%c") genome_overview_data['Source'] = source genome_overview_data['Number of Contigs'] = num_contigs genome_overview_data['Size'] = size genome_overview_data['GC Content'] = gc_content genome_overview_data['Warnings'] = "\n".join(warnings) genome_overview_data.update(feature_counts) overview_content = '' overview_content += '<br/><table>\n' for key, val in genome_overview_data.items(): overview_content += '<tr><td><b>{}</b></td>'.format(key) overview_content += '<td>{}</td>'.format(val) overview_content += '</tr>\n' overview_content += '</table>' feature_content = str( [[str(k), v] for k, v in list(genome_data.get('feature_counts', {}).items()) if k != 'gene']) contig_content = str( [[str(c), l] for c, l in zip(genome_data.get('contig_ids', []), genome_data.get('contig_lengths', []))]) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template', 'report_template_genome.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', overview_content) report_template = report_template.replace( '*FEATURE_DATA*', feature_content) report_template = report_template.replace( '*CONTIG_DATA*', contig_content) result_file.write(report_template) result_file.close() report_shock_id = self.dfu.file_to_shock({ 'file_path': tmp_dir, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for imported Genome' }) return html_report def generate_report(self, genome_ref, params): """ :param genome_ref: Return Val from GenomeFileUtil for Uploaded genome Need to get report warnings and message from it. :return: """ uuid_string = str(uuid.uuid4()) objects_created = [{ 'ref': genome_ref, 'description': 'Imported Genome' }] output_html_files = self.generate_html_report(genome_ref, params) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 300, 'report_object_name': 'kb_genome_upload_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
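The *FEATURE_DATA* and *CONTIG_DATA* slots in the Genome report template are filled with stringified lists of pairs: feature counts minus the aggregate 'gene' entry, and contig ids zipped with their lengths. A standalone sketch of that construction on an illustrative genome_data dict follows.

# genome_data here is illustrative only; real values come from the saved Genome object.
genome_data = {
    'feature_counts': {'gene': 4300, 'CDS': 4200, 'tRNA': 86, 'rRNA': 22},
    'contig_ids': ['NC_000913.3'],
    'contig_lengths': [4641652],
}
feature_content = str(
    [[str(k), v] for k, v in list(genome_data.get('feature_counts', {}).items())
     if k != 'gene'])
contig_content = str(
    [[str(c), l] for c, l in zip(genome_data.get('contig_ids', []),
                                 genome_data.get('contig_lengths', []))])
print(feature_content)  # e.g. [['CDS', 4200], ['tRNA', 86], ['rRNA', 22]]
print(contig_content)   # e.g. [['NC_000913.3', 4641652]]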