class MatrixUtil:
    """Utilities for importing, exporting, searching, filtering and
    standardizing KBaseMatrices objects."""

    def _validate_import_matrix_from_excel_params(self, params):
        """Validate params passed to import_matrix_from_excel.

        Returns a tuple of
        (obj_type, file_path, workspace_name, matrix_name, refs, scale).

        Raises ValueError when a required parameter is missing, the object
        type or scale is unknown, or no input file source is supplied.
        """
        logging.info('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        # resolve the input file: direct path, shock node, or staging area
        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        # collect every *_ref parameter (attribute mappings, genome, etc.)
        refs = {k: v for k, v in params.items() if "_ref" in k}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs, scale)

    def _upload_to_shock(self, file_path):
        """Upload the target file to shock (zipped) via DataFileUtil and
        return the shock id."""
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    @staticmethod
    def _mkdir_p(path):
        """Create a directory (and any missing parents) for the given path.

        A falsy path is a no-op; an already-existing directory is not an
        error (same semantics as the classic errno.EEXIST check).
        """
        if not path:
            return
        # exist_ok=True keeps the original behavior: succeed silently when
        # the directory exists, raise when the path exists as a non-directory.
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def _find_between(s, start, end):
        """Return the substring of `s` between regex patterns `start` and
        `end` (group 1 of the first match).

        Raises AttributeError if the pattern does not match (re.search
        returns None).
        """
        return re.search('{}(.*){}'.format(start, end), s).group(1)

    @staticmethod
    def _write_mapping_sheet(file_path, sheet_name, mapping, index):
        """Append `mapping` as a new sheet to an existing xlsx workbook.

        index: two-element list used as the column headers,
               e.g. ['col_name', 'instance_name'].
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            # NOTE(review): assigning writer.book to append to an existing
            # workbook relies on older pandas/openpyxl behavior -- confirm
            # against the pinned pandas version before upgrading.
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """Generate a summary KBaseReport for an imported matrix and return
        {'report_name': ..., 'report_ref': ...}."""
        report_params = {
            'message': '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name': workspace_name,
            'report_object_name': 'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _process_mapping_sheet(file_path, sheet_name):
        """Read a two-column mapping sheet into a dict.

        Returns an empty dict when the sheet is missing (XLRDError from the
        xlrd engine).
        """
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str')
        except XLRDError:
            return dict()
        else:
            # first column -> key, second column -> value
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_attribute_mapping_sheet(self, file_path, sheet_name,
                                         matrix_name, workspace_id):
        """Convert an attribute-mapping sheet into an AttributeMapping object.

        Returns the new object reference, or '' when the sheet is missing.
        """
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = f'{matrix_name}_{sheet_name}'
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            # re-export just this sheet so AttributesUtil can import it
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)

            import_attribute_mapping_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.attr_util.file_to_attribute_mapping(
                import_attribute_mapping_params)

            return ref.get('attribute_mapping_ref')

    @staticmethod
    def _file_to_df(file_path):
        """Parse an excel/tsv/csv file into a DataFrame.

        Tries the 'data' sheet first, then the first excel sheet, then a
        delimiter-sniffed csv/tsv read. Index and columns are coerced to str,
        and NaN cells are replaced with None so they serialize as nulls in
        the KBase object.

        Raises ValueError when the file cannot be parsed at all.
        """
        logging.info('start parsing file content to data frame')

        try:
            df = pd.read_excel(file_path, sheet_name='data', index_col=0)
        except XLRDError:
            try:
                df = pd.read_excel(file_path, index_col=0)
                logging.warning(
                    'WARNING: A sheet named "data" was not found in the attached file,'
                    ' proceeding with the first sheet as the data sheet.')
            except XLRDError:
                try:
                    # sep=None makes pandas sniff the delimiter; grab it from
                    # the parser engine so the real read can use a fixed sep.
                    # NOTE(review): _engine is private pandas API -- fragile
                    # across pandas upgrades.
                    reader = pd.read_csv(file_path, sep=None, iterator=True)
                    inferred_sep = reader._engine.data.dialect.delimiter
                    df = pd.read_csv(file_path, sep=inferred_sep, index_col=0)
                except Exception:
                    raise ValueError(
                        'Cannot parse file. Please provide valid tsv, excel or csv file'
                    )

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        # fill NA with "None" so that they are properly represented as nulls
        # in the KBase Object
        df = df.where((pd.notnull(df)), None)

        return df

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        """Build the full matrix object data dict from the input file.

        Merges the parsed value matrix, row/col attribute mappings, and the
        'metadata' sheet into a single dict seeded with `refs`.
        """
        logging.info('Start reading and converting excel file data')
        # copy so the caller's refs dict is not mutated by the updates below
        data = dict(refs)

        df = self._file_to_df(file_path)

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        data.update({'data': matrix_data})
        data.update(
            self._get_axis_attributes('col', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))
        data.update(
            self._get_axis_attributes('row', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data['attributes'] = {}
        data['search_attributes'] = []
        for k, v in metadata.items():
            k = k.strip()
            v = v.strip()
            if k in TYPE_ATTRIBUTES:
                # recognized typed attribute -> top-level field
                data[k] = v
            else:
                data['attributes'][k] = v
                data['search_attributes'].append(" | ".join((k, v)))

        return data

    def _get_axis_attributes(self, axis, matrix_data, refs, file_path,
                             matrix_name, workspace_id):
        """Get the row/col_attributemapping and mapping of ids, validating as
        needed.

        axis is 'row' or 'col'. Parameter-specified mapping refs take
        precedence over tabs in the excel file.
        """
        # Parameter specified mappings should take precedence over tabs in
        # excel so only process if attributemapping_ref is missing:
        attr_data = {}

        if refs.get(f'{axis}_attributemapping_ref'):
            attributemapping_ref = refs[f'{axis}_attributemapping_ref']
        else:
            attributemapping_ref = self._process_attribute_mapping_sheet(
                file_path, f'{axis}_attribute_mapping', matrix_name,
                workspace_id)

        if attributemapping_ref:
            attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref

        # col/row_mappings may not be supplied
        id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping')
        if id_mapping:
            attr_data[f'{axis}_mapping'] = id_mapping
        # if no mapping, axis ids must match the attribute mapping
        elif attributemapping_ref:
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            axis_ids = matrix_data[f'{axis}_ids']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a "
                    f"{name} mapping tab.")
            else:
                # just gen the IDs in this matrix
                attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return attr_data

    @staticmethod
    def _build_header_str(attribute_names):  # not going to be used
        """Build the <tr> header row HTML for the search report table.

        Columns: 'Feature ID' plus one column per attribute name, with
        widths split evenly across the attributes.
        """
        header_str = ''
        width = 100.0 / len(attribute_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for attribute_name in attribute_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(attribute_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, attributemapping_data, row_ids):
        # not going to be used
        """Build the (header_str, table_str) HTML fragments for the search
        report from a row mapping and its AttributeMapping data."""
        logging.info('Start building html replacement')

        attribute_names = [
            attributes.get('attribute')
            for attributes in attributemapping_data.get('attributes')
        ]

        header_str = self._build_header_str(attribute_names)

        table_str = ''

        instances = attributemapping_data.get('instances')

        for feature_id, attribute_id in row_mapping.items():
            if feature_id in row_ids:
                feature_instances = instances.get(attribute_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_instance in feature_instances:
                    table_str += '<td>{}</td>'.format(feature_instance)

                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str, table_str):
        # generate search html report
        """Render the search template with the supplied header/table HTML,
        upload the result to shock, and return the html_links entry list."""
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'kbase_icon.png'), output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'search_icon.png'), output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                # placeholders in the template are literal '//HEADER_STR' and
                # '//TABLE_STR' tokens
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        """Create the KBaseReport for the search app and return
        {'report_name': ..., 'report_ref': ...}."""
        logging.info('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _filter_value_data(value_data, remove_ids, dimension):
        """Filters a value matrix based on column or row ids.

        dimension: 'row' or 'col'. Ids are dropped both verbatim and in their
        space->underscore normalized form; missing ids are ignored.

        Raises ValueError for any other dimension.
        """

        def _norm_id(_id):
            return _id.replace(" ", "_")

        val_df = pd.DataFrame(value_data['values'],
                              index=value_data['row_ids'],
                              columns=value_data['col_ids'],
                              dtype='object')

        if dimension == 'row':
            filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=0,
                                           errors='ignore')
        elif dimension == 'col':
            filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=1,
                                           errors='ignore')
        else:
            raise ValueError('Unexpected dimension: {}'.format(dimension))

        filtered_value_data = {
            "values": filtered_df.values.tolist(),
            "col_ids": list(filtered_df.columns),
            "row_ids": list(filtered_df.index),
        }

        return filtered_value_data

    def _standardize_df(self, df, with_mean=True, with_std=True):
        """Return a standardized (centered/scaled) copy of `df`.

        NaNs are treated as 0. The input DataFrame is not modified.
        """
        logging.info("Standardizing matrix data")

        # non-mutating fill: do not alter the caller's DataFrame
        df = df.fillna(0)

        x_train = df.values

        scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                              with_std=with_std).fit(x_train)

        standardized_values = scaler.transform(x_train)

        standardize_df = pd.DataFrame(index=df.index,
                                      columns=df.columns,
                                      data=standardized_values)

        return standardize_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        # e.g. 'KBaseMatrices.ExpressionMatrix-1.1' -> 'ExpressionMatrix'
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]

    def standardize_matrix(self, params):
        """Standardize a matrix object and save the result as a new object.

        params:
            input_matrix_ref: reference of the matrix to standardize
            workspace_name: destination workspace (name or numeric id)
            new_matrix_name: optional output name (defaults to the input
                             name plus a timestamp)
            with_mean / with_std: passed through to StandardScaler
        """
        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        new_matrix_name = params.get('new_matrix_name')
        with_mean = params.get('with_mean', 1)
        with_std = params.get('with_std', 1)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [input_matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        input_matrix_name = input_matrix_info[1]
        input_matrix_data = input_matrix_obj['data']

        if not new_matrix_name:
            current_time = time.localtime()
            new_matrix_name = input_matrix_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_matrix_ref
        }).get('data_matrix')
        df = pd.read_json(data_matrix)

        standardize_df = self._standardize_df(df, with_mean, with_std)

        new_matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': standardize_df.values.tolist()
        }

        input_matrix_data['data'] = new_matrix_data

        logging.info("Saving new standardized matrix object")
        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": input_matrix_info[2],
                "data": input_matrix_data,
                "name": new_matrix_name
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_matrix_obj_ref,
            'description': 'Standardized Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def filter_matrix(self, params):  # not going to be used
        """
        filter_matrix: create sub-matrix based on input feature_ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        feature_ids: string of feature ids that result matrix contains
        filtered_matrix_name: name of newly created filtered matrix object
        """
        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        remove_ids = params.get('remove_ids')
        dimension = params.get('dimension')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects(
            {"object_refs": [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        # extract the bare type name, e.g. 'KBaseMatrices.X-1.1' -> 'X'
        # (raw strings: these are regex patterns, not string escapes)
        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        remove_ids = [x.strip() for x in remove_ids.split(',')]
        filtered_value_data = self._filter_value_data(value_data, remove_ids,
                                                      dimension)

        # if the matrix has changed shape, update the mappings
        if len(filtered_value_data['row_ids']) < len(
                matrix_data['data']['row_ids']):
            if matrix_data.get('row_mapping'):
                matrix_data['row_mapping'] = {
                    k: matrix_data['row_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }
            if matrix_data.get('feature_mapping'):
                matrix_data['feature_mapping'] = {
                    k: matrix_data['feature_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }

        if len(filtered_value_data['col_ids']) < len(
                matrix_data['data']['col_ids']):
            if matrix_data.get('col_mapping'):
                matrix_data['col_mapping'] = {
                    k: matrix_data['col_mapping'][k]
                    for k in filtered_value_data['col_ids']
                }

        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(matrix_type),
            'obj_name': filtered_matrix_name,
            'data': matrix_data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def search_matrix(self, params):  # not going to be used
        """
        search_matrix: generate a HTML report that allows users to select
                       feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """
        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects(
            {"object_refs": [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_attributemapping_ref = matrix_data.get('row_attributemapping_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_attributemapping_ref):
            raise ValueError(
                'Matrix obejct is missing either row_mapping or row_attributemapping_ref'
            )

        attributemapping_data = self.dfu.get_objects(
            {"object_refs": [row_attributemapping_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     attributemapping_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix,
                  DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """
        (obj_type, file_path, workspace_name, matrix_name, refs,
         scale) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)
        data['scale'] = scale
        if params.get('description'):
            data['description'] = params['description']

        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_matrix(self, params):
        """
        export_matrix: univeral downloader for matrix data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: select the generics data to be retrieved from
                         e.g. for an given data type like below:
                         typedef structure {
                             FloatMatrix2D data;
                             condition_set_ref condition_set_ref;
                         } SomeGenericsMatrix;
                         and only data is needed
                         generics_module should be
                         {'data': 'FloatMatrix2D'}
        """
        logging.info('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.data_util.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'instance_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'instance_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            logging.warning('Missing key [data]')

        obj_data.update(obj_data.get('attributes', {}))  # flatten for printing
        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
class GenericsAPI: ''' Module Name: GenericsAPI Module Description: ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "1.0.8" GIT_URL = "[email protected]:Tianhao-Gu/GenericsAPI.git" GIT_COMMIT_HASH = "e5a7c9fc2952bf44ebf8ec76d92322f00b606b3e" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL'] self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN'] self.scratch = config['scratch'] self.attr_util = AttributesUtil(self.config) self.matrix_util = MatrixUtil(self.config) self.corr_util = CorrelationUtil(self.config) self.data_util = DataUtil(self.config) self.network_util = NetworkUtil(self.config) self.biom_util = BiomUtil(self.config) self.pca_util = PCAUtil(self.config) self.data_table_util = DataTableUtil(self.config) self.template_util = TemplateUtil(self.config) logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def fetch_data(self, ctx, params): """ fetch_data: fetch generics data as pandas dataframe for a generics data object :param params: instance of type "FetchDataParams" (Input of the fetch_data function obj_ref: generics object reference Optional arguments: generics_module: the generics data module to be retrieved from e.g. 
for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; generics_module should be {'data': 'FloatMatrix2D', 'condition_set_ref': 'condition_set_ref'}) -> structure: parameter "obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "generics_module" of mapping from String to String :returns: instance of type "FetchDataReturn" (Ouput of the fetch_data function data_matrix: a pandas dataframe in json format) -> structure: parameter "data_matrix" of String """ # ctx is the context object # return variables are: returnVal #BEGIN fetch_data returnVal = self.data_util.fetch_data(params) #END fetch_data # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method fetch_data return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_matrix(self, ctx, params): """ :param params: instance of type "ExportParams" (Input of the export_matrix function obj_ref: generics object reference Optional arguments: generics_module: select the generics data to be retrieved from e.g. for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; and only 'FloatMatrix2D' is needed generics_module should be {'data': FloatMatrix2D'}) -> structure: parameter "obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "generics_module" of mapping from String to String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: returnVal #BEGIN export_matrix returnVal = self.matrix_util.export_matrix(params) #END export_matrix # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method export_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def validate_data(self, ctx, params): """ validate_data: validate data :param params: instance of type "ValidateParams" (Input of the validate_data function obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1' data: data to be validated) -> structure: parameter "obj_type" of String, parameter "data" of mapping from String to String :returns: instance of type "ValidateOutput" -> structure: parameter "validated" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "failed_constraint" of mapping from String to String """ # ctx is the context object # return variables are: returnVal #BEGIN validate_data returnVal = self.data_util.validate_data(params) #END validate_data # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method validate_data return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def import_matrix_from_excel(self, ctx, params): """ import_matrix_from_excel: import matrix object from excel :param params: instance of type "ImportMatrixParams" (Input of the import_matrix_from_excel function obj_type: a type in KBaseMatrices input_shock_id: file shock id input_file_path: absolute file path input_staging_file_path: staging area file path matrix_name: matrix object name description: optional, a description of the matrix workspace_name: workspace name matrix object to be saved to optional: col_attributemapping_ref: column AttributeMapping reference row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference diff_expr_matrix_ref: DifferentialExpressionMatrix reference biochemistry_ref: (for ChemicalAbundanceMatrix) reads_set_ref: (raw data for AmpliconMatrix) sample_set_ref: SampleSet object reference) -> structure: parameter "obj_type" of 
String, parameter "input_shock_id" of String, parameter "input_file_path" of String, parameter "input_staging_file_path" of String, parameter "matrix_name" of String, parameter "amplicon_set_name" of String, parameter "scale" of String, parameter "description" of String, parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "genome_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "col_attributemapping_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "row_attributemapping_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "diff_expr_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "biochemistry_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "reads_set_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "sample_set_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "unit" of String, parameter "type" of String :returns: instance of type "ImportMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN import_matrix_from_excel returnVal = self.matrix_util.import_matrix_from_excel(params) #END import_matrix_from_excel # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method import_matrix_from_excel return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def import_matrix_from_biom(self, ctx, params): """ import_matrix_from_biom: import matrix object from BIOM file format :param params: instance of type "ImportOTUParams" -> structure: parameter "obj_type" of String, parameter "taxonomic_abundance_tsv" of String, parameter "taxonomic_fasta" of String, parameter "input_local_file" of String, parameter "matrix_name" of String, parameter "amplicon_set_name" of String, parameter "scale" of String, parameter "description" of String, parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "genome_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "col_attributemapping_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "row_attributemapping_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "diff_expr_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "biochemistry_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "reads_set_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "sample_set_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "metadata_keys" of list of String, parameter "extraction_kit" of String, parameter "amplicon_type" of String, parameter "target_gene_region" of String, parameter "forward_primer_sequence" of String, parameter "reverse_primer_sequence" of String, parameter "sequencing_platform" of String, parameter "sequencing_run" of String, parameter "sequencing_kit" of String, parameter "sequencing_quality_filter_cutoff" of String, parameter "clustering_cutoff" of Double, parameter "clustering_method" of String :returns: instance of type "ImportMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z style 
reference) """ # ctx is the context object # return variables are: returnVal #BEGIN import_matrix_from_biom returnVal = self.biom_util.import_matrix_from_biom(params) #END import_matrix_from_biom # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method import_matrix_from_biom return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def save_object(self, ctx, params): """ save_object: validate data constraints and save matrix object :param params: instance of type "SaveObjectParams" (Input of the import_matrix_from_excel function obj_type: saving object data type obj_name: saving object name data: data to be saved workspace_name: workspace name matrix object to be saved to) -> structure: parameter "obj_type" of String, parameter "obj_name" of String, parameter "data" of mapping from String to String, parameter "workspace_name" of type "workspace_name" (workspace name of the object) :returns: instance of type "SaveObjectOutput" -> structure: parameter "obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN save_object returnVal = self.data_util.save_object(params) #END save_object # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method save_object return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def search_matrix(self, ctx, params): """ search_matrix: generate a HTML report that allows users to select feature ids :param params: instance of type "MatrixSelectorParams" (Input of the search_matrix function matrix_obj_ref: object reference of a matrix workspace_name: workspace name objects to be saved to) -> structure: parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object) :returns: instance of type "MatrixSelectorOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN search_matrix returnVal = self.matrix_util.search_matrix(params) #END search_matrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method search_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def filter_matrix(self, ctx, params): """ filter_matrix: create sub-matrix based on input filter_ids :param params: instance of type "MatrixFilterParams" (Input of the filter_matrix function matrix_obj_ref: object reference of a matrix workspace_name: workspace name objects to be saved to filter_ids: string of column or row ids that result matrix contains filtered_matrix_name: name of newly created filtered matrix object) -> structure: parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "filtered_matrix_name" of String, parameter "remove_ids" of String, parameter "dimension" of String :returns: instance of type "MatrixFilterOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of 
String, parameter "matrix_obj_refs" of list of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN filter_matrix returnVal = self.matrix_util.filter_matrix(params) #END filter_matrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method filter_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def standardize_matrix(self, ctx, params): """ standardize_matrix: standardize a matrix :param params: instance of type "StandardizeMatrixParams" (Input of the standardize_matrix function input_matrix_ref: object reference of a matrix workspace_name: workspace name objects to be saved to with_mean: center data before scaling with_std: scale data to unit variance new_matrix_name: name of newly created matrix object) -> structure: parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "with_mean" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "with_std" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "dimension" of String, parameter "new_matrix_name" of String :returns: instance of type "StandardizeMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN standardize_matrix returnVal = self.matrix_util.standardize_matrix(params) #END standardize_matrix # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method standardize_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def transform_matrix(self, ctx, params): """ :param params: instance of type "TransformMatrixParams" -> structure: parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "workspace_id" of Long, parameter "new_matrix_name" of String, parameter "abundance_filtering_params" of mapping from String to String, parameter "standardization_params" of mapping from String to String, parameter "ratio_transformation_params" of mapping from String to String, parameter "perform_relative_abundance" of type "boolean" (A boolean - 0 for false, 1 for true.) :returns: instance of type "TransformMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN transform_matrix returnVal = self.matrix_util.transform_matrix(params) #END transform_matrix # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method transform_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def perform_rarefy(self, ctx, params): """ :param params: instance of type "RarefyMatrixParams" -> structure: parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_id" of Long, parameter "new_matrix_name" of String, parameter "seed_number" of Long, parameter "dimension" of String :returns: instance of type "RarefyMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN perform_rarefy returnVal = self.matrix_util.perform_rarefy(params) #END perform_rarefy # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method perform_rarefy return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def perform_variable_stats_matrix(self, ctx, params): """ :param params: instance of type "VariableStatsParams" -> structure: parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "attribute_mapping_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_id" of Long, parameter "dist_metric" of String, parameter "dimension" of String, parameter "grouping" of String, parameter "permutations" of Long, parameter "perform_anosim" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "perform_permanova" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "perform_permdisp" of type "boolean" (A boolean - 0 for false, 1 for true.) 
:returns: instance of type "VariableStatsOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN perform_variable_stats_matrix returnVal = self.matrix_util.perform_variable_stats_matrix(params) #END perform_variable_stats_matrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( 'Method perform_variable_stats_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def perform_mantel_test(self, ctx, params): """ :param params: instance of type "MantelTestParams" -> structure: parameter "input_matrix_refs" of list of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_id" of Long, parameter "dist_metric" of String, parameter "dimension" of String, parameter "correlation_method" of String, parameter "permutations" of Long, parameter "alternative_hypothesis" of String :returns: instance of type "MantelTestOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN perform_mantel_test returnVal = self.matrix_util.perform_mantel_test(params) #END perform_mantel_test # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method perform_mantel_test return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def file_to_attribute_mapping(self, ctx, params): """ :param params: instance of type "FileToAttributeMappingParams" (input_shock_id and input_file_path - alternative input params,) -> structure: parameter "input_shock_id" of String, parameter "input_file_path" of String, parameter "output_ws_id" of String, parameter "output_obj_name" of String :returns: instance of type "FileToAttributeMappingOutput" -> structure: parameter "attribute_mapping_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: result #BEGIN file_to_attribute_mapping logging.info( "Starting 'file_to_attribute_mapping' with params:{}".format( params)) self.attr_util.validate_params(params, ("output_ws_id", "output_obj_name"), ('input_shock_id', 'input_file_path')) result = self.attr_util.file_to_attribute_mapping(params) #END file_to_attribute_mapping # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method file_to_attribute_mapping return value ' + 'result is not type dict as required.') # return the results return [result] def file_to_fbamodel_attribute_mapping(self, ctx, params): """ :param params: instance of type "FileToAttributeMappingParams" (input_shock_id and input_file_path - alternative input params,) -> structure: parameter "input_shock_id" of String, parameter "input_file_path" of String, parameter "output_ws_id" of String, parameter "output_obj_name" of String :returns: instance of type "FileToAttributeMappingOutput" -> structure: parameter "attribute_mapping_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: result #BEGIN file_to_fbamodel_attribute_mapping logging.info( "Starting 'file_to_fbamodel_attribute_mapping' with params:{}". 
format(params)) self.attr_util.validate_params(params, ("output_ws_id", "output_obj_name"), ('input_shock_id', 'input_file_path')) params['import_fbamodel_attri_mapping'] = True result = self.attr_util.file_to_attribute_mapping(params) #END file_to_fbamodel_attribute_mapping # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError( 'Method file_to_fbamodel_attribute_mapping return value ' + 'result is not type dict as required.') # return the results return [result] def update_matrix_attribute_mapping(self, ctx, params): """ :param params: instance of type "UpdateMatrixAMParams" -> structure: parameter "staging_file_subdir_path" of String, parameter "dimension" of String, parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of String, parameter "output_am_obj_name" of String, parameter "output_matrix_obj_name" of String :returns: instance of type "UpdateMatrixAMOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "new_attribute_mapping_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN update_matrix_attribute_mapping logging.info( "Starting 'update_matrix_attribute_mapping' with params:{}".format( params)) self.attr_util.validate_params( params, ("staging_file_subdir_path", "dimension", "workspace_name", "output_am_obj_name", "input_matrix_ref", "output_matrix_obj_name")) returnVal = self.attr_util.update_matrix_attribute_mapping(params) #END update_matrix_attribute_mapping # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError( 'Method update_matrix_attribute_mapping return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def attribute_mapping_to_tsv_file(self, ctx, params): """ :param params: instance of type "AttributeMappingToTsvFileParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "destination_dir" of String :returns: instance of type "AttributeMappingToTsvFileOutput" -> structure: parameter "file_path" of String """ # ctx is the context object # return variables are: result #BEGIN attribute_mapping_to_tsv_file logging.info( "Starting 'attribute_mapping_to_tsv_file' with params:{}".format( params)) self.attr_util.validate_params(params, ("destination_dir", "input_ref")) am_id, result = self.attr_util.to_tsv(params) #END attribute_mapping_to_tsv_file # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError( 'Method attribute_mapping_to_tsv_file return value ' + 'result is not type dict as required.') # return the results return [result] def export_attribute_mapping_tsv(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_attribute_mapping_tsv logging.info( "Starting 'export_attribute_mapping_tsv' with params:{}".format( params)) self.attr_util.validate_params(params, ("input_ref", )) params['destination_dir'] = self.scratch am_id, files = self.attr_util.to_tsv(params) result = self.attr_util.export(files['file_path'], am_id, params['input_ref']) #END export_attribute_mapping_tsv # At some point might do deeper type checking... 
if not isinstance(result, dict): raise ValueError( 'Method export_attribute_mapping_tsv return value ' + 'result is not type dict as required.') # return the results return [result] def export_attribute_mapping_excel(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_attribute_mapping_excel logging.info( "Starting 'export_attribute_mapping_excel' with params:{}".format( params)) self.attr_util.validate_params(params, ("input_ref", )) params['destination_dir'] = self.scratch am_id, files = self.attr_util.to_excel(params) result = self.attr_util.export(files['file_path'], am_id, params['input_ref']) #END export_attribute_mapping_excel # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError( 'Method export_attribute_mapping_excel return value ' + 'result is not type dict as required.') # return the results return [result] def export_cluster_set_excel(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_cluster_set_excel logging.info( "Starting 'export_cluster_set_excel' with params:{}".format( params)) self.attr_util.validate_params(params, ("input_ref", )) params['destination_dir'] = self.scratch cs_id, files = self.attr_util.to_excel(params) result = self.attr_util.export(files['file_path'], cs_id, params['input_ref']) #END export_cluster_set_excel # At some point might do deeper type checking... 
if not isinstance(result, dict): raise ValueError('Method export_cluster_set_excel return value ' + 'result is not type dict as required.') # return the results return [result] def export_corr_matrix_excel(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_corr_matrix_excel logging.info( "Starting 'export_corr_matrix_excel' with params:{}".format( params)) result = self.corr_util.export_corr_matrix_excel(params) #END export_corr_matrix_excel # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method export_corr_matrix_excel return value ' + 'result is not type dict as required.') # return the results return [result] def export_pca_matrix_excel(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_pca_matrix_excel result = self.pca_util.export_pca_matrix_excel(params) #END export_pca_matrix_excel # At some point might do deeper type checking... 
if not isinstance(result, dict): raise ValueError('Method export_pca_matrix_excel return value ' + 'result is not type dict as required.') # return the results return [result] def export_amplicon_set_tsv(self, ctx, params): """ :param params: instance of type "ExportObjectParams" -> structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_amplicon_set_tsv result = self.biom_util.export_amplicon_set_tsv(params) #END export_amplicon_set_tsv # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method export_amplicon_set_tsv return value ' + 'result is not type dict as required.') # return the results return [result] def compute_correlation_matrix(self, ctx, params): """ compute_correlation_matrix: create sub-matrix based on input filter_ids :param params: instance of type "CompCorrParams" (Input of the compute_correlation_matrix function input_obj_ref: object reference of a matrix workspace_name: workspace name objects to be saved to corr_matrix_name: correlation matrix object name dimension: compute correlation on column or row, one of ['col', 'row'] method: correlation method, one of ['pearson', 'kendall', 'spearman'] plot_corr_matrix: plot correlation matrix in report, default False plot_scatter_matrix: plot scatter matrix in report, default False compute_significance: also compute Significance in addition to correlation matrix) -> structure: parameter "input_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "corr_matrix_name" of String, parameter "dimension" of String, parameter "method" of String, parameter "plot_corr_matrix" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "plot_scatter_matrix" of type "boolean" (A 
boolean - 0 for false, 1 for true.), parameter "compute_significance" of type "boolean" (A boolean - 0 for false, 1 for true.) :returns: instance of type "CompCorrOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "corr_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN compute_correlation_matrix returnVal = self.corr_util.compute_correlation_matrix(params) #END compute_correlation_matrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( 'Method compute_correlation_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def compute_correlation_across_matrices(self, ctx, params): """ compute_correlation_across_matrices: compute correlation matrix across matrices :param params: instance of type "CompCorrMetriceParams" (Input of the compute_correlation_across_matrices function matrix_ref_1: object reference of a matrix matrix_ref_2: object reference of a matrix workspace_name: workspace name objects to be saved to corr_matrix_name: correlation matrix object name dimension: compute correlation on column or row, one of ['col', 'row'] method: correlation method, one of ['pearson', 'kendall', 'spearman'] plot_corr_matrix: plot correlation matrix in report, default False compute_significance: also compute Significance in addition to correlation matrix) -> structure: parameter "matrix_ref_1" of type "obj_ref" (An X/Y/Z style reference), parameter "matrix_ref_2" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "corr_matrix_name" of String, parameter "dimension" of String, parameter "method" of String, parameter "plot_corr_matrix" of type "boolean" (A boolean - 0 for false, 1 for true.), parameter "compute_significance" of type "boolean" (A boolean - 0 
for false, 1 for true.), parameter "corr_threshold" of Double :returns: instance of type "CompCorrOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "corr_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN compute_correlation_across_matrices returnVal = self.corr_util.compute_correlation_across_matrices(params) #END compute_correlation_across_matrices # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( 'Method compute_correlation_across_matrices return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def build_network(self, ctx, params): """ build_network: filter correlation matrix and build network :param params: instance of type "BuildNetworkParams" (Input of the build_network function corr_matrix_ref: CorrelationMatrix object workspace_name: workspace name objects to be saved to network_obj_name: Network object name filter_on_threshold: Dictory holder that holds filter on thredshold params params in filter_on_threshold: coefficient_threshold: correlation coefficient threshold (select pairs with greater correlation coefficient)) -> structure: parameter "corr_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of type "workspace_name" (workspace name of the object), parameter "network_obj_name" of String, parameter "filter_on_threshold" of mapping from String to String :returns: instance of type "BuildNetworkOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "network_obj_ref" of type "obj_ref" (An X/Y/Z style reference) """ # ctx is the context object # return variables are: returnVal #BEGIN build_network returnVal = self.network_util.build_network(params) #END build_network # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method build_network return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def run_pca(self, ctx, params): """ run_pca: PCA analysis on matrix :param params: instance of type "PCAParams" (Input of the run_pca function input_obj_ref: object reference of a matrix workspace_name: the name of the workspace pca_matrix_name: name of PCA (KBaseExperiments.PCAMatrix) object dimension: compute PCA on column or row, one of ['col', 'row'] n_components - number of components (default 2) attribute_mapping_obj_ref - associated attribute_mapping_obj_ref scale_size_by - used for PCA plot to scale data size color_marker_by - used for PCA plot to group data) -> structure: parameter "input_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of String, parameter "pca_matrix_name" of String, parameter "dimension" of String, parameter "n_components" of Long, parameter "attribute_mapping_obj_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "scale_size_by" of mapping from String to String, parameter "color_marker_by" of mapping from String to String :returns: instance of type "PCAOutput" (Ouput of the run_pca function pca_ref: PCA object reference (as KBaseExperiments.PCAMatrix data type) report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport) -> structure: parameter "pca_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN run_pca returnVal = self.pca_util.run_pca(params) #END run_pca # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError('Method run_pca return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def view_matrix(self, ctx, params): """ view_matrix: generate a report for matrix viewer :param params: instance of type "ViewMatrixParams" -> structure: parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "workspace_name" of String, parameter "with_attribute_info" of type "boolean" (A boolean - 0 for false, 1 for true.) :returns: instance of type "ViewMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN view_matrix returnVal = self.data_table_util.view_matrix_as_table(params) #END view_matrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method view_matrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def build_chemical_abundance_template(self, ctx, params): """ :param params: instance of type "ChemAbunTempParams" -> structure: parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "sample_set_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "chemical_data_included" of mapping from String to Long, parameter "chemical_ids_included" of mapping from String to Long :returns: instance of type "ViewMatrixOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN build_chemical_abundance_template returnVal = self.template_util.build_chemical_abundance_template( params) #END build_chemical_abundance_template # At some point might do deeper type checking... 
if not isinstance(returnVal, dict): raise ValueError( 'Method build_chemical_abundance_template return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
class BiomUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        # Empty/None path is a silent no-op.
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # An already-existing directory is fine; re-raise anything else.
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        """
        _process_params: validate import_matrix_from_biom params, resolve the
        input file group (BIOM/TSV/FASTA) and stage files locally.

        NOTE(review): mutates `params` in place (normalised target_subfragment,
        taxon_calling_method, denoise/clustering settings) in addition to
        returning (biom_file, tsv_file, fasta_file, mode, metadata_keys).
        """
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_id', 'scale',
                'amplicon_type', 'sequencing_technology',
                'sequencing_instrument', 'target_gene', 'target_subfragment',
                'taxon_calling'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check sequencing_technology and sequencing_instrument matching
        sequencing_technology = params.get('sequencing_technology')
        sequencing_instrument = params.get('sequencing_instrument')
        if sequencing_technology not in SEQ_INSTRUMENTS_MAP:
            raise ValueError('Unexpected sequencing technology: {}'.format(
                sequencing_technology))
        expected_instruments = SEQ_INSTRUMENTS_MAP.get(sequencing_technology)
        if sequencing_instrument not in expected_instruments:
            raise ValueError(
                'Please select sequencing instrument among {} for {}'.format(
                    expected_instruments, sequencing_technology))

        # check target_gene and target_subfragment matching
        target_gene = params.get('target_gene')
        # de-duplicate user-selected subfragments before validating
        target_subfragment = list(set(params.get('target_subfragment')))
        params['target_subfragment'] = target_subfragment
        if target_gene not in TARGET_GENE_SUBFRAGMENT_MAP:
            raise ValueError('Unexpected target gene: {}'.format(target_gene))
        expected_subfragments = TARGET_GENE_SUBFRAGMENT_MAP.get(target_gene)
        if not set(target_subfragment) <= set(expected_subfragments):
            raise ValueError(
                'Please select target subfragments among {} for {}'.format(
                    expected_subfragments, target_gene))

        # check taxon_calling
        taxon_calling = params.get('taxon_calling')
        taxon_calling_method = list(
            set(taxon_calling.get('taxon_calling_method')))
        params['taxon_calling_method'] = taxon_calling_method

        if 'denoising' in taxon_calling_method:
            denoise_method = taxon_calling.get('denoise_method')
            sequence_error_cutoff = taxon_calling.get('sequence_error_cutoff')

            if not (denoise_method and sequence_error_cutoff):
                raise ValueError(
                    'Please provide denoise_method and sequence_error_cutoff')

            params['denoise_method'] = denoise_method
            params['sequence_error_cutoff'] = sequence_error_cutoff

        if 'clustering' in taxon_calling_method:
            clustering_method = taxon_calling.get('clustering_method')
            clustering_cutoff = taxon_calling.get('clustering_cutoff')

            if not (clustering_method and clustering_cutoff):
                raise ValueError(
                    'Please provide clustering_method and clustering_cutoff')

            params['clustering_method'] = clustering_method
            params['clustering_cutoff'] = clustering_cutoff

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS
        # NOTE(review): if DEFAULT_META_KEYS is a list, the `+=` below mutates
        # the shared module-level constant across calls — confirm and copy if so.

        input_local_file = params.get('input_local_file', False)

        # Resolve which file-group variant was supplied; files are downloaded
        # from the staging area unless input_local_file is set.
        if params.get('taxonomic_abundance_tsv') and params.get(
                'taxonomic_fasta'):
            tsv_file = params.get('taxonomic_abundance_tsv')
            fasta_file = params.get('taxonomic_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': tsv_file
                }).get('copy_file_path')
                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')

            metadata_keys_str = params.get('metadata_keys')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            if not input_local_file:
                biom_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': biom_file
                }).get('copy_file_path')
                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': tsv_file
                }).get('copy_file_path')
                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        else:
            raise ValueError('missing valide file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _validate_fasta_file(self, df, fasta_file):
        """
        _validate_fasta_file: every matrix row id (df.index) must appear as a
        sequence id in the FASTA file; extra FASTA records are tolerated.
        """
        logging.info('start validating FASTA file')
        try:
            # SeqIO.index gives a lazy, dict-like view keyed by record id.
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide valide FASTA file')

        matrix_ids = df.index
        file_ids = fastq_dict.keys()

        unmatched_ids = set(matrix_ids) - set(file_ids)

        if unmatched_ids:
            raise ValueError(
                'FASTA file does not have [{}] OTU id'.format(unmatched_ids))

    def _file_to_amplicon_data(self, biom_file, tsv_file, fasta_file, mode, refs, matrix_name,
                               workspace_id, scale, description, metadata_keys=None):
        """
        _file_to_amplicon_data: build the AmpliconMatrix data dict from either
        a BIOM file (mode 'biom*') or a TSV file (mode 'tsv*'), attaching
        row/col attribute mappings and the raw matrix values.
        """

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            # NOTE(review): reaches into biom.Table private fields
            # (_observation_metadata etc.) — verify against the installed
            # biom-format version.
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {'row_ids': table._observation_ids.tolist(),
                           'col_ids': table._sample_ids.tolist(),
                           'values': table.matrix_data.toarray().tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                # First pass with sep=None only to sniff the delimiter.
                # NOTE(review): reader._engine.data.dialect.delimiter is a
                # private pandas API — confirm it still exists in the pinned
                # pandas version.
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError('Cannot parse file. Please provide valide tsv file')
            else:
                self._validate_fasta_file(df, fasta_file)
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError('TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        # Split metadata columns off before the numeric cast.
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found some non-float values. Matrix contains only numeric values\n'
                    err_msg += 'Please list any non-numeric column names in  Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                df.index = df.index.astype('str')
                df.columns = df.columns.astype('str')
                matrix_data = {'row_ids': df.index.tolist(),
                               'col_ids': df.columns.tolist(),
                               'values': df.values.tolist()}

                logging.info('start building attribute mapping object')
                amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                                matrix_data, matrix_name, refs,
                                                                workspace_id,
                                                                metadata_df=metadata_df))
                amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                                matrix_data, matrix_name, refs,
                                                                workspace_id))

                amplicon_data['attributes'] = {}
        else:
            raise ValueError('error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [f'{k}|{v}' for k, v in amplicon_data['attributes'].items()]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs,
                              workspace_id, metadata_df=None):
        """
        get_attribute_mapping: build the {axis}_mapping (id -> id) and, where
        source data exists, create an AttributeMapping object and return its
        {axis}_attributemapping_ref.

        Priority of attribute sources: sample_set_ref (columns only) >
        pre-existing {axis}_attributemapping_ref > BIOM metadata >
        TSV metadata_df.
        """
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get('sample_set_ref') and axis == 'col':
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._sample_set_to_attribute_mapping(
                axis_ids, refs.get('sample_set_ref'), name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif refs.get(f'{axis}_attributemapping_ref'):
            # Matrix ids must be a subset of the existing mapping's instances.
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]}
            )['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id):
        """
        _meta_df_to_attribute_mapping: turn a metadata DataFrame into a saved
        AttributeMapping object (one instance per axis id).
        """
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}

        # Everything is stored as strings in the AttributeMapping.
        metadata_df = metadata_df.astype(str)
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in attribute_keys]

        if 'taxonomy' in attribute_keys:
            data['attributes'].append({'attribute': 'parsed_user_taxonomy', 'source': 'upload'})

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()
            if 'taxonomy' in attribute_keys:
                parsed_user_taxonomy = None
                taxonomy_index = attribute_keys.index('taxonomy')
                taxonomy_str = metadata_df.loc[axis_id].tolist()[taxonomy_index]
                parsed_user_taxonomy = 
self.taxon_util.process_taxonomic_str( taxonomy_str) data['instances'][axis_id].append(parsed_user_taxonomy) logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref, obj_name, ws_id): am_data = self.sampleservice_util.sample_set_to_attribute_mapping( sample_set_ref) unmatched_ids = set(axis_ids) - set(am_data['instances'].keys()) if unmatched_ids: name = "Column" raise ValueError( f"The following {name} IDs from the uploaded matrix do not match " f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}" f"\nPlease verify the input data or upload an excel file with a" f"{name} mapping tab.") logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": am_data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id): data = {'ontology_mapping_method': "BIOM file", 'instances': {}} sample_set = metadata[0:min(len(metadata), 25)] metadata_keys = sorted( set((k for m_dict in sample_set for k in m_dict))) data['attributes'] = [{ 'attribute': key, 'source': 'upload' } for key in metadata_keys] for inst, meta in zip(instances, metadata): data['instances'][inst] = [ str(meta[attr]) for attr in metadata_keys ] logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _generate_visualization_content(self, output_directory, heatmap_dir, data_df, top_heatmap_dir, top_percent, display_count): 
row_data_summary = data_df.T.describe().round(2).to_string() col_data_summary = data_df.describe().round(2).to_string() tab_def_content = '' tab_content = '' viewer_name = 'data_summary' tab_def_content += '''\n<div class="tab">\n''' tab_def_content += '''\n<button class="tablinks" ''' tab_def_content += '''onclick="openTab(event, '{}')"'''.format( viewer_name) tab_def_content += ''' id="defaultOpen"''' tab_def_content += '''>Matrix Statistics</button>\n''' tab_content += '''\n<div id="{}" class="tabcontent" style="overflow:auto">'''.format( viewer_name) tab_content += '''\n<h5>Amplicon Matrix Size: {} x {}</h5>'''.format( len(data_df.index), len(data_df.columns)) tab_content += '''\n<h5>Row Aggregating Statistics</h5>''' html = '''\n<pre class="tab">''' + str(row_data_summary).replace( "\n", "<br>") + "</pre>" tab_content += html tab_content += '''\n<br>''' tab_content += '''\n<hr style="height:2px;border-width:0;color:gray;background-color:gray">''' tab_content += '''\n<br>''' tab_content += '''\n<h5>Column Aggregating Statistics</h5>''' html = '''\n<pre class="tab">''' + str(col_data_summary).replace( "\n", "<br>") + "</pre>" tab_content += html tab_content += '\n</div>\n' if top_heatmap_dir: viewer_name = 'TopHeatmapViewer' tab_def_content += '''\n<button class="tablinks" ''' tab_def_content += '''onclick="openTab(event, '{}')"'''.format( viewer_name) tab_def_content += '''>Top {}% ({} Rows) Heatmap</button>\n'''.format( round(top_percent, 2), display_count) heatmap_report_files = os.listdir(top_heatmap_dir) heatmap_index_page = None for heatmap_report_file in heatmap_report_files: if heatmap_report_file.endswith('.html'): heatmap_index_page = heatmap_report_file shutil.copy2( os.path.join(top_heatmap_dir, heatmap_report_file), output_directory) if heatmap_index_page: tab_content += '''\n<div id="{}" class="tabcontent">'''.format( viewer_name) msg = 'Top {} percent of matrix sorted by sum of abundance values.'.format( round(top_percent, 2)) tab_content += '''<p 
style="color:red;" >{}</p>'''.format(msg) tab_content += '\n<iframe height="1300px" width="100%" ' tab_content += 'src="{}" '.format(heatmap_index_page) tab_content += 'style="border:none;"></iframe>' tab_content += '\n</div>\n' else: tab_content += '''\n<div id="{}" class="tabcontent">'''.format( viewer_name) tab_content += '''\n<p style="color:red;" >''' tab_content += '''Heatmap is too large to be displayed.</p>\n''' tab_content += '\n</div>\n' viewer_name = 'MatrixHeatmapViewer' tab_def_content += '''\n<button class="tablinks" ''' tab_def_content += '''onclick="openTab(event, '{}')"'''.format( viewer_name) tab_def_content += '''>Matrix Heatmap</button>\n''' heatmap_report_files = os.listdir(heatmap_dir) heatmap_index_page = None for heatmap_report_file in heatmap_report_files: if heatmap_report_file.endswith('.html'): heatmap_index_page = heatmap_report_file shutil.copy2(os.path.join(heatmap_dir, heatmap_report_file), output_directory) if heatmap_index_page: tab_content += '''\n<div id="{}" class="tabcontent">'''.format( viewer_name) tab_content += '\n<iframe height="1300px" width="100%" ' tab_content += 'src="{}" '.format(heatmap_index_page) tab_content += 'style="border:none;"></iframe>' tab_content += '\n</div>\n' else: tab_content += '''\n<div id="{}" class="tabcontent">'''.format( viewer_name) tab_content += '''\n<p style="color:red;" >''' tab_content += '''Heatmap is too large to be displayed.</p>\n''' tab_content += '\n</div>\n' tab_def_content += '\n</div>\n' return tab_def_content + tab_content def _generate_heatmap_html_report(self, data): logging.info('Start generating heatmap report page') data_df = pd.DataFrame(data['values'], index=data['row_ids'], columns=data['col_ids']) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) tsv_file_path = os.path.join( result_directory, 'heatmap_data_{}.tsv'.format(str(uuid.uuid4()))) data_df.to_csv(tsv_file_path) if data_df.index.size < 10000: heatmap_dir = 
self.report_util.build_heatmap_html({ 'tsv_file_path': tsv_file_path, 'cluster_data': True })['html_dir'] else: logging.info( 'Original matrix is too large. Skip clustering data in report.' ) heatmap_dir = self.report_util.build_heatmap_html({ 'tsv_file_path': tsv_file_path, 'cluster_data': False })['html_dir'] top_heatmap_dir = None top_percent = 100 display_count = 200 # roughly count for display items if len(data_df.index) > 1000: top_percent = min(display_count / data_df.index.size * 100, 100) top_heatmap_dir = self.report_util.build_heatmap_html({ 'tsv_file_path': tsv_file_path, 'sort_by_sum': True, 'top_percent': top_percent })['html_dir'] output_directory = os.path.join(self.scratch, str(uuid.uuid4())) logging.info( 'Start generating html report in {}'.format(output_directory)) html_report = list() self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'matrix_viewer_report.html') visualization_content = self._generate_visualization_content( output_directory, heatmap_dir, data_df, top_heatmap_dir, top_percent, display_count) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'templates', 'matrix_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Visualization_Content</p>', visualization_content) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Import Amplicon Matrix App' }) return html_report def _generate_report(self, matrix_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_id, data=None): """ _generate_report: generate summary report """ objects_created = [{ 'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix' }] 
if new_row_attr_ref: objects_created.append({ 'ref': new_row_attr_ref, 'description': 'Imported Amplicons(Row) Attribute Mapping' }) if new_col_attr_ref: objects_created.append({ 'ref': new_col_attr_ref, 'description': 'Imported Samples(Column) Attribute Mapping' }) if data: output_html_files = self._generate_heatmap_html_report(data) report_params = { 'message': '', 'objects_created': objects_created, 'workspace_id': workspace_id, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 1400, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } else: report_params = { 'message': '', 'objects_created': objects_created, 'workspace_id': workspace_id, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.report_util = kb_GenericsReport(self.callback_url) self.data_util = DataUtil(config) self.sampleservice_util = SampleServiceUtil(config) self.attr_util = AttributesUtil(config) self.matrix_util = MatrixUtil(config) self.taxon_util = TaxonUtil(config) self.matrix_types = [ x.split(".")[1].split('-')[0] for x in self.data_util.list_generic_types() ] self.taxon_wsname = config['taxon-workspace-name'] self.kbse = KBaseSearchEngine(config['search-url']) self.taxon_cache = dict() def fetch_sequence(self, matrix_ref): logging.info('start to fetch consensus sequence') input_matrix_obj = self.dfu.get_objects({'object_refs': [matrix_ref]})['data'][0] input_matrix_info = input_matrix_obj['info'] matrix_name = input_matrix_info[1] matrix_type = input_matrix_info[2] matrix_data = 
input_matrix_obj['data'] if 'KBaseMatrices.AmpliconMatrix' not in matrix_type: raise ValueError('Unexpected data type: {}'.format(matrix_type)) handle = matrix_data.get('sequencing_file_handle') if not handle: raise ValueError( 'Missing sequencing_file_handle from the matrix object') output_directory = os.path.join(self.scratch, str(uuid.uuid4())) logging.info('Start generating consensus sequence file in {}'.format( output_directory)) self._mkdir_p(output_directory) matrix_fasta_file = self.dfu.shock_to_file({ 'handle_id': handle, 'file_path': self.scratch }).get('file_path') try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(matrix_fasta_file, "fasta") except Exception: raise ValueError( 'Cannot parse file. Please provide valide FASTA file') row_ids = matrix_data['data']['row_ids'] fasta_file_path = os.path.join( output_directory, matrix_name + 'consensus_sequence.fasta') with open(fasta_file_path, 'w') as f: for row_id in row_ids: consensus_sequence = str(fastq_dict.get(row_id).seq) f.write('>' + str(row_id) + '\n') f.write(consensus_sequence + '\n') return fasta_file_path def import_matrix_from_biom(self, params): """ arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_id: workspace id matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_attributemapping_ref: column AttributeMapping reference row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params) workspace_id = params.get('workspace_id') matrix_name = params.get('matrix_name') obj_type = params.get('obj_type') scale = params.get('scale') description = params.get('description') refs = {k: v for k, v in params.items() if "_ref" in k} amplicon_data 
= self._file_to_amplicon_data(biom_file, tsv_file, fasta_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys) for key in [ 'amplicon_type', 'amplification', 'extraction', 'target_gene', 'target_subfragment', 'pcr_primers', 'library_kit', 'library_layout', 'library_screening_strategy', 'sequencing_center', 'sequencing_date', 'sequencing_technology', 'sequencing_instrument', 'sequencing_quality_filter_cutoff', 'read_length_cutoff', 'read_pairing', 'barcode_error_rate', 'chimera_detection_and_removal', 'taxon_calling_method', 'denoise_method', 'sequence_error_cutoff', 'clustering_method', 'clustering_cutoff', 'sample_set_ref', 'reads_set_ref' ]: if params.get(key): amplicon_data[key] = params[key] new_row_attr_ref = None if not params.get('row_attributemapping_ref'): new_row_attr_ref = amplicon_data.get('row_attributemapping_ref') new_col_attr_ref = None if not params.get('col_attributemapping_ref'): new_col_attr_ref = amplicon_data.get('col_attributemapping_ref') if fasta_file: logging.info( 'start saving consensus sequence file to shock: {}'.format( fasta_file)) handle_id = self.dfu.file_to_shock({ 'file_path': fasta_file, 'make_handle': True })['handle']['hid'] amplicon_data['sequencing_file_handle'] = handle_id logging.info('start saving Matrix object: {}'.format(matrix_name)) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_id': workspace_id })['obj_ref'] if params.get('sample_set_ref'): self.matrix_util._link_matrix_to_samples(matrix_obj_ref, amplicon_data, params['sample_set_ref']) returnVal = {'matrix_obj_ref': matrix_obj_ref} report_output = self._generate_report(matrix_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_id, data=amplicon_data['data']) returnVal.update(report_output) return returnVal
class BiomUtil: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _process_params(self, params): logging.info('start validating import_matrix_from_biom params') # check for required parameters for p in [ 'obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in self.matrix_types: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) scale = params.get('scale') if scale not in SCALE_TYPES: raise ValueError('Unknown scale type: {}'.format(scale)) biom_file = None tsv_file = None fasta_file = None metadata_keys = DEFAULT_META_KEYS if params.get('biom_tsv'): biom_tsv = params.get('biom_tsv') biom_file = biom_tsv.get('biom_file_biom_tsv') tsv_file = biom_tsv.get('tsv_file_biom_tsv') if not (biom_file and tsv_file): raise ValueError('missing BIOM or TSV file') biom_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': biom_file }).get('copy_file_path') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') mode = 'biom_tsv' elif params.get('biom_fasta'): biom_fasta = params.get('biom_fasta') biom_file = biom_fasta.get('biom_file_biom_fasta') fasta_file = biom_fasta.get('fasta_file_biom_fasta') if not (biom_file and fasta_file): raise ValueError('missing BIOM or FASTA file') biom_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': biom_file }).get('copy_file_path') fasta_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': fasta_file }).get('copy_file_path') mode = 'biom_fasta' elif params.get('tsv_fasta'): tsv_fasta = params.get('tsv_fasta') tsv_file = tsv_fasta.get('tsv_file_tsv_fasta') fasta_file = tsv_fasta.get('fasta_file_tsv_fasta') if 
not (tsv_file and fasta_file): raise ValueError('missing TSV or FASTA file') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') fasta_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': fasta_file }).get('copy_file_path') metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta') if metadata_keys_str: metadata_keys += [ x.strip() for x in metadata_keys_str.split(',') ] mode = 'tsv_fasta' elif params.get('tsv'): tsv = params.get('tsv') tsv_file = tsv.get('tsv_file_tsv') if not tsv_file: raise ValueError('missing TSV file') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') metadata_keys_str = tsv.get('metadata_keys_tsv') if metadata_keys_str: metadata_keys += [ x.strip() for x in metadata_keys_str.split(',') ] mode = 'tsv' else: raise ValueError('missing valide file group type in parameters') return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys))) def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False): if key in biom_metadata_dict: return {k.lower(): v for k, v in biom_metadata_dict.items()}.get(key) elif key in tsv_metadata_df: return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key) elif required: raise ValueError('missing necessary [{}] from file'.format(key)) else: return None def _search_taxon(self, scientific_name): """ logic borrowed from: GFU.GenomeInterface https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216 """ taxon_id = None search_params = { "object_types": ["taxon"], "match_filter": { "lookup_in_keys": { "scientific_name": { "value": scientific_name } }, "exclude_subobjects": 1 }, "access_filter": { "with_private": 0, "with_public": 1 }, "sorting_rules": [{ "is_object_property": 0, "property": "timestamp", "ascending": 0 }] } objects = self.kbse.search_objects(search_params)['objects'] if not objects: 
search_params['match_filter']['lookup_in_keys'] = { "aliases": { "value": scientific_name } } objects = self.kbse.search_objects(search_params)['objects'] if objects: taxon_id = objects[0].get('object_name') return taxon_id def _fetch_taxon_level(self, taxon_char): taxon_level_mapping = { 'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum', 'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus', 's': 'Species' } return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown') def _fetch_taxonomy(self, datarow): lineage = self._retrieve_value([], datarow, 'taxonomy') if isinstance(lineage, str): delimiter = csv.Sniffer().sniff(lineage).delimiter lineage = [x.strip() for x in lineage.split(delimiter)] taxonomy = {'lineage': lineage} for key in ['score', 'taxonomy_source', 'species_name']: val = self._retrieve_value([], datarow, key) if val: taxonomy[key] = val for item in lineage[::-1]: scientific_name = item.split('_')[-1] taxon_level_char = item.split('_')[0] if scientific_name: taxon_id = self._search_taxon(scientific_name) if taxon_id: taxon_ref = f"{self.taxon_wsname}/{taxon_id}" taxon_level = self._fetch_taxon_level(taxon_level_char) taxonomy.update({ 'taxon_ref': taxon_ref, 'taxon_id': taxon_id, 'scientific_name': scientific_name, 'taxon_level': taxon_level }) break return taxonomy def _retrieve_tsv_amplicon_set_data(self, tsv_file): amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. 
Please provide valide TSV file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start processing each row in TSV') for observation_id in df.index: taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished parsing TSV file') return amplicons def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file): amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") except Exception: raise ValueError( 'Cannot parse file. Please provide valide FASTA file') try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide TSV file') logging.info('start processing files') for observation_id in df.index: if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file): amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") except Exception: raise ValueError( 'Cannot parse file. 
Please provide valide FASTA file') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(observation_metadata[index]) amplicon = { 'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file): amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. 
Please provide valide tsv file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in df.index: raise ValueError('TSV file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref): logging.info('start parsing amplicon_set_data') amplicon_set_data = dict() if mode == 'biom_tsv': amplicons = self._retrieve_biom_tsv_amplicon_set_data( biom_file, tsv_file) elif mode == 'biom_fasta': amplicons = self._retrieve_biom_fasta_amplicon_set_data( biom_file, fasta_file) elif mode == 'tsv_fasta': amplicons = self._retrieve_tsv_fasta_amplicon_set_data( tsv_file, fasta_file) elif mode == 'tsv': amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file) else: raise ValueError( 'error parsing _file_to_amplicon_set_data, mode: {}'.format( mode)) amplicon_set_data.update({'amplicons': amplicons}) if 'reads_set_ref' in refs: amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref') if description: amplicon_set_data['description'] = description matrix_obj_ref_array = matrix_obj_ref.split('/') amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format( matrix_obj_ref_array[0], matrix_obj_ref_array[1]) return amplicon_set_data def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys=None): 
amplicon_data = refs if mode.startswith('biom'): logging.info('start parsing BIOM file for matrix data') table = biom.load_table(biom_file) observation_metadata = table._observation_metadata sample_metadata = table._sample_metadata matrix_data = { 'row_ids': table._observation_ids.tolist(), 'col_ids': table._sample_ids.tolist(), 'values': table.matrix_data.toarray().tolist() } logging.info('start building attribute mapping object') amplicon_data.update( self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data.update( self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} for k in ('create_date', 'generated_by'): val = getattr(table, k) if not val: continue if isinstance(val, bytes): amplicon_data['attributes'][k] = val.decode('utf-8') else: amplicon_data['attributes'][k] = str(val) elif mode.startswith('tsv'): observation_metadata = None sample_metadata = None try: logging.info('start parsing TSV file for matrix data') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide tsv file') else: metadata_df = None if metadata_keys: shared_metadata_keys = list( set(metadata_keys) & set(df.columns)) if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys: raise ValueError( 'TSV file does not include consensus_sequence') if shared_metadata_keys: metadata_df = df[shared_metadata_keys] df.drop(columns=shared_metadata_keys, inplace=True) try: df = df.astype(float) except ValueError: err_msg = 'Found some non-float values. 
Matrix contains only numeric values\n' err_msg += 'Please list any non-numeric column names in Metadata Keys field' raise ValueError(err_msg) df.fillna(0, inplace=True) matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist() } logging.info('start building attribute mapping object') amplicon_data.update( self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df)) amplicon_data.update( self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} else: raise ValueError( 'error parsing _file_to_amplicon_data, mode: {}'.format(mode)) amplicon_data.update({'data': matrix_data}) amplicon_data['search_attributes'] = [ f'{k}|{v}' for k, v in amplicon_data['attributes'].items() ] amplicon_data['scale'] = scale if description: amplicon_data['description'] = description return amplicon_data def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df=None): mapping_data = {} axis_ids = matrix_data[f'{axis}_ids'] if refs.get(f'{axis}_attributemapping_ref'): am_data = self.dfu.get_objects( {'object_refs': [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data'] unmatched_ids = set(axis_ids) - set(am_data['instances'].keys()) if unmatched_ids: name = "Column" if axis == 'col' else "Row" raise ValueError( f"The following {name} IDs from the uploaded matrix do not match " f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}" f"\nPlease verify the input data or upload an excel file with a" f"{name} mapping tab.") else: mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata: name = matrix_name + "_{}_attributes".format(axis) mapping_data[ f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping( axis_ids, metadata, name, workspace_id) # if coming from biom file, metadata and axis IDs are guaranteed to match 
mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata_df is not None: name = matrix_name + "_{}_attributes".format(axis) mapping_data[ f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping( axis_ids, metadata_df, name, workspace_id) mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} return mapping_data def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id): data = {'ontology_mapping_method': "TSV file", 'instances': {}} attribute_keys = metadata_df.columns.tolist() data['attributes'] = [{ 'attribute': key, 'source': 'upload' } for key in attribute_keys] for axis_id in axis_ids: data['instances'][axis_id] = metadata_df.loc[axis_id].tolist() logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id): data = {'ontology_mapping_method': "BIOM file", 'instances': {}} sample_set = metadata[0:min(len(metadata), 25)] metadata_keys = sorted( set((k for m_dict in sample_set for k in m_dict))) data['attributes'] = [{ 'attribute': key, 'source': 'upload' } for key in metadata_keys] for inst, meta in zip(instances, metadata): data['instances'][inst] = [ str(meta[attr]) for attr in metadata_keys ] logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name): """ _generate_report: generate summary report """ objects_created = [{ 'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix' }, { 'ref': amplicon_set_obj_ref, 
'description': 'Imported Amplicon Set' }] if new_row_attr_ref: objects_created.append({ 'ref': new_row_attr_ref, 'description': 'Imported Amplicons(Row) Attribute Mapping' }) if new_col_attr_ref: objects_created.append({ 'ref': new_col_attr_ref, 'description': 'Imported Samples(Column) Attribute Mapping' }) report_params = { 'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref): logging.info('writting amplicon set data frame to tsv file') amplicon_set_obj = self.dfu.get_objects( {'object_refs': [amplicon_set_ref]})['data'][0] amplicon_set_info = amplicon_set_obj['info'] amplicon_set_name = amplicon_set_info[1] file_path = os.path.join(result_dir, amplicon_set_name + ".tsv") amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True) return file_path def _amplicon_set_to_df(self, amplicon_set_ref): logging.info('converting amplicon set to data frame') am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref] })['data'][0]['data'] amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref') matrix_data = self.dfu.get_objects( {'object_refs': [amplicon_matrix_ref]})['data'][0]['data'] matrix_value_data = matrix_data.get('data') index = matrix_value_data.get('row_ids') columns = matrix_value_data.get('col_ids') values = matrix_value_data.get('values') df = pd.DataFrame(values, index=index, columns=columns) amplicons = am_set_data.get('amplicons') meta_index = list() meta_columns = [ 'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score', 'taxonomy_source', 'species_name', 'consensus_sequence' ] meta_values = list() for otu_id, amplicon in 
amplicons.items(): meta_index.append(otu_id) taxonomy_data = amplicon.get('taxonomy') taxonomy = taxonomy_data.get('lineage') taxon_id = taxonomy_data.get('taxon_id') taxon_ref = taxonomy_data.get('taxon_ref') taxon_level = taxonomy_data.get('taxon_level') score = taxonomy_data.get('score') taxonomy_source = taxonomy_data.get('taxonomy_source') species_name = taxonomy_data.get('species_name') consensus_sequence = amplicon.get('consensus_sequence') meta_values.append([ taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source, species_name, consensus_sequence ]) meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns) merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left', validate='one_to_one') return merged_df def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.data_util = DataUtil(config) self.attr_util = AttributesUtil(config) self.matrix_util = MatrixUtil(config) self.matrix_types = [ x.split(".")[1].split('-')[0] for x in self.data_util.list_generic_types() ] self.taxon_wsname = config['taxon-workspace-name'] self.kbse = KBaseSearchEngine(config['search-url']) def import_matrix_from_biom(self, params): """ arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_attributemapping_ref: column AttributeMapping reference row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params) workspace_name = params.get('workspace_name') matrix_name = params.get('matrix_name') 
amplicon_set_name = params.get('amplicon_set_name') obj_type = params.get('obj_type') scale = params.get('scale') description = params.get('description') refs = {k: v for k, v in params.items() if "_ref" in k} if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys) new_row_attr_ref = None if not params.get('row_attributemapping_ref'): new_row_attr_ref = amplicon_data.get('row_attributemapping_ref') new_col_attr_ref = None if not params.get('col_attributemapping_ref'): new_col_attr_ref = amplicon_data.get('col_attributemapping_ref') logging.info('start saving Matrix object: {}'.format(matrix_name)) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_name': workspace_id })['obj_ref'] amplicon_set_data = self._file_to_amplicon_set_data( biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref) logging.info( 'start saving AmpliconSet object: {}'.format(amplicon_set_name)) amplicon_set_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseExperiments.AmpliconSet', 'obj_name': amplicon_set_name, 'data': amplicon_set_data, 'workspace_name': workspace_id })['obj_ref'] logging.info( 'start resaving Matrix object with amplicon set: {}'.format( matrix_name)) amplicon_data['amplicon_set_ref'] = '{}/{}'.format( workspace_id, amplicon_set_name) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_name': workspace_id })['obj_ref'] returnVal = { 'matrix_obj_ref': matrix_obj_ref, 'amplicon_set_obj_ref': amplicon_set_obj_ref } report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name) 
returnVal.update(report_output) return returnVal def export_amplicon_set_tsv(self, params): """ export AmpliconSet as TSV """ logging.info('start exporting amplicon set object') amplicon_set_ref = params.get('input_ref') amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref) result_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_dir) self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref) package_details = self.dfu.package_for_download({ 'file_path': result_dir, 'ws_refs': [amplicon_set_ref] }) return {'shock_id': package_details['shock_id']}