Example No. 1
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.data_util = DataUtil(config)
     self.matrix_types = [x.split(".")[1].split('-')[0]
                          for x in self.data_util.list_generic_types()]
Example No. 2
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.data_util = DataUtil(config)
     self.attr_util = AttributesUtil(config)
     self.matrix_util = MatrixUtil(config)
     self.matrix_types = [x.split(".")[1].split('-')[0]
                          for x in self.data_util.list_generic_types()]
     self.taxon_wsname = config['taxon-workspace-name']
     self.kbse = KBaseSearchEngine(config['search-url'])
Example No. 3
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.srv_wiz_url = config['srv-wiz-url']
     self.scratch = config['scratch']
     self.dfu = DataFileUtil(self.callback_url)
     self.kbse = KBaseSearchEngine(config['search-url'])
     self.data_util = DataUtil(config)
     self.wsClient = workspaceService(self.ws_url, token=self.token)
     self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
     self.DEFAULT_UNIT_ID = "Custom:Unit"
     self.ONT_LABEL_DEL = " - "
     self.ONT_TERM_DEL = ":"
Example No. 4
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.data_util = DataUtil(config)
        self.dfu = DataFileUtil(self.callback_url)

        plt.switch_backend('agg')
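        # 'agg' is a non-interactive matplotlib backend, so figures can be rendered
        # to files without a display (e.g. inside a container)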
Example No. 5
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.attr_util = AttributesUtil(self.config)
        self.matrix_util = MatrixUtil(self.config)
        self.corr_util = CorrelationUtil(self.config)
        self.data_util = DataUtil(self.config)
        self.network_util = NetworkUtil(self.config)
        self.biom_util = BiomUtil(self.config)
        self.pca_util = PCAUtil(self.config)
        self.data_table_util = DataTableUtil(self.config)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass
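For orientation, here is a hedged sketch of the configuration dictionary these constructors read, using only keys that appear in the examples above; every value is a placeholder and the endpoint URLs are assumptions:

config = {
    'scratch': '/kb/module/work/tmp',                                    # writable scratch directory
    'workspace-url': 'https://kbase.example.org/services/ws',            # placeholder endpoints
    'search-url': 'https://kbase.example.org/services/searchapi',
    'shock-url': 'https://kbase.example.org/services/shock-api',
    'srv-wiz-url': 'https://kbase.example.org/services/service_wizard',
    'taxon-workspace-name': 'ReferenceTaxons',                           # assumed workspace name
}
# In Example No. 5 the constructor itself copies SDK_CALLBACK_URL and
# KB_AUTH_TOKEN from the environment into this dict before wiring up the utils.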
Example No. 6
class MatrixUtil:
    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        logging.info('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply one of input_shock_id, input_file_path, "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs = {k: v for k, v in params.items() if "_ref" in k}
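        # gathers any *_ref parameters (e.g. row/col attribute mapping refs) to pass through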

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs, scale)

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)

    @staticmethod
    def _write_mapping_sheet(file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        # open the existing workbook in append mode so the new sheet is added to it
        # rather than overwriting the file (writer.book is read-only in newer pandas)
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
            df.to_excel(writer, sheet_name=sheet_name)

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        report_params = {
            'message':
            '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _process_mapping_sheet(file_path, sheet_name):
        """
        _process_mapping_sheet: process mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str')
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_attribute_mapping_sheet(self, file_path, sheet_name,
                                         matrix_name, workspace_id):
        """
        _process_attribute_mapping_sheet: process attribute_mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = f'{matrix_name}_{sheet_name}'
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_attribute_mapping_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.attr_util.file_to_attribute_mapping(
                import_attribute_mapping_params)

            return ref.get('attribute_mapping_ref')

    @staticmethod
    def _file_to_df(file_path):
        logging.info('start parsing file content to data frame')

        try:
            df = pd.read_excel(file_path, sheet_name='data', index_col=0)

        except XLRDError:
            try:
                df = pd.read_excel(file_path, index_col=0)
                logging.warning(
                    'A sheet named "data" was not found in the attached file; '
                    'proceeding with the first sheet as the data sheet.')

            except XLRDError:

                try:
                    reader = pd.read_csv(file_path, sep=None, iterator=True)
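                    # NOTE: reads pandas' private parser internals to recover the sniffed delimiter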
                    inferred_sep = reader._engine.data.dialect.delimiter
                    df = pd.read_csv(file_path, sep=inferred_sep, index_col=0)
                except Exception:
                    raise ValueError(
                        'Cannot parse file. Please provide a valid TSV, Excel or CSV file'
                    )

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        # fill NA with "None" so that they are properly represented as nulls in the KBase Object
        df = df.where((pd.notnull(df)), None)

        return df

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        logging.info('Start reading and converting excel file data')
        data = refs

        df = self._file_to_df(file_path)

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        data.update({'data': matrix_data})
        data.update(
            self._get_axis_attributes('col', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))
        data.update(
            self._get_axis_attributes('row', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data['attributes'] = {}
        data['search_attributes'] = []
        for k, v in metadata.items():
            k = k.strip()
            v = v.strip()
            if k in TYPE_ATTRIBUTES:
                data[k] = v
            else:
                data['attributes'][k] = v
                data['search_attributes'].append(" | ".join((k, v)))

        return data

    def _get_axis_attributes(self, axis, matrix_data, refs, file_path,
                             matrix_name, workspace_id):
        """Get the row/col_attributemapping and mapping of ids, validating as needed"""
        # Parameter specified mappings should take precedence over tabs in excel so only process
        # if attributemapping_ref is missing:
        attr_data = {}

        if refs.get(f'{axis}_attributemapping_ref'):
            attributemapping_ref = refs[f'{axis}_attributemapping_ref']
        else:
            attributemapping_ref = self._process_attribute_mapping_sheet(
                file_path, f'{axis}_attribute_mapping', matrix_name,
                workspace_id)

        if attributemapping_ref:
            attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref

        # col/row_mappings may not be supplied
        id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping')
        if id_mapping:
            attr_data[f'{axis}_mapping'] = id_mapping
        # if no mapping, axis ids must match the attribute mapping
        elif attributemapping_ref:
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            axis_ids = matrix_data[f'{axis}_ids']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                # no explicit mapping supplied; generate an identity mapping for the matrix IDs
                attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return attr_data

    @staticmethod
    def _build_header_str(attribute_names):  #not going to be used

        header_str = ''
        width = 100.0 / len(attribute_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for attribute_name in attribute_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(attribute_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, attributemapping_data,
                        row_ids):  #not going to be used

        logging.info('Start building html replacement')

        attribute_names = [
            attributes.get('attribute')
            for attributes in attributemapping_data.get('attributes')
        ]

        header_str = self._build_header_str(attribute_names)

        table_str = ''

        instances = attributemapping_data.get('instances')

        for feature_id, attribute_id in row_mapping.items():
            if feature_id in row_ids:
                feature_instances = instances.get(attribute_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_instance in feature_instances:
                    table_str += '<td>{}</td>'.format(feature_instance)
                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str,
                                     table_str):  #generate search html report

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'kbase_icon.png'), output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'search_icon.png'), output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        logging.info('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name':
            'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _filter_value_data(value_data, remove_ids, dimension):
        """Filters a value matrix based on column or row ids"""
        def _norm_id(_id):
            return _id.replace(" ", "_")

        val_df = pd.DataFrame(value_data['values'],
                              index=value_data['row_ids'],
                              columns=value_data['col_ids'],
                              dtype='object')

        if dimension == 'row':
            filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=0,
                                           errors='ignore')
        elif dimension == 'col':
            filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=1,
                                           errors='ignore')
        else:
            raise ValueError('Unexpected dimension: {}'.format(dimension))

        filtered_value_data = {
            "values": filtered_df.values.tolist(),
            "col_ids": list(filtered_df.columns),
            "row_ids": list(filtered_df.index),
        }

        return filtered_value_data

    def _standardize_df(self, df, with_mean=True, with_std=True):

        logging.info("Standardizing matrix data")

        df.fillna(0, inplace=True)

        x_train = df.values

        scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                              with_std=with_std).fit(x_train)

        standardized_values = scaler.transform(x_train)

        standardize_df = pd.DataFrame(index=df.index,
                                      columns=df.columns,
                                      data=standardized_values)

        return standardize_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]

    def standardize_matrix(self, params):
        """
        standardize a matrix
        """

        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        new_matrix_name = params.get('new_matrix_name')
        with_mean = params.get('with_mean', 1)
        with_std = params.get('with_std', 1)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [input_matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        input_matrix_name = input_matrix_info[1]
        input_matrix_data = input_matrix_obj['data']

        if not new_matrix_name:
            current_time = time.localtime()
            new_matrix_name = input_matrix_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_matrix_ref
        }).get('data_matrix')
        df = pd.read_json(data_matrix)

        standardize_df = self._standardize_df(df, with_mean, with_std)

        new_matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': standardize_df.values.tolist()
        }

        input_matrix_data['data'] = new_matrix_data

        logging.info("Saving new standardized matrix object")
        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": input_matrix_info[2],
                "data": input_matrix_data,
                "name": new_matrix_name
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
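        # KBase object reference in workspace_id/object_id/version form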

        objects_created = [{
            'ref': new_matrix_obj_ref,
            'description': 'Standardized Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def filter_matrix(self, params):  #not going to be used
        """
        filter_matrix: create sub-matrix based on input feature_ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        feature_ids: string of feature ids that result matrix contains
        filtered_matrix_name: name of newly created filtered matrix object
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        remove_ids = params.get('remove_ids')
        dimension = params.get('dimension')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        remove_ids = [x.strip() for x in remove_ids.split(',')]
        filtered_value_data = self._filter_value_data(value_data, remove_ids,
                                                      dimension)

        # if the matrix has changed shape, update the mappings
        if len(filtered_value_data['row_ids']) < len(
                matrix_data['data']['row_ids']):
            if matrix_data.get('row_mapping'):
                matrix_data['row_mapping'] = {
                    k: matrix_data['row_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }
            if matrix_data.get('feature_mapping'):
                matrix_data['feature_mapping'] = {
                    k: matrix_data['feature_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }

        if len(filtered_value_data['col_ids']) < len(
                matrix_data['data']['col_ids']):
            if matrix_data.get('col_mapping'):
                matrix_data['col_mapping'] = {
                    k: matrix_data['col_mapping'][k]
                    for k in filtered_value_data['col_ids']
                }
        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(matrix_type),
            'obj_name':
            filtered_matrix_name,
            'data':
            matrix_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def search_matrix(self, params):  #not going to be used
        """
        search_matrix: generate a HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_attributemapping_ref = matrix_data.get('row_attributemapping_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_attributemapping_ref):
            raise ValueError(
                'Matrix object is missing either row_mapping or row_attributemapping_ref'
            )

        attributemapping_data = self.dfu.get_objects(
            {"object_refs": [row_attributemapping_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     attributemapping_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (obj_type, file_path, workspace_name, matrix_name, refs,
         scale) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)
        data['scale'] = scale
        if params.get('description'):
            data['description'] = params['description']

        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data objects

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: selects which generics data to retrieve,
                        e.g. for a given data type like the one below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        if only the data field is needed,
                        generics_module should be
                        {'data': 'FloatMatrix2D'}
        """
        logging.info('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.data_util.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'instance_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'instance_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            logging.warning('Missing key [data]')

        obj_data.update(obj_data.get('attributes', {}))  # flatten for printing
        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
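For orientation, a hedged usage sketch of MatrixUtil.import_matrix_from_excel and export_matrix, built only from the parameters named in their docstrings; all values below are placeholders and 'raw' is assumed to be a member of SCALE_TYPES:

matrix_util = MatrixUtil(config)  # config as sketched after Example No. 5
result = matrix_util.import_matrix_from_excel({
    'obj_type': 'ExpressionMatrix',             # one of the supported matrix types
    'matrix_name': 'my_expression_matrix',      # placeholder object name
    'workspace_name': 'my_workspace',           # placeholder workspace
    'scale': 'raw',                             # assumed member of SCALE_TYPES
    'input_file_path': '/path/to/matrix.xlsx',  # or input_shock_id / input_staging_file_path
})
export = matrix_util.export_matrix({'obj_ref': result['matrix_obj_ref']})
print(result['report_ref'], export['shock_id'])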
Example No. 7
class BiomUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_id', 'scale',
                'amplicon_type', 'sequencing_technology',
                'sequencing_instrument', 'target_gene', 'target_subfragment',
                'taxon_calling'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check sequencing_technology and sequencing_instrument matching
        sequencing_technology = params.get('sequencing_technology')
        sequencing_instrument = params.get('sequencing_instrument')
        if sequencing_technology not in SEQ_INSTRUMENTS_MAP:
            raise ValueError('Unexpected sequencing technology: {}'.format(
                sequencing_technology))
        expected_instruments = SEQ_INSTRUMENTS_MAP.get(sequencing_technology)
        if sequencing_instrument not in expected_instruments:
            raise ValueError(
                'Please select a sequencing instrument from {} for {}'.format(
                    expected_instruments, sequencing_technology))

        # check target_gene and target_subfragment matching
        target_gene = params.get('target_gene')
        target_subfragment = list(set(params.get('target_subfragment')))
        params['target_subfragment'] = target_subfragment

        if target_gene not in TARGET_GENE_SUBFRAGMENT_MAP:
            raise ValueError('Unexpected target gene: {}'.format(target_gene))
        expected_subfragments = TARGET_GENE_SUBFRAGMENT_MAP.get(target_gene)
        if not set(target_subfragment) <= set(expected_subfragments):
            raise ValueError(
                'Please select target subfragments from {} for {}'.format(
                    expected_subfragments, target_gene))

        # check taxon_calling
        taxon_calling = params.get('taxon_calling')
        taxon_calling_method = list(
            set(taxon_calling.get('taxon_calling_method')))
        params['taxon_calling_method'] = taxon_calling_method

        if 'denoising' in taxon_calling_method:
            denoise_method = taxon_calling.get('denoise_method')
            sequence_error_cutoff = taxon_calling.get('sequence_error_cutoff')

            if not (denoise_method and sequence_error_cutoff):
                raise ValueError(
                    'Please provide denoise_method and sequence_error_cutoff')

            params['denoise_method'] = denoise_method
            params['sequence_error_cutoff'] = sequence_error_cutoff

        if 'clustering' in taxon_calling_method:
            clustering_method = taxon_calling.get('clustering_method')
            clustering_cutoff = taxon_calling.get('clustering_cutoff')

            if not (clustering_method and clustering_cutoff):
                raise ValueError(
                    'Please provide clustering_method and clustering_cutoff')

            params['clustering_method'] = clustering_method
            params['clustering_cutoff'] = clustering_cutoff

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        input_local_file = params.get('input_local_file', False)

        if params.get('taxonomic_abundance_tsv') and params.get(
                'taxonomic_fasta'):
            tsv_file = params.get('taxonomic_abundance_tsv')
            fasta_file = params.get('taxonomic_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')

            metadata_keys_str = params.get('metadata_keys')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            if not input_local_file:
                biom_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    biom_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _validate_fasta_file(self, df, fasta_file):
        logging.info('start validating FASTA file')
        try:
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        matrix_ids = df.index
        file_ids = fastq_dict.keys()

        unmatched_ids = set(matrix_ids) - set(file_ids)

        if unmatched_ids:
            raise ValueError(
                'FASTA file is missing the following OTU ids: {}'.format(unmatched_ids))

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               fasta_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
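            # _observation_metadata and _sample_metadata are private biom.Table attributes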
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
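                # NOTE: reads pandas' private parser internals to recover the sniffed delimiter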
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide a valid TSV file')
            else:
                self._validate_fasta_file(df, fasta_file)
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found non-numeric values. The matrix must contain only numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                df.index = df.index.astype('str')
                df.columns = df.columns.astype('str')
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row",
                                           observation_metadata,
                                           matrix_data,
                                           matrix_name,
                                           refs,
                                           workspace_id,
                                           metadata_df=metadata_df))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get('sample_set_ref') and axis == 'col':
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._sample_set_to_attribute_mapping(
                    axis_ids, refs.get('sample_set_ref'), name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs':
                 [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        metadata_df = metadata_df.astype(str)
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        if 'taxonomy' in attribute_keys:
            data['attributes'].append({
                'attribute': 'parsed_user_taxonomy',
                'source': 'upload'
            })

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()
            if 'taxonomy' in attribute_keys:
                parsed_user_taxonomy = None
                taxonomy_index = attribute_keys.index('taxonomy')
                taxonomy_str = metadata_df.loc[axis_id].tolist(
                )[taxonomy_index]
                parsed_user_taxonomy = self.taxon_util.process_taxonomic_str(
                    taxonomy_str)
                data['instances'][axis_id].append(parsed_user_taxonomy)

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref,
                                         obj_name, ws_id):

        am_data = self.sampleservice_util.sample_set_to_attribute_mapping(
            sample_set_ref)

        unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
        if unmatched_ids:
            name = "Column"
            raise ValueError(
                f"The following {name} IDs from the uploaded matrix do not match "
                f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                f"\nPlease verify the input data or upload an excel file with a"
                f"{name} mapping tab.")

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": am_data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_visualization_content(self, output_directory, heatmap_dir,
                                        data_df, top_heatmap_dir, top_percent,
                                        display_count):

        row_data_summary = data_df.T.describe().round(2).to_string()
        col_data_summary = data_df.describe().round(2).to_string()

        tab_def_content = ''
        tab_content = ''

        viewer_name = 'data_summary'
        tab_def_content += '''\n<div class="tab">\n'''
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += ''' id="defaultOpen"'''
        tab_def_content += '''>Matrix Statistics</button>\n'''

        tab_content += '''\n<div id="{}" class="tabcontent" style="overflow:auto">'''.format(
            viewer_name)
        tab_content += '''\n<h5>Amplicon Matrix Size: {} x {}</h5>'''.format(
            len(data_df.index), len(data_df.columns))
        tab_content += '''\n<h5>Row Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(row_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '''\n<br>'''
        tab_content += '''\n<hr style="height:2px;border-width:0;color:gray;background-color:gray">'''
        tab_content += '''\n<br>'''
        tab_content += '''\n<h5>Column Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(col_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '\n</div>\n'

        if top_heatmap_dir:
            viewer_name = 'TopHeatmapViewer'
            tab_def_content += '''\n<button class="tablinks" '''
            tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
                viewer_name)
            tab_def_content += '''>Top {}% ({} Rows) Heatmap</button>\n'''.format(
                round(top_percent, 2), display_count)

            heatmap_report_files = os.listdir(top_heatmap_dir)

            heatmap_index_page = None
            for heatmap_report_file in heatmap_report_files:
                if heatmap_report_file.endswith('.html'):
                    heatmap_index_page = heatmap_report_file

                shutil.copy2(
                    os.path.join(top_heatmap_dir, heatmap_report_file),
                    output_directory)

            if heatmap_index_page:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                msg = 'Top {} percent of matrix sorted by sum of abundance values.'.format(
                    round(top_percent, 2))
                tab_content += '''<p style="color:red;" >{}</p>'''.format(msg)

                tab_content += '\n<iframe height="1300px" width="100%" '
                tab_content += 'src="{}" '.format(heatmap_index_page)
                tab_content += 'style="border:none;"></iframe>'
                tab_content += '\n</div>\n'
            else:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                tab_content += '''\n<p style="color:red;" >'''
                tab_content += '''Heatmap is too large to be displayed.</p>\n'''
                tab_content += '\n</div>\n'

        viewer_name = 'MatrixHeatmapViewer'
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += '''>Matrix Heatmap</button>\n'''

        heatmap_report_files = os.listdir(heatmap_dir)

        heatmap_index_page = None
        for heatmap_report_file in heatmap_report_files:
            if heatmap_report_file.endswith('.html'):
                heatmap_index_page = heatmap_report_file

            shutil.copy2(os.path.join(heatmap_dir, heatmap_report_file),
                         output_directory)

        if heatmap_index_page:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)
            tab_content += '\n<iframe height="1300px" width="100%" '
            tab_content += 'src="{}" '.format(heatmap_index_page)
            tab_content += 'style="border:none;"></iframe>'
            tab_content += '\n</div>\n'
        else:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)
            tab_content += '''\n<p style="color:red;" >'''
            tab_content += '''Heatmap is too large to be displayed.</p>\n'''
            tab_content += '\n</div>\n'

        tab_def_content += '\n</div>\n'
        return tab_def_content + tab_content

    def _generate_heatmap_html_report(self, data):

        logging.info('Start generating heatmap report page')

        data_df = pd.DataFrame(data['values'],
                               index=data['row_ids'],
                               columns=data['col_ids'])
        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        tsv_file_path = os.path.join(
            result_directory, 'heatmap_data_{}.tsv'.format(str(uuid.uuid4())))
        data_df.to_csv(tsv_file_path)

        if data_df.index.size < 10000:
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'cluster_data':
                True
            })['html_dir']
        else:
            logging.info(
                'Original matrix is too large. Skipping data clustering in the report.'
            )
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'cluster_data':
                False
            })['html_dir']
        top_heatmap_dir = None
        top_percent = 100
        display_count = 200  # approximate number of rows to display
        if len(data_df.index) > 1000:
            top_percent = min(display_count / data_df.index.size * 100, 100)
            top_heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'sort_by_sum':
                True,
                'top_percent':
                top_percent
            })['html_dir']

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info(
            'Start generating html report in {}'.format(output_directory))

        html_report = list()

        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'matrix_viewer_report.html')

        visualization_content = self._generate_visualization_content(
            output_directory, heatmap_dir, data_df, top_heatmap_dir,
            top_percent, display_count)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'matrix_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Import Amplicon Matrix App'
        })
        return html_report

    def _generate_report(self,
                         matrix_obj_ref,
                         new_row_attr_ref,
                         new_col_attr_ref,
                         workspace_id,
                         data=None):
        """
        _generate_report: generate summary report
        """

        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref':
                new_row_attr_ref,
                'description':
                'Imported Amplicons (Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref':
                new_col_attr_ref,
                'description':
                'Imported Samples (Column) Attribute Mapping'
            })

        if data:
            output_html_files = self._generate_heatmap_html_report(data)

            report_params = {
                'message':
                '',
                'objects_created':
                objects_created,
                'workspace_id':
                workspace_id,
                'html_links':
                output_html_files,
                'direct_html_link_index':
                0,
                'html_window_height':
                1400,
                'report_object_name':
                'import_matrix_from_biom_' + str(uuid.uuid4())
            }

        else:
            report_params = {
                'message':
                '',
                'objects_created':
                objects_created,
                'workspace_id':
                workspace_id,
                'report_object_name':
                'import_matrix_from_biom_' + str(uuid.uuid4())
            }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.report_util = kb_GenericsReport(self.callback_url)
        self.data_util = DataUtil(config)
        self.sampleservice_util = SampleServiceUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.taxon_util = TaxonUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.taxon_cache = dict()

    def fetch_sequence(self, matrix_ref):
        logging.info('start to fetch consensus sequence')

        input_matrix_obj = self.dfu.get_objects({'object_refs':
                                                 [matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        matrix_name = input_matrix_info[1]
        matrix_type = input_matrix_info[2]
        matrix_data = input_matrix_obj['data']

        if 'KBaseMatrices.AmpliconMatrix' not in matrix_type:
            raise ValueError('Unexpected data type: {}'.format(matrix_type))

        handle = matrix_data.get('sequencing_file_handle')
        if not handle:
            raise ValueError(
                'Missing sequencing_file_handle from the matrix object')

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info('Start generating consensus sequence file in {}'.format(
            output_directory))
        self._mkdir_p(output_directory)

        matrix_fasta_file = self.dfu.shock_to_file({
            'handle_id': handle,
            'file_path': self.scratch
        }).get('file_path')

        try:
            logging.info('start parsing FASTA file')
            fasta_dict = SeqIO.index(matrix_fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        row_ids = matrix_data['data']['row_ids']

        fasta_file_path = os.path.join(
            output_directory, matrix_name + '_consensus_sequence.fasta')

        with open(fasta_file_path, 'w') as f:
            for row_id in row_ids:
                consensus_sequence = str(fasta_dict.get(row_id).seq)
                f.write('>' + str(row_id) + '\n')
                f.write(consensus_sequence + '\n')

        return fasta_file_path

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: matrix object type, e.g. AmpliconMatrix
        matrix_name: matrix object name
        workspace_id: id of the workspace the matrix object will be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_id = params.get('workspace_id')
        matrix_name = params.get('matrix_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file,
                                                    fasta_file, mode, refs,
                                                    matrix_name, workspace_id,
                                                    scale, description,
                                                    metadata_keys)

        for key in [
                'amplicon_type', 'amplification', 'extraction', 'target_gene',
                'target_subfragment', 'pcr_primers', 'library_kit',
                'library_layout', 'library_screening_strategy',
                'sequencing_center', 'sequencing_date',
                'sequencing_technology', 'sequencing_instrument',
                'sequencing_quality_filter_cutoff', 'read_length_cutoff',
                'read_pairing', 'barcode_error_rate',
                'chimera_detection_and_removal', 'taxon_calling_method',
                'denoise_method', 'sequence_error_cutoff', 'clustering_method',
                'clustering_cutoff', 'sample_set_ref', 'reads_set_ref'
        ]:
            if params.get(key):
                amplicon_data[key] = params[key]

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        if fasta_file:
            logging.info(
                'start saving consensus sequence file to shock: {}'.format(
                    fasta_file))
            handle_id = self.dfu.file_to_shock({
                'file_path': fasta_file,
                'make_handle': True
            })['handle']['hid']
            amplicon_data['sequencing_file_handle'] = handle_id

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_id':
            workspace_id
        })['obj_ref']

        if params.get('sample_set_ref'):
            self.matrix_util._link_matrix_to_samples(matrix_obj_ref,
                                                     amplicon_data,
                                                     params['sample_set_ref'])

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref,
                                              workspace_id,
                                              data=amplicon_data['data'])

        returnVal.update(report_output)

        return returnVal
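
# Usage sketch (hypothetical values). This assumes the class above is the BiomUtil
# referenced by GenericsAPI further below, and that `config` carries the keys read in
# __init__ (SDK_CALLBACK_URL, scratch, KB_AUTH_TOKEN, taxon-workspace-name, search-url, ...).
def _example_import_matrix_from_biom(config):
    biom_util = BiomUtil(config)
    return biom_util.import_matrix_from_biom({
        'obj_type': 'AmpliconMatrix',                  # saved as KBaseMatrices.AmpliconMatrix
        'matrix_name': 'my_amplicon_matrix',           # hypothetical object name
        'workspace_id': 12345,                         # hypothetical workspace id
        'input_staging_file_path': 'otu_table.biom',   # or input_shock_id / input_file_path
        'scale': 'raw',
    })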
Exemplo n.º 8
0
class DataTableUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _build_table_content(self, output_directory, matrix_df):
        """
        _build_table_content: generate HTML table content for FloatMatrix2D object
        """

        logging.info('Start generating table content page')

        page_content = """\n"""

        table_file_name = 'matrix_data_viewer_{}.html'.format(str(uuid.uuid4()))
        data_file_name = 'matrix_data_{}.json'.format(str(uuid.uuid4()))

        page_content += """<iframe height="900px" width="100%" """
        page_content += """src="{}" """.format(table_file_name)
        page_content += """style="border:none;"></iframe>\n"""

        table_headers = matrix_df.columns.tolist()
        table_content = """\n"""
        # build header and footer
        table_content += """\n<thead>\n<tr>\n"""
        for table_header in table_headers:
            table_content += """\n <th>{}</th>\n""".format(table_header)
        table_content += """\n</tr>\n</thead>\n"""

        table_content += """\n<tfoot>\n<tr>\n"""
        for table_header in table_headers:
            table_content += """\n <th>{}</th>\n""".format(table_header)
        table_content += """\n</tr>\n</tfoot>\n"""

        logging.info('start generating table json file')
        data_array = matrix_df.values.tolist()

        total_rec = len(data_array)
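        # json_dict below mirrors the jQuery DataTables server-side/ajax response shape
        # (draw, recordsTotal, recordsFiltered, data) that the viewer template loads via ajax_file_path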
        json_dict = {'draw': 1,
                     'recordsTotal': total_rec,
                     'recordsFiltered': total_rec,
                     'data': data_array}

        with open(os.path.join(output_directory, data_file_name), 'w') as fp:
            json.dump(json_dict, fp)

        logging.info('start generating table html')
        with open(os.path.join(output_directory, table_file_name), 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates',
                                   'matrix_table_viewer_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>table_header</p>',
                                                          table_content)
                report_template = report_template.replace('ajax_file_path',
                                                          data_file_name)
                report_template = report_template.replace('deferLoading_size',
                                                          str(total_rec))
                result_file.write(report_template)
        # Example of the generated page_content (illustrative):
        # <iframe height="900px" width="100%" src="matrix_data_viewer_<uuid>.html" style="border:none;"></iframe>
        return page_content

    def _generate_visualization_content(self, output_directory, matrix_df):
        # matrix_df is expected to look like (illustrative, 5 rows x 8 columns):
        #          ID  ...                                           taxonomy
        #   0  GG_OTU_1 ...  ['k__Bacteria', 'p__Proteobacteria', 'c__Gamma...
        #   4  GG_OTU_5 ...  ['k__Bacteria', 'p__Proteobacteria', 'c__Gamma...
        tab_def_content = ''
        tab_content = ''

        tab_def_content += """\n<div class="tab">\n"""
        tab_def_content += """
        <button class="tablinks" onclick="openTab(event, 'MatrixData')" id="defaultOpen">Matrix Data</button>
        """

        corr_table_content = self._build_table_content(output_directory, matrix_df)
        tab_content += """\n<div id="MatrixData" class="tabcontent">{}</div>\n""".format(
                                                                                corr_table_content)

        tab_def_content += """\n</div>\n"""

        # The returned HTML is a single 'Matrix Data' tab whose content is the iframe
        # produced by _build_table_content (illustrative):
        #   <div class="tab">
        #     <button class="tablinks" onclick="openTab(event, 'MatrixData')" id="defaultOpen">Matrix Data</button>
        #   </div>
        #   <div id="MatrixData" class="tabcontent">
        #     <iframe height="900px" width="100%" src="matrix_data_viewer_<uuid>.html" style="border:none;"></iframe>
        #   </div>
        return tab_def_content + tab_content

    def _generate_matrix_html_report(self, matrix_df):

        """
        _generate_matrix_html_report: generate html summary report for matrix
        """

        logging.info('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'matrix_report.html')

        visualization_content = self._generate_visualization_content(output_directory, matrix_df)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'matrix_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Visualization_Content</p>',
                                                          visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Matrix Viewer App'
                            })
        return html_report

    def _generate_corr_report(self, workspace_name, matrix_df):
        """
        _generate_report: generate summary report
        """
        logging.info('Start creating report')

        output_html_files = self._generate_matrix_html_report(matrix_df)

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 666,
                         'report_object_name': 'matrix_viewer_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _fetch_matrix_df(self, input_matrix_ref, with_attribute_info):
        logging.info('Start fetch matrix content')

        matrix_obj = self.dfu.get_objects({'object_refs': [input_matrix_ref]})['data'][0]
        matrix_info = matrix_obj['info']
        matrix_data = matrix_obj['data']

        matrix_type = matrix_info[2].split('-')[0].split('.')[-1]
        if matrix_type not in self.matrix_types:
            raise ValueError('Unexpected matrix type: {}'.format(matrix_type))

        data_matrix = self.data_util.fetch_data({'obj_ref': input_matrix_ref}).get('data_matrix')
        matrix_df = pd.read_json(data_matrix)
        matrix_df = matrix_df.reindex(index=natsorted(matrix_df.index))

        row_am_ref = matrix_data.get('row_attributemapping_ref')
        if with_attribute_info and row_am_ref:
            am_data = self.dfu.get_objects({'object_refs': [row_am_ref]})['data'][0]['data']
            instances = am_data['instances']
            columns = [x['attribute'] for x in am_data['attributes']]
            row_am_df = pd.DataFrame(list(instances.values()), index=instances.keys(), columns=columns)

            matrix_df = matrix_df.join(row_am_df)

        matrix_df.index.name = 'ID'
        matrix_df.reset_index(inplace=True)
        matrix_df = matrix_df.astype(str)
        # matrix_df now starts with an 'ID' column (e.g. GG_OTU_1 ... GG_OTU_5) followed by
        # the value columns and, when requested, the joined attribute columns such as taxonomy.
        return matrix_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.matrix_types = [x.split(".")[1].split('-')[0]
                             for x in self.data_util.list_generic_types()]

    def view_matrix_as_table(self, params):
        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        with_attribute_info = params.get('with_attribute_info', True)

        matrix_df = self._fetch_matrix_df(input_matrix_ref, with_attribute_info)

        returnVal = dict()
        report_output = self._generate_corr_report(workspace_name, matrix_df)

        returnVal.update(report_output)

        return returnVal
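
# Usage sketch (hypothetical values): view_matrix_as_table fetches the matrix, optionally
# joins its row AttributeMapping, and renders a DataTables-based HTML report.
def _example_view_matrix_as_table(config):
    data_table_util = DataTableUtil(config)
    return data_table_util.view_matrix_as_table({
        'input_matrix_ref': '1/2/3',        # hypothetical matrix object reference
        'workspace_name': 'my_workspace',   # hypothetical workspace name
        'with_attribute_info': True,        # join row attribute mapping columns when present
    })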
Exemplo n.º 9
0
class AttributesUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """

        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]

        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id":
            output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):

        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping is missing [{}] instances'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts AttributeMapping object data to a dataframe
        """

        attributes = pd.DataFrame(data['attributes'])
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unigue for each row. "
                f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}"
            )
        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\n See the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term Accession is a web link, only grab the last bit.
                # Similarly, sometimes the number is prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        ":").unique()
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied attributes and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
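
# Usage sketch (hypothetical values): file_to_attribute_mapping turns a TSV/Excel file that is
# already on scratch (or in Shock via input_shock_id) into a KBaseExperiments.AttributeMapping object.
def _example_file_to_attribute_mapping(config):
    attr_util = AttributesUtil(config)
    return attr_util.file_to_attribute_mapping({
        'input_file_path': '/kb/module/work/tmp/attributes.tsv',  # hypothetical path
        'output_ws_id': 12345,                                    # hypothetical workspace id
        'output_obj_name': 'my_attribute_mapping',
    })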
Exemplo n.º 10
0
class GenericsAPI:
    '''
    Module Name:
    GenericsAPI

    Module Description:
    
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.0.8"
    GIT_URL = "git@github.com:Tianhao-Gu/GenericsAPI.git"
    GIT_COMMIT_HASH = "e5a7c9fc2952bf44ebf8ec76d92322f00b606b3e"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.attr_util = AttributesUtil(self.config)
        self.matrix_util = MatrixUtil(self.config)
        self.corr_util = CorrelationUtil(self.config)
        self.data_util = DataUtil(self.config)
        self.network_util = NetworkUtil(self.config)
        self.biom_util = BiomUtil(self.config)
        self.pca_util = PCAUtil(self.config)
        self.data_table_util = DataTableUtil(self.config)
        self.template_util = TemplateUtil(self.config)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def fetch_data(self, ctx, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object
        :param params: instance of type "FetchDataParams" (Input of the
           fetch_data function obj_ref: generics object reference Optional
           arguments: generics_module: the generics data module to be
           retrieved from e.g. for an given data type like below: typedef
           structure { FloatMatrix2D data; condition_set_ref
           condition_set_ref; } SomeGenericsMatrix; generics_module should be
           {'data': 'FloatMatrix2D', 'condition_set_ref':
           'condition_set_ref'}) -> structure: parameter "obj_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "generics_module"
           of mapping from String to String
        :returns: instance of type "FetchDataReturn" (Ouput of the fetch_data
           function data_matrix: a pandas dataframe in json format) ->
           structure: parameter "data_matrix" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN fetch_data
        returnVal = self.data_util.fetch_data(params)
        #END fetch_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method fetch_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
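
    # Example (hypothetical values): the returned data_matrix is a JSON string that can be
    # rehydrated with pandas, e.g.
    #     data_matrix = api.fetch_data(ctx, {'obj_ref': '1/2/3'})[0]['data_matrix']
    #     df = pd.read_json(data_matrix)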

    def export_matrix(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (Input of the
           export_matrix function obj_ref: generics object reference Optional
           arguments: generics_module: select the generics data to be
           retrieved from e.g. for an given data type like below: typedef
           structure { FloatMatrix2D data; condition_set_ref
           condition_set_ref; } SomeGenericsMatrix; and only 'FloatMatrix2D'
           is needed generics_module should be {'data': FloatMatrix2D'}) ->
           structure: parameter "obj_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "generics_module" of mapping from String to
           String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN export_matrix
        returnVal = self.matrix_util.export_matrix(params)
        #END export_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method export_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def validate_data(self, ctx, params):
        """
        validate_data: validate data
        :param params: instance of type "ValidateParams" (Input of the
           validate_data function obj_type: obj type e.g.:
           'KBaseMatrices.ExpressionMatrix-1.1' data: data to be validated)
           -> structure: parameter "obj_type" of String, parameter "data" of
           mapping from String to String
        :returns: instance of type "ValidateOutput" -> structure: parameter
           "validated" of type "boolean" (A boolean - 0 for false, 1 for
           true.), parameter "failed_constraint" of mapping from String to
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_data
        returnVal = self.data_util.validate_data(params)
        #END validate_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_matrix_from_excel(self, ctx, params):
        """
        import_matrix_from_excel: import matrix object from excel
        :param params: instance of type "ImportMatrixParams" (Input of the
           import_matrix_from_excel function obj_type: a type in
           KBaseMatrices input_shock_id: file shock id input_file_path:
           absolute file path input_staging_file_path: staging area file path
           matrix_name: matrix object name description: optional, a
           description of the matrix workspace_name: workspace name matrix
           object to be saved to optional: col_attributemapping_ref: column
           AttributeMapping reference row_attributemapping_ref: row
           AttributeMapping reference genome_ref: genome reference
           diff_expr_matrix_ref: DifferentialExpressionMatrix reference
           biochemistry_ref: (for ChemicalAbundanceMatrix) reads_set_ref:
           (raw data for AmpliconMatrix) sample_set_ref: SampleSet object
           reference) -> structure: parameter "obj_type" of String, parameter
           "input_shock_id" of String, parameter "input_file_path" of String,
           parameter "input_staging_file_path" of String, parameter
           "matrix_name" of String, parameter "amplicon_set_name" of String,
           parameter "scale" of String, parameter "description" of String,
           parameter "workspace_name" of type "workspace_name" (workspace
           name of the object), parameter "genome_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "col_attributemapping_ref" of
           type "obj_ref" (An X/Y/Z style reference), parameter
           "row_attributemapping_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "diff_expr_matrix_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "biochemistry_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "reads_set_ref" of
           type "obj_ref" (An X/Y/Z style reference), parameter
           "sample_set_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "unit" of String, parameter "type" of String
        :returns: instance of type "ImportMatrixOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_matrix_from_excel
        returnVal = self.matrix_util.import_matrix_from_excel(params)
        #END import_matrix_from_excel

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_matrix_from_excel return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
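
    # Example (hypothetical values): a minimal params dict for import_matrix_from_excel, matching
    # the keys MatrixUtil validates as required ('obj_type', 'matrix_name', 'workspace_name',
    # 'scale') plus one input source:
    #     params = {'obj_type': 'ExpressionMatrix',
    #               'matrix_name': 'my_expression_matrix',
    #               'workspace_name': 'my_workspace',
    #               'scale': 'log2',
    #               'input_staging_file_path': 'matrix.xlsx'}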

    def import_matrix_from_biom(self, ctx, params):
        """
        import_matrix_from_biom: import matrix object from BIOM file format
        :param params: instance of type "ImportOTUParams" -> structure:
           parameter "obj_type" of String, parameter
           "taxonomic_abundance_tsv" of String, parameter "taxonomic_fasta"
           of String, parameter "input_local_file" of String, parameter
           "matrix_name" of String, parameter "amplicon_set_name" of String,
           parameter "scale" of String, parameter "description" of String,
           parameter "workspace_name" of type "workspace_name" (workspace
           name of the object), parameter "genome_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "col_attributemapping_ref" of
           type "obj_ref" (An X/Y/Z style reference), parameter
           "row_attributemapping_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "diff_expr_matrix_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "biochemistry_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "reads_set_ref" of
           type "obj_ref" (An X/Y/Z style reference), parameter
           "sample_set_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "metadata_keys" of list of String, parameter
           "extraction_kit" of String, parameter "amplicon_type" of String,
           parameter "target_gene_region" of String, parameter
           "forward_primer_sequence" of String, parameter
           "reverse_primer_sequence" of String, parameter
           "sequencing_platform" of String, parameter "sequencing_run" of
           String, parameter "sequencing_kit" of String, parameter
           "sequencing_quality_filter_cutoff" of String, parameter
           "clustering_cutoff" of Double, parameter "clustering_method" of
           String
        :returns: instance of type "ImportMatrixOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_matrix_from_biom
        returnVal = self.biom_util.import_matrix_from_biom(params)
        #END import_matrix_from_biom

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_matrix_from_biom return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def save_object(self, ctx, params):
        """
        save_object: validate data constraints and save matrix object
        :param params: instance of type "SaveObjectParams" (Input of the
           import_matrix_from_excel function obj_type: saving object data
           type obj_name: saving object name data: data to be saved
           workspace_name: workspace name matrix object to be saved to) ->
           structure: parameter "obj_type" of String, parameter "obj_name" of
           String, parameter "data" of mapping from String to String,
           parameter "workspace_name" of type "workspace_name" (workspace
           name of the object)
        :returns: instance of type "SaveObjectOutput" -> structure: parameter
           "obj_ref" of type "obj_ref" (An X/Y/Z style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN save_object
        returnVal = self.data_util.save_object(params)
        #END save_object

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method save_object return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def search_matrix(self, ctx, params):
        """
        search_matrix: generate an HTML report that allows users to select feature ids
        :param params: instance of type "MatrixSelectorParams" (Input of the
           search_matrix function matrix_obj_ref: object reference of a
           matrix workspace_name: workspace name objects to be saved to) ->
           structure: parameter "matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference), parameter "workspace_name" of type
           "workspace_name" (workspace name of the object)
        :returns: instance of type "MatrixSelectorOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN search_matrix
        returnVal = self.matrix_util.search_matrix(params)
        #END search_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method search_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def filter_matrix(self, ctx, params):
        """
        filter_matrix: create sub-matrix based on input filter_ids
        :param params: instance of type "MatrixFilterParams" (Input of the
           filter_matrix function matrix_obj_ref: object reference of a
           matrix workspace_name: workspace name objects to be saved to
           filter_ids: string of column or row ids that result matrix
           contains filtered_matrix_name: name of newly created filtered
           matrix object) -> structure: parameter "matrix_obj_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "workspace_name"
           of type "workspace_name" (workspace name of the object), parameter
           "filtered_matrix_name" of String, parameter "remove_ids" of
           String, parameter "dimension" of String
        :returns: instance of type "MatrixFilterOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "matrix_obj_refs" of list of type "obj_ref" (An
           X/Y/Z style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_matrix
        returnVal = self.matrix_util.filter_matrix(params)
        #END filter_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def standardize_matrix(self, ctx, params):
        """
        standardize_matrix: standardize a matrix
        :param params: instance of type "StandardizeMatrixParams" (Input of
           the standardize_matrix function input_matrix_ref: object reference
           of a matrix workspace_name: workspace name objects to be saved to
           with_mean: center data before scaling with_std: scale data to unit
           variance new_matrix_name: name of newly created matrix object) ->
           structure: parameter "input_matrix_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "workspace_name" of type
           "workspace_name" (workspace name of the object), parameter
           "with_mean" of type "boolean" (A boolean - 0 for false, 1 for
           true.), parameter "with_std" of type "boolean" (A boolean - 0 for
           false, 1 for true.), parameter "dimension" of String, parameter
           "new_matrix_name" of String
        :returns: instance of type "StandardizeMatrixOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN standardize_matrix
        returnVal = self.matrix_util.standardize_matrix(params)
        #END standardize_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method standardize_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def transform_matrix(self, ctx, params):
        """
        :param params: instance of type "TransformMatrixParams" -> structure:
           parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "workspace_name" of type "workspace_name"
           (workspace name of the object), parameter "workspace_id" of Long,
           parameter "new_matrix_name" of String, parameter
           "abundance_filtering_params" of mapping from String to String,
           parameter "standardization_params" of mapping from String to
           String, parameter "ratio_transformation_params" of mapping from
           String to String, parameter "perform_relative_abundance" of type
           "boolean" (A boolean - 0 for false, 1 for true.)
        :returns: instance of type "TransformMatrixOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN transform_matrix
        returnVal = self.matrix_util.transform_matrix(params)
        #END transform_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method transform_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def perform_rarefy(self, ctx, params):
        """
        :param params: instance of type "RarefyMatrixParams" -> structure:
           parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "workspace_id" of Long, parameter
           "new_matrix_name" of String, parameter "seed_number" of Long,
           parameter "dimension" of String
        :returns: instance of type "RarefyMatrixOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN perform_rarefy
        returnVal = self.matrix_util.perform_rarefy(params)
        #END perform_rarefy

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method perform_rarefy return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def perform_variable_stats_matrix(self, ctx, params):
        """
        :param params: instance of type "VariableStatsParams" -> structure:
           parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "attribute_mapping_obj_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "workspace_id" of
           Long, parameter "dist_metric" of String, parameter "dimension" of
           String, parameter "grouping" of String, parameter "permutations"
           of Long, parameter "perform_anosim" of type "boolean" (A boolean -
           0 for false, 1 for true.), parameter "perform_permanova" of type
           "boolean" (A boolean - 0 for false, 1 for true.), parameter
           "perform_permdisp" of type "boolean" (A boolean - 0 for false, 1
           for true.)
        :returns: instance of type "VariableStatsOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN perform_variable_stats_matrix
        returnVal = self.matrix_util.perform_variable_stats_matrix(params)
        #END perform_variable_stats_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method perform_variable_stats_matrix return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def perform_mantel_test(self, ctx, params):
        """
        :param params: instance of type "MantelTestParams" -> structure:
           parameter "input_matrix_refs" of list of type "obj_ref" (An X/Y/Z
           style reference), parameter "workspace_id" of Long, parameter
           "dist_metric" of String, parameter "dimension" of String,
           parameter "correlation_method" of String, parameter "permutations"
           of Long, parameter "alternative_hypothesis" of String
        :returns: instance of type "MantelTestOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN perform_mantel_test
        returnVal = self.matrix_util.perform_mantel_test(params)
        #END perform_mantel_test

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method perform_mantel_test return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def file_to_attribute_mapping(self, ctx, params):
        """
        :param params: instance of type "FileToAttributeMappingParams"
           (input_shock_id and input_file_path - alternative input params,)
           -> structure: parameter "input_shock_id" of String, parameter
           "input_file_path" of String, parameter "output_ws_id" of String,
           parameter "output_obj_name" of String
        :returns: instance of type "FileToAttributeMappingOutput" ->
           structure: parameter "attribute_mapping_ref" of type "obj_ref" (An
           X/Y/Z style reference)
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN file_to_attribute_mapping
        logging.info(
            "Starting 'file_to_attribute_mapping' with params:{}".format(
                params))
        self.attr_util.validate_params(params,
                                       ("output_ws_id", "output_obj_name"),
                                       ('input_shock_id', 'input_file_path'))
        result = self.attr_util.file_to_attribute_mapping(params)
        #END file_to_attribute_mapping

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method file_to_attribute_mapping return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def file_to_fbamodel_attribute_mapping(self, ctx, params):
        """
        :param params: instance of type "FileToAttributeMappingParams"
           (input_shock_id and input_file_path - alternative input params,)
           -> structure: parameter "input_shock_id" of String, parameter
           "input_file_path" of String, parameter "output_ws_id" of String,
           parameter "output_obj_name" of String
        :returns: instance of type "FileToAttributeMappingOutput" ->
           structure: parameter "attribute_mapping_ref" of type "obj_ref" (An
           X/Y/Z style reference)
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN file_to_fbamodel_attribute_mapping
        logging.info(
            "Starting 'file_to_fbamodel_attribute_mapping' with params:{}".
            format(params))
        self.attr_util.validate_params(params,
                                       ("output_ws_id", "output_obj_name"),
                                       ('input_shock_id', 'input_file_path'))
        params['import_fbamodel_attri_mapping'] = True
        result = self.attr_util.file_to_attribute_mapping(params)
        #END file_to_fbamodel_attribute_mapping

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError(
                'Method file_to_fbamodel_attribute_mapping return value ' +
                'result is not type dict as required.')
        # return the results
        return [result]

    def update_matrix_attribute_mapping(self, ctx, params):
        """
        :param params: instance of type "UpdateMatrixAMParams" -> structure:
           parameter "staging_file_subdir_path" of String, parameter
           "dimension" of String, parameter "input_matrix_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "workspace_name"
           of String, parameter "output_am_obj_name" of String, parameter
           "output_matrix_obj_name" of String
        :returns: instance of type "UpdateMatrixAMOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "new_matrix_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference), parameter "new_attribute_mapping_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN update_matrix_attribute_mapping
        logging.info(
            "Starting 'update_matrix_attribute_mapping' with params:{}".format(
                params))
        self.attr_util.validate_params(
            params, ("staging_file_subdir_path", "dimension", "workspace_name",
                     "output_am_obj_name", "input_matrix_ref",
                     "output_matrix_obj_name"))
        returnVal = self.attr_util.update_matrix_attribute_mapping(params)
        #END update_matrix_attribute_mapping

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method update_matrix_attribute_mapping return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def attribute_mapping_to_tsv_file(self, ctx, params):
        """
        :param params: instance of type "AttributeMappingToTsvFileParams" ->
           structure: parameter "input_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "destination_dir" of String
        :returns: instance of type "AttributeMappingToTsvFileOutput" ->
           structure: parameter "file_path" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN attribute_mapping_to_tsv_file
        logging.info(
            "Starting 'attribute_mapping_to_tsv_file' with params:{}".format(
                params))
        self.attr_util.validate_params(params,
                                       ("destination_dir", "input_ref"))
        am_id, result = self.attr_util.to_tsv(params)
        #END attribute_mapping_to_tsv_file

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError(
                'Method attribute_mapping_to_tsv_file return value ' +
                'result is not type dict as required.')
        # return the results
        return [result]

    def export_attribute_mapping_tsv(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_attribute_mapping_tsv
        logging.info(
            "Starting 'export_attribute_mapping_tsv' with params:{}".format(
                params))
        self.attr_util.validate_params(params, ("input_ref", ))
        params['destination_dir'] = self.scratch
        am_id, files = self.attr_util.to_tsv(params)
        result = self.attr_util.export(files['file_path'], am_id,
                                       params['input_ref'])
        #END export_attribute_mapping_tsv

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError(
                'Method export_attribute_mapping_tsv return value ' +
                'result is not type dict as required.')
        # return the results
        return [result]

    def export_attribute_mapping_excel(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_attribute_mapping_excel
        logging.info(
            "Starting 'export_attribute_mapping_excel' with params:{}".format(
                params))
        self.attr_util.validate_params(params, ("input_ref", ))
        params['destination_dir'] = self.scratch
        am_id, files = self.attr_util.to_excel(params)
        result = self.attr_util.export(files['file_path'], am_id,
                                       params['input_ref'])
        #END export_attribute_mapping_excel

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError(
                'Method export_attribute_mapping_excel return value ' +
                'result is not type dict as required.')
        # return the results
        return [result]

    def export_cluster_set_excel(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_cluster_set_excel
        logging.info(
            "Starting 'export_cluster_set_excel' with params:{}".format(
                params))
        self.attr_util.validate_params(params, ("input_ref", ))
        params['destination_dir'] = self.scratch
        cs_id, files = self.attr_util.to_excel(params)
        result = self.attr_util.export(files['file_path'], cs_id,
                                       params['input_ref'])
        #END export_cluster_set_excel

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_cluster_set_excel return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def export_corr_matrix_excel(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_corr_matrix_excel
        logging.info(
            "Starting 'export_corr_matrix_excel' with params:{}".format(
                params))
        result = self.corr_util.export_corr_matrix_excel(params)
        #END export_corr_matrix_excel

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_corr_matrix_excel return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def export_pca_matrix_excel(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_pca_matrix_excel
        result = self.pca_util.export_pca_matrix_excel(params)
        #END export_pca_matrix_excel

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_pca_matrix_excel return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def export_amplicon_set_tsv(self, ctx, params):
        """
        :param params: instance of type "ExportObjectParams" -> structure:
           parameter "input_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_amplicon_set_tsv
        result = self.biom_util.export_amplicon_set_tsv(params)
        #END export_amplicon_set_tsv

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_amplicon_set_tsv return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def compute_correlation_matrix(self, ctx, params):
        """
        compute_correlation_matrix: compute a correlation matrix over the rows or columns of an input matrix
        :param params: instance of type "CompCorrParams" (Input of the
           compute_correlation_matrix function input_obj_ref: object
           reference of a matrix workspace_name: workspace name objects to be
           saved to corr_matrix_name: correlation matrix object name
           dimension: compute correlation on column or row, one of ['col',
           'row'] method: correlation method, one of ['pearson', 'kendall',
           'spearman'] plot_corr_matrix: plot correlation matrix in report,
           default False plot_scatter_matrix: plot scatter matrix in report,
           default False compute_significance: also compute Significance in
           addition to correlation matrix) -> structure: parameter
           "input_obj_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "workspace_name" of type "workspace_name" (workspace
           name of the object), parameter "corr_matrix_name" of String,
           parameter "dimension" of String, parameter "method" of String,
           parameter "plot_corr_matrix" of type "boolean" (A boolean - 0 for
           false, 1 for true.), parameter "plot_scatter_matrix" of type
           "boolean" (A boolean - 0 for false, 1 for true.), parameter
           "compute_significance" of type "boolean" (A boolean - 0 for false,
           1 for true.)
        :returns: instance of type "CompCorrOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "corr_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style
           reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN compute_correlation_matrix
        returnVal = self.corr_util.compute_correlation_matrix(params)
        #END compute_correlation_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method compute_correlation_matrix return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def compute_correlation_across_matrices(self, ctx, params):
        """
        compute_correlation_across_matrices: compute correlation matrix across matrices
        :param params: instance of type "CompCorrMetriceParams" (Input of the
           compute_correlation_across_matrices function matrix_ref_1: object
           reference of a matrix matrix_ref_2: object reference of a matrix
           workspace_name: workspace name objects to be saved to
           corr_matrix_name: correlation matrix object name dimension:
           compute correlation on column or row, one of ['col', 'row']
           method: correlation method, one of ['pearson', 'kendall',
           'spearman'] plot_corr_matrix: plot correlation matrix in report,
           default False compute_significance: also compute Significance in
           addition to correlation matrix) -> structure: parameter
           "matrix_ref_1" of type "obj_ref" (An X/Y/Z style reference),
           parameter "matrix_ref_2" of type "obj_ref" (An X/Y/Z style
           reference), parameter "workspace_name" of type "workspace_name"
           (workspace name of the object), parameter "corr_matrix_name" of
           String, parameter "dimension" of String, parameter "method" of
           String, parameter "plot_corr_matrix" of type "boolean" (A boolean
           - 0 for false, 1 for true.), parameter "compute_significance" of
           type "boolean" (A boolean - 0 for false, 1 for true.), parameter
           "corr_threshold" of Double
        :returns: instance of type "CompCorrOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "corr_matrix_obj_ref" of type "obj_ref" (An X/Y/Z style
           reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN compute_correlation_across_matrices
        returnVal = self.corr_util.compute_correlation_across_matrices(params)
        #END compute_correlation_across_matrices

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method compute_correlation_across_matrices return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def build_network(self, ctx, params):
        """
        build_network: filter correlation matrix and build network
        :param params: instance of type "BuildNetworkParams" (Input of the
           build_network function corr_matrix_ref: CorrelationMatrix object
           workspace_name: workspace name objects to be saved to
           network_obj_name: Network object name filter_on_threshold:
           dictionary holding the filter-on-threshold params; params in
           filter_on_threshold: coefficient_threshold: correlation
           coefficient threshold (select pairs with greater correlation
           coefficient)) -> structure: parameter "corr_matrix_ref" of type
           "obj_ref" (An X/Y/Z style reference), parameter "workspace_name"
           of type "workspace_name" (workspace name of the object), parameter
           "network_obj_name" of String, parameter "filter_on_threshold" of
           mapping from String to String
        :returns: instance of type "BuildNetworkOutput" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "network_obj_ref" of type "obj_ref" (An X/Y/Z
           style reference)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN build_network
        returnVal = self.network_util.build_network(params)
        #END build_network

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method build_network return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def run_pca(self, ctx, params):
        """
        run_pca: PCA analysis on matrix
        :param params: instance of type "PCAParams" (Input of the run_pca
           function input_obj_ref: object reference of a matrix
           workspace_name: the name of the workspace pca_matrix_name: name of
           PCA (KBaseExperiments.PCAMatrix) object dimension: compute PCA on
           column or row, one of ['col', 'row'] n_components - number of
           components (default 2) attribute_mapping_obj_ref - associated
           attribute_mapping_obj_ref scale_size_by - used for PCA plot to
           scale data size color_marker_by - used for PCA plot to group data)
           -> structure: parameter "input_obj_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "workspace_name" of String,
           parameter "pca_matrix_name" of String, parameter "dimension" of
           String, parameter "n_components" of Long, parameter
           "attribute_mapping_obj_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "scale_size_by" of mapping from String to
           String, parameter "color_marker_by" of mapping from String to
           String
        :returns: instance of type "PCAOutput" (Output of the run_pca function
           pca_ref: PCA object reference (as KBaseExperiments.PCAMatrix data
           type) report_name: report name generated by KBaseReport
           report_ref: report reference generated by KBaseReport) ->
           structure: parameter "pca_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN run_pca
        returnVal = self.pca_util.run_pca(params)
        #END run_pca

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method run_pca return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def view_matrix(self, ctx, params):
        """
        view_matrix: generate a report for matrix viewer
        :param params: instance of type "ViewMatrixParams" -> structure:
           parameter "input_matrix_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "workspace_name" of String, parameter
           "with_attribute_info" of type "boolean" (A boolean - 0 for false,
           1 for true.)
        :returns: instance of type "ViewMatrixOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN view_matrix
        returnVal = self.data_table_util.view_matrix_as_table(params)
        #END view_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method view_matrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def build_chemical_abundance_template(self, ctx, params):
        """
        :param params: instance of type "ChemAbunTempParams" -> structure:
           parameter "workspace_name" of String, parameter "workspace_id" of
           Long, parameter "sample_set_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "chemical_data_included" of mapping from
           String to Long, parameter "chemical_ids_included" of mapping from
           String to Long
        :returns: instance of type "ViewMatrixOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN build_chemical_abundance_template
        returnVal = self.template_util.build_chemical_abundance_template(
            params)
        #END build_chemical_abundance_template

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method build_chemical_abundance_template return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
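
# Hedged usage sketch: every public method above follows the same pattern of
# delegating to a utility class, checking that a dict came back, and wrapping
# it in a single-element list, so a caller only needs a params dict shaped
# like the docstring. The object reference and workspace id below are
# hypothetical placeholders, and the server instance and context are assumed
# to exist elsewhere, so the call itself is left commented out.
rarefy_params = {
    'input_matrix_ref': '1234/5/6',    # hypothetical X/Y/Z style reference
    'workspace_id': 1234,              # hypothetical workspace id
    'new_matrix_name': 'rarefied_matrix',
    'seed_number': 42,
    'dimension': 'col',
}
# returnVal = server.perform_rarefy(ctx, rarefy_params)[0]
# returnVal carries 'report_name', 'report_ref' and 'new_matrix_obj_ref'
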
Example No. 11
0
class BiomUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_name', 'scale',
                'amplicon_set_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS
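
        # Exactly one of four file-group combinations is accepted here
        # (biom_tsv, biom_fasta, tsv_fasta, or plain tsv); each referenced
        # file is copied out of the staging area via DataFileUtil before
        # parsing, and any user-supplied metadata keys are appended to the
        # defaults.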

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]

            mode = 'tsv'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _retrieve_value(self,
                        biom_metadata_dict,
                        tsv_metadata_df,
                        key,
                        required=False):

        if key in biom_metadata_dict:
            return {k.lower(): v
                    for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing required [{}] field in file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None
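
        # Look the taxon up by scientific_name first; if nothing is found,
        # retry against the "aliases" key, taking the most recently indexed
        # public object in either case.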

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {
                    "value": scientific_name
                }
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')

        return taxon_id

    def _fetch_taxon_level(self, taxon_char):

        taxon_level_mapping = {
            'l': 'Life',
            'd': 'Domain',
            'k': 'Kingdom',
            'p': 'Phylum',
            'c': 'Class',
            'o': 'Order',
            'f': 'Family',
            'g': 'Genus',
            's': 'Species'
        }

        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        if isinstance(lineage, str):
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]

        taxonomy = {'lineage': lineage}

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val
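
        # Walk the lineage from the most specific rank upward; the first name
        # that resolves to a taxon supplies taxon_ref, taxon_id,
        # scientific_name and taxon_level for this amplicon.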

        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({
                        'taxon_ref': taxon_ref,
                        'taxon_id': taxon_id,
                        'scientific_name': scientific_name,
                        'taxon_level': taxon_level
                    })
                    break

        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):
        amplicons = dict()
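
        # The separator is inferred by letting pandas sniff it (sep=None) and
        # then reading the parser's private dialect attribute; this leans on
        # pandas internals and may need revisiting after a pandas upgrade.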

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')

        return amplicons

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode,
                                   refs, description, matrix_obj_ref):

        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(
                biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(
                biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(
                tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_set_data, mode: {}'.format(
                    mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description
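
        # Keep only the workspace and object portions of the matrix
        # reference, dropping any trailing version component.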

        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(
            matrix_obj_ref_array[0], matrix_obj_ref_array[1])

        return amplicon_set_data

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):

        amplicon_data = refs
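
        # Two parsing paths: BIOM-based modes read the matrix plus any
        # embedded row/column metadata with the biom package, while TSV-based
        # modes load the table with pandas, split off the columns named in
        # metadata_keys, and convert the remaining values to floats.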

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide a valid TSV file')
            else:
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found some non-float values. Matrix must contain only numeric values\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id, metadata_df))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
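        # Attribute mappings come from one of three sources, in priority
        # order: an existing AttributeMapping reference supplied by the
        # caller, metadata embedded in the BIOM file, or a metadata DataFrame
        # split out of the TSV input; in each case axis ids map to themselves.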
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs':
                 [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an Excel file with a "
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]
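
        # One instance row per axis id, with values ordered to match the
        # attribute list built above.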

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
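        # Attribute keys are inferred from (at most) the first 25 metadata
        # entries; every instance value is stringified in that key order.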
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref,
                         new_row_attr_ref, new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }, {
            'ref': amplicon_set_obj_ref,
            'description': 'Imported Amplicon Set'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref':
                new_row_attr_ref,
                'description':
                'Imported Amplicons (Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref':
                new_col_attr_ref,
                'description':
                'Imported Samples (Column) Attribute Mapping'
            })

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):
        logging.info('writing amplicon set data frame to TSV file')
        amplicon_set_obj = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]
                                            })['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects(
            {'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = [
            'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
            'taxonomy_source', 'species_name', 'consensus_sequence'
        ]
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([
                taxonomy, taxon_id, taxon_ref, taxon_level, score,
                taxonomy_source, species_name, consensus_sequence
            ])

        meta_df = pd.DataFrame(meta_values,
                               index=meta_index,
                               columns=meta_columns)

        merged_df = df.merge(meta_df,
                             left_index=True,
                             right_index=True,
                             how='left',
                             validate='one_to_one')

        return merged_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: name of the workspace the matrix object will be saved to
        one of:
            input_shock_id: file shock id
            input_file_path: absolute file path
            input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode,
                                                    refs, matrix_name,
                                                    workspace_id, scale,
                                                    description, metadata_keys)

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id
        })['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(
            biom_file, tsv_file, fasta_file, mode, refs, description,
            matrix_obj_ref)

        logging.info(
            'start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseExperiments.AmpliconSet',
            'obj_name': amplicon_set_name,
            'data': amplicon_set_data,
            'workspace_name': workspace_id
        })['obj_ref']

        logging.info(
            'start resaving Matrix object with amplicon set: {}'.format(
                matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(
            workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {
            'matrix_obj_ref': matrix_obj_ref,
            'amplicon_set_obj_ref': amplicon_set_obj_ref
        }

        report_output = self._generate_report(matrix_obj_ref,
                                              amplicon_set_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_amplicon_set_tsv(self, params):
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}