Example #1
class ImportPhenotypeSetUtil:

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_phenotype_set_from_staging(self, params):
        '''
          import_phenotype_set_from_staging: wrapper method for
                                    fba_tools.tsv_file_to_phenotype_set

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          phenotype_set_name: output PhenotypeSet object name
          workspace_name: workspace name/ID of the object
          genome: Genome object that contains features referenced by the Phenotype Set

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportPhenotypeSetUtil.import_phenotype_set_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_phenotype_set_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        file = {
            'path': scratch_file_path
        }
        import_phenotype_set_params = params.copy()
        import_phenotype_set_params['phenotype_set_file'] = file

        ref = self.fba.tsv_file_to_phenotype_set(import_phenotype_set_params)

        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref.get('ref'))
        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def validate_import_phenotype_set_from_staging_params(self, params):
        """
        validate_import_phenotype_set_from_staging_params:
                    validates params passed to import_phenotype_set_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'phenotype_set_name', 'genome']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference (return value of
                                                       import_phenotype_set_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID where the PhenotypeSet object will be stored
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Phenotype Set Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(params.get('staging_file_subdir_path'))

        report_params = {'message': upload_message,
                         'objects_created': [{'ref': obj_ref,
                                              'description': 'Imported Phenotype Set'}],
                         'workspace_name': params.get('workspace_name'),
                         'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
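
A minimal usage sketch for the importer above, assuming it is driven from a KBase SDK app. The config keys, token, workspace, and file names are placeholders (UploaderUtil may require additional config entries), so treat this as an illustration of the call sequence rather than a runnable deployment.

# Hypothetical driver code; every value below is a placeholder.
config = {
    'SDK_CALLBACK_URL': 'http://localhost:5000',  # callback service URL (assumed)
    'KB_AUTH_TOKEN': 'xxxx',                      # auth token (assumed)
}
importer = ImportPhenotypeSetUtil(config)

params = {
    'staging_file_subdir_path': 'subdir_1/phenotypes.tsv',  # relative to /data/bulk/<user>/
    'phenotype_set_name': 'my_phenotype_set',
    'workspace_name': 'my_workspace',
    'genome': 'my_workspace/my_genome',
}
result = importer.import_phenotype_set_from_staging(params)
report = importer.generate_report(result['obj_ref'], params)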
Example #2
class CompoundSetUtils:
    '''
    Module Name:
    CompoundSetUtils
    Module Description:
    A KBase module: CompoundSetUtils
    Contains tools for import & export of compound sets
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "2.1.2"
    GIT_URL = "https://github.com/Tianhao-Gu/CompoundSetUtils.git"
    GIT_COMMIT_HASH = "12e1f23022354f475d7ceb3631913956eb5831a7"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError('{} parameter is required'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                logging.warning(
                    "Received unexpected parameter {}".format(param))

    def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None):
        """Save compound set to the workspace and make report"""
        info = self.dfu.save_objects({
            'id': ws_id,
            "objects": [{
                "type": "KBaseBiochem.CompoundSet",
                "data": compoundset,
                "name": compoundset['name']
            }]
        })[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        if not message:
            message = 'Imported %s as %s' % (source, info[1])
        report_params = {
            'objects_created': [{
                'ref': compoundset_ref,
                'description': 'Compound Set'
            }],
            'message': message,
            'workspace_name': info[7],
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'compoundset_ref': compoundset_ref
        }
        return output

    def _export_compound_set(self, ref, file_type):
        logging.info("Exporting {} as {}".format(ref, file_type))
        compoundset = self.dfu.get_objects({'object_refs':
                                            [ref]})['data'][0]['data']
        temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(temp_dir)
        out_dir = "{}/{}".format(temp_dir, compoundset['name'])
        os.mkdir(out_dir)
        target = "{}/{}.{}".format(out_dir, compoundset['name'], file_type)
        if file_type == 'tsv':
            parse.write_tsv(compoundset, target)
        elif file_type == 'sdf':
            parse.write_sdf(compoundset, target)
        else:
            raise ValueError("Bad file_type: {}".format(file_type))
        handle = self.dfu.package_for_download({
            'file_path': out_dir,
            'ws_refs': [ref]
        })
        output = {'shock_id': handle['shock_id']}
        return output

    def _fetch_mol2_files(self, ref):
        compoundset_obj = self.dfu.get_objects({'object_refs':
                                                [ref]})['data'][0]
        compoundset_info = compoundset_obj['info']
        compoundset = compoundset_obj['data']
        temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(temp_dir)

        compounds = compoundset.get('compounds')

        mol2_files = []
        comp_id_mol2_file_name_map = {}
        for compound in compounds:
            mol2_handle_ref = compound.get('mol2_handle_ref')

            if mol2_handle_ref:
                mol2_file_path = self.dfu.shock_to_file({
                    'handle_id': mol2_handle_ref,
                    'file_path': temp_dir
                }).get('file_path')
                mol2_files.append(mol2_file_path)
                comp_id_mol2_file_name_map[compound['id']] = os.path.basename(
                    mol2_file_path)

        packed_mol2_files_path = None
        if mol2_files:
            packed_mol2_files_path = os.path.join(
                temp_dir, compoundset_info[1] + '_mol2_files.zip')
            with zipfile.ZipFile(packed_mol2_files_path, 'w') as zipMe:
                for file in mol2_files:
                    zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)

        return packed_mol2_files_path, comp_id_mol2_file_name_map

    def _convert_mol2_files_to_pdbqt(self, ref):
        compoundset_obj = self.dfu.get_objects({'object_refs':
                                                [ref]})['data'][0]
        compoundset_info = compoundset_obj['info']
        compoundset = compoundset_obj['data']
        mol2_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(mol2_temp_dir)
        pdbqt_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(pdbqt_temp_dir)

        compounds = compoundset.get('compounds')

        pdbqt_files = []
        comp_id_pdbqt_file_name_map = {}
        for compound in compounds:
            mol2_handle_ref = compound.get('mol2_handle_ref')

            if mol2_handle_ref:
                mol2_file_path = self.dfu.shock_to_file({
                    'handle_id': mol2_handle_ref,
                    'file_path': mol2_temp_dir
                }).get('file_path')
                pdbqt_file_path = os.path.join(pdbqt_temp_dir,
                                               compound['id'] + '.pdbqt')

                command = [
                    'obabel', '-i', 'mol2', mol2_file_path, '-o', 'pdbqt',
                    '-O', pdbqt_file_path
                ]
                process = Popen(command, stdout=PIPE, stderr=PIPE)
                stdout, stderr = process.communicate()

                if 'converted' in str(stderr) and 'molecule' in str(stderr):
                    logging.info(
                        'Successfully converted Mol2 to pdbqt format: {}'.
                        format(os.path.basename(mol2_file_path)))
                    pdbqt_files.append(pdbqt_file_path)
                    comp_id_pdbqt_file_name_map[
                        compound['id']] = os.path.basename(pdbqt_file_path)
                else:
                    logging.warning(
                        'Cannot convert Mol2 file to pdbqt format: {}'.format(
                            os.path.basename(mol2_file_path)))
                    logging.warning(stderr)

        packed_pdbqt_files_path = None
        if pdbqt_files:
            packed_pdbqt_files_path = os.path.join(
                pdbqt_temp_dir, compoundset_info[1] + '_pdbqt_files.zip')
            with zipfile.ZipFile(packed_pdbqt_files_path, 'w') as zipMe:
                for file in pdbqt_files:
                    zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)

        return packed_pdbqt_files_path, comp_id_pdbqt_file_name_map
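
    # The conversion above shells out to Open Babel; the equivalent command line
    # (with placeholder file names) is roughly:
    #   obabel -i mol2 <compound_id>.mol2 -o pdbqt -O <compound_id>.pdbqt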

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def compound_set_from_file(self, ctx, params):
        """
        CompoundSetFromFile
        string staging_file_path
        :param params: instance of type "compoundset_upload_params" ->
           structure: parameter "workspace_id" of String, parameter
           "staging_file_path" of String, parameter "compound_set_name" of
           String, parameter "mol2_staging_file_path" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_file
        self._check_param(
            params, ['workspace_id', 'staging_file_path', 'compound_set_name'],
            opt_param=['mol2_staging_file_path'])
        scratch_file_path = self.dfu.download_staging_file({
            'staging_file_subdir_path': params['staging_file_path']
        }).get('copy_file_path')
        # I probably should be uploading the raw files to shock

        mol2_staging_file_path = params.get('mol2_staging_file_path')

        mol2_file_dir = None
        if mol2_staging_file_path:
            mol2_scratch_file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': mol2_staging_file_path
            }).get('copy_file_path')

            try:
                logging.info("start unpacking mol2 file")
                mol2_file_path_out = self.dfu.unpack_file(
                    {'file_path': mol2_scratch_file_path})['file_path']
                mol2_file_dir = os.path.dirname(mol2_file_path_out)
            except Exception:
                raise ValueError('Cannot unpack mol2 file: {}'.format(
                    os.path.basename(mol2_scratch_file_path)))

        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             params['staging_file_path'],
                                             compoundset)
        #END compound_set_from_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_to_file(self, ctx, params):
        """
        CompoundSetToFile
        string compound_set_name
        string output_format
        :param params: instance of type "compoundset_download_params" ->
           structure: parameter "compound_set_ref" of String, parameter
           "output_format" of String
        :returns: instance of type "compoundset_download_results" ->
           structure: parameter "file_path" of String, parameter
           "packed_mol2_files_path" of String, parameter
           "comp_id_mol2_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_to_file
        self._check_param(params, ['compound_set_ref', 'output_format'])
        ret = self.dfu.get_objects(
            {'object_refs': [params['compound_set_ref']]})['data'][0]
        compoundset = ret['data']
        ext = params['output_format']
        out = f"{self.scratch}/{uuid.uuid4()}"
        os.mkdir(out)
        out += f"/{compoundset['name']}"
        if ext == 'sdf':
            outfile_path = parse.write_sdf(compoundset, out)
        elif ext == 'tsv':
            outfile_path = parse.write_tsv(compoundset, out)
        else:
            outfile_path = parse.write_mol_dir(compoundset, out, ext)

        packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files(
            params['compound_set_ref'])

        output = {
            'file_path': outfile_path,
            'packed_mol2_files_path': packed_mol2_files_path,
            'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map
        }

        #END compound_set_to_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_to_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_from_model(self, ctx, params):
        """
        CompoundSetFromModel
        required:
        string workspace_id
        string model_ref
        string compound_set_name
        :param params: instance of type "compoundset_from_model_params" ->
           structure: parameter "workspace_id" of String, parameter
           "model_ref" of String, parameter "compound_set_name" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_model
        self._check_param(params,
                          ['workspace_id', 'model_ref', 'compound_set_name'])
        model = self.dfu.get_objects({'object_refs': [params['model_ref']]
                                      })['data'][0]['data']
        compounds, undef = parse.parse_model(model)
        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s, a metabolic model'
                           % model['id'],
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             model['name'], compoundset)
        #END compound_set_from_model

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_model return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_as_tsv(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_as_tsv
        output = self._export_compound_set(params['input_ref'], 'tsv')
        #END export_compoundset_as_tsv

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_compoundset_as_tsv return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_as_sdf(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_as_sdf
        output = self._export_compound_set(params['input_ref'], 'sdf')
        #END export_compoundset_as_sdf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_compoundset_as_sdf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_mol2_files(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "export_mol2_files_results" -> structure:
           parameter "packed_mol2_files_path" of String, parameter
           "comp_id_mol2_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_mol2_files
        self._check_param(params, ['input_ref'])

        packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files(
            params['input_ref'])

        output = {
            'packed_mol2_files_path': packed_mol2_files_path,
            'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map
        }
        #END export_compoundset_mol2_files

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method export_compoundset_mol2_files return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def convert_compoundset_mol2_files_to_pdbqt(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "convert_mol2_files_results" -> structure:
           parameter "packed_pdbqt_files_path" of String, parameter
           "comp_id_pdbqt_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN convert_compoundset_mol2_files_to_pdbqt
        self._check_param(params, ['input_ref'])

        packed_pdbqt_files_path, comp_id_pdbqt_file_name_map = self._convert_mol2_files_to_pdbqt(
            params['input_ref'])

        output = {
            'packed_pdbqt_files_path': packed_pdbqt_files_path,
            'comp_id_pdbqt_file_name_map': comp_id_pdbqt_file_name_map
        }
        #END convert_compoundset_mol2_files_to_pdbqt

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method convert_compoundset_mol2_files_to_pdbqt return value '
                + 'output is not type dict as required.')
        # return the results
        return [output]

    def fetch_mol2_files_from_zinc(self, ctx, params):
        """
        :param params: instance of type "FetchZINCMol2Params" -> structure:
           parameter "workspace_id" of String, parameter "compoundset_ref" of
           type "obj_ref", parameter "over_write" of Long
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN fetch_mol2_files_from_zinc
        self._check_param(params, ['workspace_id', 'compoundset_ref'],
                          opt_param=['over_write'])
        over_write = params.get('over_write', False)
        compoundset = self.dfu.get_objects(
            {'object_refs': [params['compoundset_ref']]})['data'][0]['data']

        compoundset_copy = copy.deepcopy(compoundset)

        count = 0
        for compound in compoundset_copy.get('compounds'):
            if not compound.get('mol2_handle_ref') or over_write:
                temp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
                os.mkdir(temp_dir)
                mol2_file_path = os.path.join(temp_dir, compound.get('id'))
                inchikey = compound.get('inchikey')
                if zinc_db_util.inchikey_to_mol2(inchikey, mol2_file_path):
                    handle_id = self.dfu.file_to_shock({
                        'file_path': mol2_file_path,
                        'make_handle': True
                    })['handle']['hid']
                    compound['mol2_handle_ref'] = handle_id
                    compound['mol2_source'] = 'ZINC15'
                    count += 1
                else:
                    logging.warning(
                        'Cannot find Mol2 file from ZINC for {}'.format(
                            inchikey))

        if count:
            message = 'Successfully fetched {} Mol2 files from ZINC database'.format(
                count)
        else:
            message = 'Fetched 0 Mol2 files from ZINC database. The CompoundSet object remains unchanged.'

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             '',
                                             compoundset_copy,
                                             message=message)

        #END fetch_mol2_files_from_zinc

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method fetch_mol2_files_from_zinc return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
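
A hedged sketch of driving the uploader and downloader above. The workspace id, token, and file names are placeholders; the constructor also expects SDK_CALLBACK_URL in the environment, and ctx is normally supplied by the KBase SDK server, so this illustrates the call sequence rather than a runnable deployment.

# Hypothetical invocation; all values below are placeholders.
import os
os.environ.setdefault('SDK_CALLBACK_URL', 'http://localhost:5000')

impl = CompoundSetUtils({'scratch': '/kb/module/work/tmp'})
ctx = {'token': 'xxxx'}  # placeholder context object

upload_params = {
    'workspace_id': 12345,                   # placeholder workspace id
    'staging_file_path': 'compounds.tsv',    # .tsv or .sdf in the staging area
    'compound_set_name': 'my_compounds',
    # 'mol2_staging_file_path': 'mol2_files.zip',  # optional packed Mol2 files
}
upload_output = impl.compound_set_from_file(ctx, upload_params)[0]

download_params = {
    'compound_set_ref': upload_output['compoundset_ref'],
    'output_format': 'sdf',                  # 'sdf' or 'tsv'; other values go to write_mol_dir
}
download_output = impl.compound_set_to_file(ctx, download_params)[0]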
Example #3
class AttributesUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """

        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]

        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id": output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):

        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping is missing instances: [{}]'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts a compound set object data to a dataframe
        """

        attributes = pd.DataFrame(data['attributes'])
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        print(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Name', 'Assay Name' and {isa_df.columns[0]}"
            )
        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}.'
                f' See the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term accession is a web link, only grab the last bit.
                # Similarly, sometimes the number is prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        ":").unique()
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: text to match
        :param closest: if False, the term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied attributes and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
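
A minimal usage sketch for AttributesUtil. The config keys mirror those read in __init__, but every URL, token, and id below is a placeholder (and DataUtil may require further keys), so this only illustrates the expected inputs and outputs.

# Hypothetical driver; all URLs, tokens and ids are placeholders.
config = {
    'workspace-url': 'https://<kbase-host>/services/ws',
    'SDK_CALLBACK_URL': 'http://localhost:5000',
    'KB_AUTH_TOKEN': 'xxxx',
    'shock-url': 'https://<kbase-host>/services/shock-api',
    'srv-wiz-url': 'https://<kbase-host>/services/service_wizard',
    'search-url': 'https://<kbase-host>/services/searchapi',
    'scratch': '/kb/module/work/tmp',
}
util = AttributesUtil(config)

# Upload a TSV/Excel file already in scratch space as an AttributeMapping object.
am_ref = util.file_to_attribute_mapping({
    'input_file_path': '/kb/module/work/tmp/attributes.xlsx',
    'output_ws_id': 12345,
    'output_obj_name': 'my_attribute_mapping',
})['attribute_mapping_ref']

# Dump the saved object back out as TSV.
_id, files = util.to_tsv({'input_ref': am_ref,
                          'destination_dir': '/kb/module/work/tmp'})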
Example #4
class CompMolNWChem:
    '''
    Module Name:
    CompMolNWChem

    Module Description:
    A KBase module: CompMolNWChem
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/nkkchem/CompMolNWChem.git"
    GIT_COMMIT_HASH = "0e157ea3f395c544a04c1542473be59ec39129ef"

    #BEGIN_CLASS_HEADER
    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        #log('start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'DESeq2_result.zip')
        plot_file = os.path.join(output_directory, 'DESeq2_plot.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or
                            file.endswith('.png') or
                            file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), 
                                       os.path.join(os.path.basename(root), file))

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by DESeq2 App'})

        with zipfile.ZipFile(plot_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if file.endswith('.png'):
                        zip_file.write(os.path.join(root, file), 
                                       os.path.join(os.path.basename(root), file))

        output_files.append({'path': plot_file,
                             'name': os.path.basename(plot_file),
                             'label': os.path.basename(plot_file),
                             'description': 'Visualization plots by DESeq2 App'})

        return output_files

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None):
        """Save compound set to the workspace and make report"""
        info = self.dfu.save_objects(
            {'id': ws_id,
             "objects": [{
                 "type": "KBaseBiochem.CompoundSet",
                 "data": compoundset,
                 "name": compoundset['name']
             }]})[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        if not message:
            message = 'Imported %s as %s' % (source, info[1])
        report_params = {
            'objects_created': [{'ref': compoundset_ref,
                                 'description': 'Compound Set'}],
            'message': message,
            'workspace_name': info[7],
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'compoundset_ref': compoundset_ref}
        return output

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.comp = CompoundSetUtils(self.callback_url)
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #self.scratch = config['scratch']

        #END_CONSTRUCTOR
        pass


    def run_CompMolNWChem(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_CompMolNWChem

        # Initial Tests to Check for Proper Inputs

        for name in ['Input_File', 'calculation_type', 'workspace_id']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['Input_File'], str):
            raise ValueError('Input_File must be a string')

        
        # Load the tsv file into a compound set using DataFileUtil methods
        
        scratch_file_path = self.dfu.download_staging_file({'staging_file_subdir_path':params['Input_File']}
                                       ).get('copy_file_path')

        #print('Scratch File Path: ',scratch_file_path)

        mol2_file_dir = None        
        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        #DEBUG::
        #print('Compounds:',compounds)

        compoundset = {
            'id': params['Input_File'],
            'name': params['Input_File'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        # Finish Reading in Compound Set
        
        # Read ids and smiles from compound set for nwchem input
        
        ids = []
        smiles = []

        for d in compounds:
            ids.append(d['id'])
            smiles.append(d['smiles'])
        #print(ids)
        #print(smiles)

        # Generate the NWChem DFT inputs from the ids and SMILES strings and run the calculations

        its.inchi_to_dft(ids, smiles)

        #DEBUG::
        #os.system('pwd')
        #os.system('ls')
        
        # Parse each compound's NWChem output file; chdir to an absolute path on every
        # iteration so the working directory does not drift between compounds
        # (see the standalone sketch after this class for a chdir-free variant)
        base_dir = os.getcwd()
        for compound_id in ids:
            os.chdir(os.path.join(base_dir, compound_id, 'dft'))
            out_file = compound_id + '_nwchem.out'
            with open(out_file, 'r') as file1:
                nAtoms = mul.getNumberOfAtoms(file1)
                energy = mul.getInternalEnergy0K(file1)
                charge = mul.getMullikenCharge(file1, nAtoms)

            mul.nAtoms = nAtoms
            mul.E0K = energy

            mul.calculate(compound_id)
        os.chdir(base_dir)

        # Build KBase Output. Should output entire /simulation directory and build a CompoundSet with Mol2 Files

        result_directory = '/simulation/'

        ## Build CompoundSet with Mol2 Files... similarly to fetch_mol2_files_from_zinc (CompoundSetUtils)....

        compoundset_copy = copy.deepcopy(compoundset)

        count = 0

        for compound in compoundset_copy.get('compounds'):
            if not compound.get('mol2_handle_ref'):
                mol2_file_path = result_directory + compound.get('id')
                SMILES = compound.get('smiles')

                shutil.move(mol2_file_path, self.scratch)

                os.chdir(self.scratch)

                mol2_file_path = os.path.join(self.scratch, compound.get('id'),
                                              'dft', compound.get('id') + '_Mulliken.mol2')
                handle_id = self.dfu.file_to_shock({'file_path': mol2_file_path,
                                                    'make_handle': True})['handle']['hid']
                print('Handle ID:', handle_id)
                compound['mol2_handle_ref'] = handle_id
                count += 1

               
               
        if count:
            message = 'Successfully fetched {} Mol2 files from Staging Path'.format(count)
        else:
            message = 'No new Mol2 files were attached to the CompoundSet'


        ## Create Extended Report

        output_files = self._generate_output_file_list(self.scratch)


        report_params = {'message': message,
                         'workspace_id': params['workspace_id'],
                         'objects_created': [],
                         'file_links': output_files,
                         'report_object_name': 'CompMolNWChem_report_' + str(uuid.uuid4())}

        report = KBaseReport(self.callback_url)
        
        report_info = report.create_extended_report(report_params)

        # Save the updated CompoundSet and fold its reference into the single output
        # dict that the generated wrapper below type-checks and returns
        save_output = self._save_to_ws_and_report(
            params['workspace_id'], '', compoundset_copy,
            message=message)

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'compoundset_ref': save_output.get('compoundset_ref'),
        }


        #END run_CompMolNWChem

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_CompMolNWChem return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
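A minimal, self-contained sketch of the per-compound output parsing that run_CompMolNWChem performs above; it builds absolute paths instead of calling os.chdir, so the working directory never drifts between compounds. The parse_energy helper and the 'Total DFT energy' marker are illustrative assumptions, not part of the original app.

import os


def parse_energy(out_path):
    """Toy stand-in for the Mulliken/energy parsers used in the example above."""
    with open(out_path, 'r') as handle:
        for line in handle:
            if 'Total DFT energy' in line:          # assumed NWChem output marker
                return float(line.split()[-1])
    return None


def collect_energies(base_dir, compound_ids):
    """Read <base_dir>/<id>/dft/<id>_nwchem.out for every compound id."""
    energies = {}
    for compound_id in compound_ids:
        out_path = os.path.join(base_dir, compound_id, 'dft',
                                compound_id + '_nwchem.out')
        if os.path.isfile(out_path):
            energies[compound_id] = parse_energy(out_path)
    return energies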
Example #5
class ImportFBAModelUtil:
    def _call_sbml_tools(self, params):
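        # Two-step import via SBMLTools: sbml_importer loads the raw SBML file into
        # the workspace, then integrate_model maps it onto the ModelSEED 'gramneg'
        # template and returns the integrated FBAModel id together with its report.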

        try:
            # calling SBMLTools.sbml_importer without genome
            sbml_importer_params = dict()
            sbml_importer_params['sbml_local_path'] = params.get(
                'model_file').get('path')
            sbml_importer_params['automatically_integrate'] = 1
            sbml_importer_params['remove_boundary'] = 1
            for param_name in ['workspace_name', 'model_name', 'biomass']:
                sbml_importer_params[param_name] = params.get(param_name)
            log('start executing SBMLTools.sbml_importer with {}'.format(
                sbml_importer_params))
            sbml_importer_ret = self.SBMLTools.sbml_importer(
                sbml_importer_params)
            log('SBMLTools.sbml_importer returned {}'.format(
                sbml_importer_ret))
        except Exception as err:
            raise ValueError(
                'Unexpected error calling SBMLTools.sbml_importer: {}'.format(err))

        try:
            # calling SBMLTools.integrate_model
            integrate_model_params = dict()
            integrate_model_params['biomass_reactions'] = ''
            integrate_model_params['compartment_translation'] = list()
            integrate_model_params['compound_mappings'] = ''
            integrate_model_params['create_extracellular'] = 0
            integrate_model_params['fill_metadata'] = 1
            integrate_model_params['gene_mappings'] = ''
            integrate_model_params['remove_boundary'] = 1
            integrate_model_params['template_id'] = 'gramneg'
            integrate_model_params['translate_database'] = 'modelseed'

            integrate_model_params['workspace_name'] = params.get(
                'workspace_name')
            integrate_model_params['model_name'] = params.get('model_name')
            integrate_model_params['output_model_name'] = params.get(
                'model_name')
            integrate_model_params['output_media_name'] = params.get(
                'model_name') + '.media'
            integrate_model_params['genome_id'] = params.get('genome')

            log('start executing SBMLTools.integrate_model with {}'.format(
                integrate_model_params))
            sbml_integrate_model_ret = self.SBMLTools.integrate_model(
                integrate_model_params)
            log('SBMLTools.integrate_model returned {}'.format(
                sbml_integrate_model_ret))

            fbamodel_id = sbml_integrate_model_ret['fbamodel_id']
            report_name = sbml_integrate_model_ret['report_name']
            report_ref = sbml_integrate_model_ret['report_ref']
        except Exception as err:
            raise ValueError(
                'Unexpected error calling SBMLTools.integrate_model: {}'.format(err))

        return fbamodel_id, report_name, report_ref

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.SBMLTools = SBMLTools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fbamodel_from_staging(self, params):

        log('--->\nrunning {}.{}\n params:\n{}'.format(
            self.__class__.__name__,
            sys._getframe().f_code.co_name, json.dumps(params, indent=1)))

        self._check_param(params, [
            'model_file', 'file_type', 'workspace_name', 'model_name',
            'biomass'
        ], ['genome', 'compounds_file'])
        if params['file_type'] == 'tsv' and not params.get(
                'compounds_file', None):
            raise ValueError('A compound file is required for tsv upload.')

        fba_tools_params = params.copy()
        for infile in ['model_file', 'compounds_file']:
            if not params.get(infile, None):
                continue
            download_staging_file_params = {
                'staging_file_subdir_path': params[infile]
            }
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
            fba_tools_params[infile] = {'path': scratch_file_path}

        report_name = None
        report_ref = None
        if params['file_type'] == 'sbml':
            fbamodel_id, report_name, report_ref = self._call_sbml_tools(
                fba_tools_params)
            res = dict()
            res['ref'] = fbamodel_id
            # res = self.fba.sbml_file_to_model(fba_tools_params)
        elif params['file_type'] == 'excel':
            res = self.fba.excel_file_to_model(fba_tools_params)
        elif params['file_type'] == 'tsv':
            res = self.fba.tsv_file_to_model(fba_tools_params)
        else:
            raise ValueError('"{}" is not a valid import file_type'.format(
                params['file_type']))

        return {
            'obj_ref': res['ref'],
            'report_name': report_name,
            'report_ref': report_ref
        }

    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError(
                    'Required parameter "{}" is missing'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                print('WARNING: received unexpected parameter "{}"'.format(param))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference (return of
                                                        import_fbamodel_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        upload_message += "FBAModel Object Name: "
        upload_message += params['model_name'] + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('model_file'))

        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported FBAModel'
            }],
            'workspace_name':
            params.get('workspace_name'),
            'report_object_name':
            'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
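A hypothetical driver for ImportFBAModelUtil above, assuming it runs inside a KBase SDK job where SDK_CALLBACK_URL and KB_AUTH_TOKEN are set in the environment; the staging path, workspace, model, genome and biomass values are made-up examples, not values taken from the original code.

import os


def import_sbml_model_example():
    config = {'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
              'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN']}
    importer = ImportFBAModelUtil(config)
    result = importer.import_fbamodel_from_staging({
        'model_file': 'subdir_1/my_model.xml',   # path relative to the staging area
        'file_type': 'sbml',
        'workspace_name': 'my_workspace',
        'model_name': 'my_model',
        'biomass': 'bio1',
        'genome': 'my_genome',
    })
    return importer.generate_report(result['obj_ref'],
                                    {'model_name': 'my_model',
                                     'model_file': 'subdir_1/my_model.xml',
                                     'workspace_name': 'my_workspace'})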
Example #6
class ImportExpressionMatrixUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
                                    KBaseFeatureValues.tsv_file_to_matrix

        required params:
            staging_file_subdir_path: subdirectory file path
              e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name
            matrix_name: output Expression Matrix object name
            workspace_name: workspace name/ID of the object

        optional params:
            genome_ref: optional reference to a Genome object that will be
                  used for mapping feature IDs to genome features
            fill_missing_values: optional flag for filling in missing
                    values in matrix (default value is false)
            data_type: optional field, one of 'untransformed',
                    'log2_level', 'log10_level', 'log2_ratio', 'log10_ratio' or
                    'unknown' (default is 'unknown')
            data_scale: optional parameter (default value is '1.0')

        return:
            obj_ref: return object reference
        '''

        log('--->\nrunning ImportExpressionMatrixUtil.import_tsv_as_expression_matrix_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(
            params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_matrix_params = params.copy()
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'),
            ref.get('output_matrix_ref'))
        returnVal = {'obj_ref': ref.get('output_matrix_ref')}

        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(
            self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
                    validates params passed to import_tsv_as_expression_matrix_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_tsv_as_expression_matrix_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
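The importer above hands the TSV off to KBaseFeatureValues, but the result uses a layout like the row_ids/col_ids/values structure that the BIOM/TSV matrix example further down builds by hand. A minimal, self-contained sketch of that conversion with pandas, on a tiny made-up table:

import io

import pandas as pd


def tsv_to_matrix_dict(tsv_text):
    """Parse a small TSV table into a row_ids/col_ids/values dict."""
    df = pd.read_csv(io.StringIO(tsv_text), sep='\t', index_col=0)
    df = df.astype(float).fillna(0)
    return {'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()}


example_tsv = ("gene_id\tcondition_A\tcondition_B\n"
               "gene_1\t1.5\t2.0\n"
               "gene_2\t0.0\t3.2\n")
print(tsv_to_matrix_dict(example_tsv))
# {'row_ids': ['gene_1', 'gene_2'], 'col_ids': ['condition_A', 'condition_B'],
#  'values': [[1.5, 2.0], [0.0, 3.2]]}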
Example #7
class BiomUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.matrix_types = [x.split(".")[1].split('-')[0]
                             for x in self.data_util.list_generic_types()]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """
        #exit(params)  {'obj_type': 'AmpliconMatrix', 'matrix_name': 'test_AmpliconMatrix', 'workspace_name': 'man4ish_gupta:narrative_1568644342277', 'biom_fasta': {'biom_file_biom_fasta': 'data/phyloseq_test.biom', 'fasta_file_biom_fasta': 'data/phyloseq_test.fa'}, 'scale': 'raw', 'description': 'OTU data', 'amplicon_set_name': 'test_AmpliconSet', 'col_attributemapping_ref': '44071/33/54'}

        (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name,
                                                    workspace_id, scale, description, metadata_keys)

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseMatrices.{}'.format(obj_type),
                                                'obj_name': matrix_name,
                                                'data': amplicon_data,
                                                'workspace_name': workspace_id})['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(biom_file, tsv_file, fasta_file, mode,
                                                            refs, description, matrix_obj_ref)

        logging.info('start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseExperiments.AmpliconSet',
                                                'obj_name': amplicon_set_name,
                                                'data': amplicon_set_data,
                                                'workspace_name': workspace_id})['obj_ref']

        logging.info('start resaving Matrix object with amplicon set: {}'.format(matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseMatrices.{}'.format(obj_type),
                                                'obj_name': matrix_name,
                                                'data': amplicon_data,
                                                'workspace_name': workspace_id})['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref,
                     'amplicon_set_obj_ref': amplicon_set_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref,
                                              new_row_attr_ref, new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')
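        # Exactly one of four file groups is expected from the import form:
        # 'biom_tsv', 'biom_fasta', 'tsv_fasta' or 'tsv'. Each referenced staging
        # file is downloaded to scratch and (biom_file, tsv_file, fasta_file,
        # mode, metadata_keys) is returned for the parsers below.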

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]

            mode = 'tsv'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys)))

    def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False):

        #exit(tsv_metadata_df)  defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fdb3037f378>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']})
        #exit(key) taxonomy
        #exit(biom_metadata_dict) none
        if key in biom_metadata_dict:
            return {k.lower(): v for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {"value": scientific_name}},
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {"value": scientific_name}
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')
        #exit(taxon_id)  561_taxon
        return taxon_id

    def _fetch_taxon_level(self, taxon_char):

        taxon_level_mapping = {'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum',
                               'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus',
                               's': 'Species'}
        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        #exit(datarow) defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7f7ca8e8d950>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']})
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        #exit(lineage)  ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']
        if isinstance(lineage, str):
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]
            #exit(lineage)  ['k__Bacteria', 'k__Bacteria']
        taxonomy = {'lineage': lineage}
        

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val
        #exit(key) species_name
        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({'taxon_ref': taxon_ref,
                                     'taxon_id': taxon_id,
                                     'scientific_name': scientific_name,
                                     'taxon_level': taxon_level})
                    break
        #exit(taxonomy) {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}
        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):              #tsv file is data/amplicon_test.tsv
        amplicons = dict()
        
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')
        
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        #tsvfile = data/amplicon_test.tsv
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")      #{'GG_OTU_1' : SeqRecord(...), ...}
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        #exit(biom_file)  data/phyloseq_test.biom
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}}
        '''
        

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        '''
        {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description,
                                   matrix_obj_ref):

        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError('error parsing _file_to_amplicon_set_data, mode: {}'.format(mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(matrix_obj_ref_array[0],
                                                                  matrix_obj_ref_array[1])
        '''
        {'amplicons': {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}}, 'description': 'OTU data', 'amplicon_matrix_ref': '44071/21'}
        '''
        return amplicon_set_data

    def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id,
                               scale, description, metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {'row_ids': table._observation_ids.tolist(),
                           'col_ids': table._sample_ids.tolist(),
                           'values': table.matrix_data.toarray().tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError('Cannot parse file. Please provide a valid TSV file')
            else:
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError('TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found some non-numeric values. The matrix may contain only numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {'row_ids': df.index.tolist(),
                               'col_ids': df.columns.tolist(),
                               'values': df.values.tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id, metadata_df))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError('error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [f'{k}|{v}' for k, v in amplicon_data['attributes'].items()]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description
        '''
        {'col_attributemapping_ref': '44071/33/24', 'row_attributemapping_ref': '44071/19/119', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5'}, 'col_mapping': {'Sample1': 'Sample1', 'Sample2': 'Sample2', 'Sample3': 'Sample3', 'Sample4': 'Sample4', 'Sample5': 'Sample5', 'Sample6': 'Sample6'}, 'attributes': {'generated_by': 'QIIME revision XYZ'}, 'data': {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}, 'search_attributes': ['generated_by|QIIME revision XYZ'], 'scale': 'raw', 'description': 'OTU data'}
        '''
        return amplicon_data

    def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs,  workspace_id,
                              metadata_df=None):
        '''
        getting mapping data based on refs or metadata or metadata_df
        '''
        #exit(metadata)
        '''
        (defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf730>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf9d8>, {'taxonomy': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf6a8>, {'taxonomy': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafd08>, {'taxonomy': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafea0>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}))

        '''

        #exit(matrix_data)  {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}
        #exit(matrix_name) test_AmpliconMatrix
        #exit(refs)  {'col_attributemapping_ref': '44071/33/51'}
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        #exit(axis_ids)  ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5']
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]}
            )['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(f"The following {name} IDs from the uploaded matrix do not match "
                                 f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                                 f"\nPlease verify the input data or upload an excel file with a"
                                 f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        '''
        {'row_attributemapping_ref': '44071/19/122', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5', 'GG_OTU_6': 'GG_OTU_6'}} 
        '''
        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        # 44071/19/128
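        # info is the workspace object_info tuple: info[6] = workspace id,
        # info[0] = object id, info[4] = version, giving a ws/obj/ver reference
        # like the sample above.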
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [str(meta[attr]) for attr in metadata_keys]

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        # 44071/19/134
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref,
                         new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        objects_created = [{'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix'},
                           {'ref': amplicon_set_obj_ref, 'description': 'Imported Amplicon Set'}]

        if new_row_attr_ref:
            objects_created.append({'ref': new_row_attr_ref,
                                    'description': 'Imported Amplicons(Row) Attribute Mapping'})

        if new_col_attr_ref:
            objects_created.append({'ref': new_col_attr_ref,
                                    'description': 'Imported Samples(Column) Attribute Mapping'})

        report_params = {'message': '',
                         'objects_created': objects_created,
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        #{'report_name': 'import_matrix_from_biom_db306341-c03a-4e60-b8a4-2bd7f6a48925', 'report_ref': '44071/200/1'}
        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):    #not going to be used anywhere
        logging.info('writing amplicon set data frame to tsv file')
        amplicon_set_obj = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):                       #not going to be used anywhere
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects({'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = ['taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
                        'taxonomy_source', 'species_name', 'consensus_sequence']
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source,
                                species_name, consensus_sequence])

        meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns)

        merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left',
                             validate='one_to_one')
        
        return merged_df
   
    def export_amplicon_set_tsv(self, params):   # not going to be called anywhere
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
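

# Hedged usage sketch (added, not part of the original example): how the matrix
# values and per-amplicon metadata handled by _amplicon_set_to_df combine into a
# single pandas DataFrame. All identifiers and numbers below are placeholders.
import pandas as pd

_counts = pd.DataFrame([[10, 0], [3, 7]],
                       index=['OTU_1', 'OTU_2'], columns=['sample_A', 'sample_B'])
_meta = pd.DataFrame([['k__Bacteria', 'ACGT']],
                     index=['OTU_1'], columns=['taxonomy', 'consensus_sequence'])
# a left join keeps every amplicon row even when its metadata is missing
_merged = _counts.merge(_meta, left_index=True, right_index=True, how='left',
                        validate='one_to_one')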
Example #8
class ImportEscherMapUtil:

    @staticmethod
    def validate_eschermap_params(params, expected, opt_param=set()):
        """
        Validates that required parameters are present.
        Warns if unexpected parameters appear
        """
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _save_escher_map(self, escher_data, workspace_id, escher_map_name):
        """
        save KBaseFBA.EscherMap to workspace
        """

        logging.info('start saving KBaseFBA.EscherMap')

        if not isinstance(workspace_id, int):
            logging.warning('Workspace id {} is not an integer; treating it as a workspace name'.format(workspace_id))

            try:
                workspace_id = self.dfu.ws_name_to_id(workspace_id)
            except Exception:
                raise ValueError('Cannot convert {} to valid workspace id'.format(workspace_id))

        info = self.dfu.save_objects({'id': workspace_id,
                                      'objects': [{'type': 'KBaseFBA.EscherMap',
                                                   'data': escher_data,
                                                   'name': escher_map_name}]})[0]

        return "%s/%s/%s" % (info[6], info[0], info[4])

    def _refactor_escher_data(self, escher_data):
        """
        refactor escher data to better fit KBaseFBA.EscherMap object
        """
        logging.info('start refactoring escher data')
        refactored_escher_data = copy.deepcopy(escher_data)

        refactored_escher_data[0]['map_name'] = "custom map"
        if 'authors' not in refactored_escher_data[0]:
            refactored_escher_data[0]['authors'] = []

        for rxn_uid in refactored_escher_data[1]['reactions']:
            rxn_node = refactored_escher_data[1]['reactions'][rxn_uid]
            rxn_node['reversibility'] = 1 if rxn_node['reversibility'] else 0

            for seg_uid in rxn_node['segments']:
                seg = rxn_node['segments'][seg_uid]
                if seg['b1'] is None:
                    del seg['b1']
                if seg['b2'] is None:
                    del seg['b2']

        for node_uid in refactored_escher_data[1]['nodes']:
            node = refactored_escher_data[1]['nodes'][node_uid]
            if 'node_is_primary' in node:
                node['node_is_primary'] = 1 if node['node_is_primary'] else 0

        refactored_escher_data = {
            "metadata" : refactored_escher_data[0],
            "layout" : refactored_escher_data[1]
        }

        # compare against the original two-element list form; the dict built
        # above would otherwise never compare equal to the raw input
        if [refactored_escher_data['metadata'], refactored_escher_data['layout']] == escher_data:
            logging.warning('No changes in escher data')

        return refactored_escher_data

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def import_eschermap_from_staging(self, params):
        """
          import_eschermap_from_staging: import a JSON file as KBaseFBA.EscherMap

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          escher_map_name: output KBaseFBA.EscherMap object name
          workspace_id: workspace ID

          return:
          obj_ref: return object reference
        """

        self.validate_eschermap_params(params, ['staging_file_subdir_path', 'escher_map_name',
                                                'workspace_id'])

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                                               download_staging_file_params).get('copy_file_path')

        try:
            with open(scratch_file_path) as f:
                escher_data = json.load(f)
        except Exception:
            raise ValueError('Failed to parse JSON file.')

        escher_data = self._refactor_escher_data(escher_data)

        obj_ref = self._save_escher_map(escher_data,
                                        params['workspace_id'],
                                        params['escher_map_name'])

        returnVal = {'obj_ref': obj_ref}

        return returnVal

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
        """
        logging.info('start generating report')

        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref],
                              'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Imported Escher Map Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(params['staging_file_subdir_path'])
        report_params = {'message': upload_message,
                         'objects_created': [{'ref': obj_ref,
                                              'description': 'Imported Escher Map'}],
                         'workspace_id': params['workspace_id'],
                         'report_object_name': 'kb_upload_methods_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
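

# Hedged usage sketch (added): driving ImportEscherMapUtil end to end. The config
# keys mirror the constructor above; every concrete value (callback URL, token,
# workspace id, file names) is a placeholder, not taken from the source. Raw
# Escher maps are assumed to arrive as the usual two-element JSON array
# [map_metadata, map_layout] that _refactor_escher_data reshapes into a dict.
def _example_import_escher_map():
    config = {
        'SDK_CALLBACK_URL': 'http://localhost:5000',  # placeholder callback URL
        'KB_AUTH_TOKEN': 'fake-token',                # placeholder token
    }
    util = ImportEscherMapUtil(config)
    params = {
        'staging_file_subdir_path': 'maps/my_map.json',  # hypothetical staging path
        'escher_map_name': 'my_escher_map',
        'workspace_id': 12345,                           # hypothetical workspace id
    }
    result = util.import_eschermap_from_staging(params)
    return util.generate_report(result['obj_ref'], params)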
Example #9
class ImportFBAModelUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fbamodel_from_staging(self, params):

        log('--->\nrunning {}.{}\n params:\n{}'.format(
            self.__class__.__name__,
            sys._getframe().f_code.co_name, json.dumps(params, indent=1)))

        self._check_param(params, [
            'model_file', 'file_type', 'workspace_name', 'model_name',
            'biomass'
        ], ['genome', 'compounds_file'])
        if params['file_type'] == 'tsv' and not params.get(
                'compounds_file', None):
            raise ValueError('A compound file is required for tsv upload.')

        fba_tools_params = params.copy()
        for infile in ['model_file', 'compounds_file']:
            if not params.get(infile, None):
                continue
            download_staging_file_params = {
                'staging_file_subdir_path': params[infile]
            }
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
            fba_tools_params[infile] = {'path': scratch_file_path}

        if params['file_type'] == 'sbml':
            res = self.fba.sbml_file_to_model(fba_tools_params)
        elif params['file_type'] == 'excel':
            res = self.fba.excel_file_to_model(fba_tools_params)
        elif params['file_type'] == 'tsv':
            res = self.fba.tsv_file_to_model(fba_tools_params)
        else:
            raise ValueError('"{}" is not a valid import file_type'.format(
                params['file_type']))
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            download_staging_file_params.get('staging_file_subdir_path'),
            res['ref'])
        return {'obj_ref': res['ref']}

    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError(
                    'Required parameter "{}" is missing'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                print(('WARNING: received unexpected parameter "{}"'.format(
                    param)))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_fbamodel_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        upload_message += "FBAModel Object Name: "
        upload_message += params['model_name'] + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('model_file'))

        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported FBAModel'
            }],
            'workspace_name':
            params.get('workspace_name'),
            'report_object_name':
            'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
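

# Hedged usage sketch (added): importing an SBML model with ImportFBAModelUtil.
# All literal values are placeholders; a 'compounds_file' is only required when
# file_type is 'tsv', as enforced in import_fbamodel_from_staging above.
def _example_import_fbamodel(config):
    util = ImportFBAModelUtil(config)
    params = {
        'model_file': 'models/my_model.sbml',  # hypothetical staging path
        'file_type': 'sbml',
        'workspace_name': 'my_workspace',      # hypothetical workspace
        'model_name': 'my_fba_model',
        'biomass': ['bio1'],                   # hypothetical biomass reaction id(s)
    }
    result = util.import_fbamodel_from_staging(params)
    return util.generate_report(result['obj_ref'], params)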
class ImportMetagenomeGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)
        self.scratch = os.path.join(config['scratch'],
                                    'import_Metagenome_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)

    def import_metagenome_gff_fasta_from_staging(self, params):
        """
        import_gff_fasta_from_staging: wrapper method for GenomeFileUtil.fasta_gff_to_genome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both fasta and gff files must be subdirectory file paths in the staging area
        e.g.
        for file: /data/bulk/user_name/file_name
        staging_file_subdir_path is file_name
        for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
        staging_file_subdir_path is subdir_1/subdir_2/file_name

        optional params:
        release: Release Or Version Of The Source Data
        genetic_code: Genetic Code For The Organism
        type: 'Reference', 'User upload', 'Representative'

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        """
        # logging.info('--->\nrunning ImportMetagenomeGFFFastaUtil.import_metagenome_gff_fasta_from_staging\n' +
        #              f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_metagenome_gff_fasta_from_staging_params(params)

        for key in ('fasta_file', 'gff_file'):
            file_path = params[key]
            download_staging_file_params = {
                'staging_file_subdir_path': file_path
            }
            dfu_returnVal = self.dfu.download_staging_file(
                download_staging_file_params)
            params[key] = {'path': dfu_returnVal['copy_file_path']}

        returnVal = self.gfu.fasta_gff_to_metagenome(params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(download_staging_file_params.get('staging_file_subdir_path'),
        #                                            returnVal['genome_ref'])
        return returnVal

    def validate_import_metagenome_gff_fasta_from_staging_params(self, params):
        """
        validate_import_metagenome_gff_fasta_from_staging_params:
                    validates params passed to import_gff_fasta_from_staging method
        """
        # check for required parameters

        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function available
        if str(params["workspace_name"]).isdigit():
            error_msg = '"{}" parameter is a workspace id and workspace name is required'.format(
                params["workspace_name"])
            raise ValueError(error_msg)

    def generate_html_report(self, genome_ref, params):
        """
        generate_html_report: generate HTML summary report
        """
        logging.info('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        genome_metadata = genome_info[10]

        source = genome_metadata.get('Source')
        num_contigs = genome_metadata.get('Number contigs')
        size = genome_metadata.get('Size')
        gc_content = genome_metadata.get('GC content')

        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(
            list(genome_data.get('feature_counts', {}).items()))

        genome_overview_data = collections.OrderedDict()

        genome_overview_data['Name'] = '{} ({})'.format(
            genome_name, genome_ref)
        #genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = '<br/><table>\n'
        for key, val in genome_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td></tr>\n'.format(val)
        overview_content += '</table>'

        feature_content = str(
            [[str(k), v]
             for k, v in list(genome_data.get('feature_counts', {}).items())
             if k != 'gene'])
        contig_content = str(
            [[str(c), l]
             for c, l in zip(genome_data.get('contig_ids', []),
                             genome_data.get('contig_lengths', []))])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_genome.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '*FEATURE_DATA*', feature_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for imported Annotated Metagenome Assembly'
        })
        return html_report

    def generate_report(self, genome_ref, params):
        """
        :param genome_ref: return value from GenomeFileUtil for the uploaded metagenome;
                           used to pull report warnings and messages.
        :return:
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{
            'ref':
            genome_ref,
            'description':
            'Imported Annotated Metagenome Assembly'
        }]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_metagenome_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
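

# Hedged usage sketch (added): importing an annotated metagenome assembly from
# paired GFF + FASTA staging files. Paths and names are placeholders; note that
# the validator above requires a workspace *name*, not a numeric id.
def _example_import_metagenome(config):
    util = ImportMetagenomeGFFFastaUtil(config)
    params = {
        'fasta_file': 'metagenome/assembly.fa',    # hypothetical staging path
        'gff_file': 'metagenome/annotations.gff',  # hypothetical staging path
        'genome_name': 'my_metagenome',
        'workspace_name': 'my_workspace',
    }
    result = util.import_metagenome_gff_fasta_from_staging(params)
    return util.generate_report(result['genome_ref'], params)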
Example #11
class UnpackFileUtil:
    def _staging_service_host(self):

        deployment_path = os.environ["KB_DEPLOYMENT_CONFIG"]

        parser = SafeConfigParser()
        parser.read(deployment_path)

        endpoint = parser.get('kb_uploadmethods', 'kbase-endpoint')
        staging_service_host = endpoint + '/staging_service'

        return staging_service_host
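
    # Added note (assumption): KB_DEPLOYMENT_CONFIG is expected to point at an
    # INI-style deploy.cfg containing something like
    #     [kb_uploadmethods]
    #     kbase-endpoint = https://<env>.kbase.us/services
    # so the staging service URL resolves to <kbase-endpoint>/staging_service.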

    def _file_to_staging(self, file_path_list, subdir_folder=None):
        """
        _file_to_staging: upload file(s) to staging area
        """
        subdir_folder_str = '/' if not subdir_folder else '/{}'.format(
            subdir_folder)
        staging_service_host = self._staging_service_host()
        end_point = staging_service_host + '/upload'
        headers = {'Authorization': self.token}

        files = {'destPath': subdir_folder_str}

        for file_path in file_path_list:
            files.update({
                'uploads': (os.path.basename(file_path), open(file_path, 'rb'))
            })

            resp = _requests.post(end_point, headers=headers, files=files)

            if resp.status_code != 200:
                raise ValueError(
                    'Upload file {} failed.\nError Code: {}\n{}\n'.format(
                        file_path, resp.status_code, resp.text))
            else:
                log("return message from server:\n{}\n".format(resp.text))

    def _remove_irrelevant_files(self, file_path):
        """
        _remove_irrelevant_files: remove irrelevant files
        """
        target_name = os.path.basename(file_path)
        file_dir = os.path.dirname(file_path)
        for dirpath, dirnames, filenames in os.walk(file_dir):
            for filename in filenames:
                if filename != target_name:
                    irrelevant_file_path = os.sep.join([dirpath, filename])
                    os.remove(irrelevant_file_path)
                    log('removing irrelevant file: {}'.format(
                        irrelevant_file_path))

    def _r_unpack(self, file_path, count):
        """
        _r_unpack: recursively unpack file_path
        """
        if count == 0:
            self._remove_irrelevant_files(file_path)

        count += 1
        if os.path.isfile(file_path):
            log('processing:      {}{}'.format('-' * count, file_path))
            t = magic.from_file(file_path, mime=True)

            if os.path.basename(file_path).endswith('.DS_Store'):
                os.remove(file_path)
                log('removing file:   {}{}'.format('-' * count, file_path))
            elif t in [
                    'application/' + x
                    for x in ('x-gzip', 'gzip', 'x-bzip', 'x-bzip2', 'bzip',
                              'bzip2', 'x-tar', 'tar', 'x-gtar', 'zip',
                              'x-zip-compressed')
            ]:
                file_dir = os.path.dirname(file_path)
                files_before_unpack = os.listdir(file_dir)
                self.dfu.unpack_file({'file_path': file_path}).get('file_path')
                files_after_unpack = os.listdir(file_dir)
                new_files = [
                    item for item in files_after_unpack
                    if item not in files_before_unpack
                ]
                for new_file in new_files:
                    self._r_unpack(os.sep.join([file_dir, new_file]), count)
                os.remove(file_path)
                log('removing file:   {}{}'.format('-' * count, file_path))
            else:
                return file_path
        else:
            if os.path.basename(file_path).startswith('_'):
                shutil.rmtree(file_path, ignore_errors=True)
                log('removing folder: {}{}'.format('-' * count, file_path))
            else:
                for dirpath, dirnames, filenames in os.walk(file_path):
                    for filename in filenames:
                        self._r_unpack(os.sep.join([dirpath, filename]), count)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.user_id = config['USER_ID']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)

    def unpack_staging_file(self, params):
        """
        Unpack a staging area file

        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name

        result:
        unpacked_file_path: unpacked file path(s) in staging area

        """

        log('--->\nrunning UnpackFileUtil.unpack_staging_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        scratch_file_path = self.dfu.download_staging_file(params).get(
            'copy_file_path')

        self._r_unpack(scratch_file_path, 0)
        unpacked_file_path_list = []
        for dirpath, dirnames, filenames in os.walk(
                os.path.dirname(scratch_file_path)):
            for filename in filenames:
                unpacked_file_path_list.append(os.sep.join([dirpath,
                                                            filename]))

        log("Unpacked files:\n  {}".format(
            '\n  '.join(unpacked_file_path_list)))

        self._file_to_staging(
            unpacked_file_path_list,
            os.path.dirname(params.get('staging_file_subdir_path')))

        unpacked_file_path = ','.join(unpacked_file_path_list)
        returnVal = {'unpacked_file_path': unpacked_file_path}

        return returnVal

    def unpack_web_file(self, params):
        """
        Download and unpack a web file to staging area

        params:
        file_url: file URL
        download_type: one of ['Direct Download', 'FTP',
                    'DropBox', 'Google Drive']

        result:
        unpacked_file_path: unpacked file path(s) in staging area

        """
        log('--->\nrunning UnpackFileUtil.unpack_web_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        scratch_file_path = self.dfu.download_web_file(params).get(
            'copy_file_path')

        self._r_unpack(scratch_file_path, 0)
        unpacked_file_path_list = []
        for dirpath, dirnames, filenames in os.walk(
                os.path.dirname(scratch_file_path)):
            for filename in filenames:
                unpacked_file_path_list.append(os.sep.join([dirpath,
                                                            filename]))

        log("Unpacked files:\n  {}".format(
            '\n  '.join(unpacked_file_path_list)))

        self._file_to_staging(unpacked_file_path_list)
        unpacked_file_path = ','.join(unpacked_file_path_list)
        returnVal = {'unpacked_file_path': unpacked_file_path}

        return returnVal

    def generate_report(self, unpacked_file_path, params):
        """
        generate_report: generate summary report

        unpacked_file_path: generated unpacked file path(s) in staging area.
                  (return of unpack_staging_file or unpack_web_file)

        """

        log("generating report")
        uuid_string = str(uuid.uuid4())
        unpacked_file_path_list = unpacked_file_path.split(',')

        subdir = os.path.dirname(
            params.get('staging_file_subdir_path')) + '/' if params.get(
                'staging_file_subdir_path') else '/'

        upload_message = 'Uploaded Files: {}\n'.format(
            len(unpacked_file_path_list))
        for file_path in unpacked_file_path_list:
            upload_message += subdir + os.path.basename(file_path) + '\n'

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
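

# Hedged usage sketch (added): unpacking an archive that already sits in the
# user's staging area. _r_unpack keeps recursing while extracted members are
# themselves archives, then every resulting file is pushed back to the staging
# area next to the original archive. All values below are placeholders.
def _example_unpack_staging_file(config):
    util = UnpackFileUtil(config)
    params = {
        'staging_file_subdir_path': 'archives/reads_bundle.tar.gz',  # hypothetical path
        'workspace_name': 'my_workspace',
    }
    result = util.unpack_staging_file(params)
    return util.generate_report(result['unpacked_file_path'], params)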
Example #12
class CompMolNWChem_Thermo:
    '''
    Module Name:
    CompMolNWChem_Thermo

    Module Description:
    A KBase module: CompMolNWChem_Thermo
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/nkkchem/CompMolNWChem_Thermo.git"
    GIT_COMMIT_HASH = "7e7b026e26614c14a308141a82acbb2ee913c1e3"

    #BEGIN_CLASS_HEADER
    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        #log('start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'Thermo_Result.zip')
        plot_file = os.path.join(output_directory, 'Thermo_Plot.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or file.endswith('.png')
                            or file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by CompMolNWChem_Thermo App'
        })

        with zipfile.ZipFile(plot_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if file.endswith('.png'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path':
            plot_file,
            'name':
            os.path.basename(plot_file),
            'label':
            os.path.basename(plot_file),
            'description':
            'Plot(s) generated by CompMolNWChem_Thermo App'
        })

        return output_files

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None):
        """Save compound set to the workspace and make report"""
        info = self.dfu.save_objects({
            'id':
            ws_id,
            "objects": [{
                "type": "KBaseBiochem.CompoundSet",
                "data": compoundset,
                "name": compoundset['name']
            }]
        })[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        if not message:
            message = 'Imported %s as %s' % (source, info[1])
        report_params = {
            'objects_created': [{
                'ref': compoundset_ref,
                'description': 'Compound Set'
            }],
            'message':
            message,
            'workspace_name':
            info[7],
            'report_object_name':
            'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'compoundset_ref': compoundset_ref
        }
        return output

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.comp = CompoundSetUtils(self.callback_url)
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #self.scratch = config['scratch']

        #END_CONSTRUCTOR
        pass

    def run_CompMolNWChem_Thermo(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_CompMolNWChem_Thermo
        #--------------------------------------------------------------------------------------------------------------------
        # Initial Tests to Check for Proper Inputs

        for name in ['Input_File', 'Input_Method', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '"is required but missing')
        if not isinstance(params['Input_File'], str):
            raise ValueError('Input_File must be a string')

        # -------------------------------------------------------------------------------------------------------------
        # Check the Input Method and set Equation_Input accordingly

        if params['Input_Method'] == "file":

            Equation_Input = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params['Input_File']
            }).get('copy_file_path')

        else:

            Reaction = params['Input_File']
            with open('nwchem-scripts/EC_modelseed_ID_reactions.csv'
                      ) as csv_input:

                Reaction_Frame = pd.read_csv(csv_input)
                length = len(Reaction_Frame.index)

                for itr in range(0, length):

                    if Reaction == Reaction_Frame.EC.iloc[itr]:
                        Equation_Input = Reaction_Frame.reaction.iloc[itr]
                    elif Reaction == Reaction_Frame.rxn_ID.iloc[itr]:
                        Equation_Input = Reaction_Frame.reaction.iloc[itr]

            Eq_List = [Equation_Input]
            with open('Reaction.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(Eq_List)
            Equation_Input = 'Reaction.csv'

        mol2_file_dir = None
        ext = os.path.splitext(Equation_Input)[1]
        file_name = os.path.basename(Equation_Input)

        #--------------------------------------------------------------------------------------------------------------------
        # SNAKEMAKE PIPELINE START, READ FILES IN AND CHECK EXISTING REACTIONS

        from snakemake import snakemake

        modelseedID_to_deltaG = {}
        calculated = []
        Done = False
        # fill with ModelSEED IDs and corresponding delta_G values of metabolites
        # This is the dictionary of all ModelSEED compounds with their delta_G values

        with open('nwchem-scripts/modelSeed_ID_delta_G_calculated.csv',
                  'r') as dfile:
            lines = dfile.readlines()
            for line in lines:
                key = line.split(',')[0]
                G = line.split(',')[1]
                modelseedID_to_deltaG[key] = float(G)
                calculated.append(key)

        modelseedID_to_deltaG['cpd00001'] += 2.38
        modelseedID_to_deltaG['cpd00067'] = -268.61

        keggID_to_charge = {}
        with open('Ph7_charge_modelseedID.csv', 'r') as dfile:
            lines = dfile.readlines()
            for i in lines:
                key = i.split(',')[0]
                charge = i.split(',')[1]
                keggID_to_charge[key] = charge
        # map metabolites to the reactions with stoichiometry

        reactionlist = Equation_Input

        with open(reactionlist, 'r') as f:
            reactions = f.readlines()[0].rstrip().replace(' ', '')
            reactants = reactions.split('<=>')[0].split('+')
            products = reactions.split('<=>')[1].split('+')

            charge_reactants = 0
            G_reactants = 0
            left = []
            stoich_left = []
            for reactant in reactants:
                stoich_left.append(
                    int(re.findall(r'\((\d+)\)cpd', reactant)[0]))
                # the bare compound id (e.g. cpd00001) is the regex match
                reactant = re.findall(r'[a-zA-Z]{3}\d{5}', reactant)[0]
                left.append(reactant.strip())

            charge_products = 0
            G_products = 0
            right = []
            stoich_right = []
            for product in products:
                stoich_right.append(
                    int(re.findall(r'\((\d+)\)cpd', product)[0]))
                # the bare compound id is the regex match
                product = re.findall(r'[a-zA-Z]{3}\d{5}', product)[0]
                right.append(product.strip())

            print(stoich_right, right)
            print(stoich_left, left)

        # Extract delta_g for reactions for which we have performed calculations

        if (set(left).issubset(set(calculated))) and set(right).issubset(
                set(calculated)):
            for jj in range(len(left)):
                G_reactants += stoich_left[jj] * float(
                    modelseedID_to_deltaG[left[jj]])
                charge_reactants += stoich_left[jj] * float(
                    keggID_to_charge[left[jj]])

            for kk in range(len(right)):
                G_products += stoich_right[kk] * float(
                    modelseedID_to_deltaG[right[kk]])
                charge_products += stoich_right[kk] * float(
                    keggID_to_charge[right[kk]])

            charge = float(charge_products - charge_reactants)
            G = float(G_products - G_reactants)
            if charge == -1:
                G = G - 268.61
            if charge == -2:
                G = G - 268.61 * 2
            if charge == -3:
                G = G - 268.61 * 3
            if charge == +1:
                G = G + 268.61 * 1
            if charge == +2:
                G = G + 268.61 * 2
            if charge == +3:
                G = G + 268.61 * 3
            if charge == +4:
                G = G + 268.61 * 4
            if charge == -4:
                G = G - 268.61 * 4
            if charge == 0:
                G = G

            print("Reaction free energy for given reaction is: ", G)
            Done = True
        else:
            print('Calculations are not finished for one of the metabolites')

        # If the delta_G for the reaction is not available yet, run the snakemake
        # pipeline to calculate the reaction free energy.

        # collect the metabolite ids up front so they are defined for the
        # result-extraction loop below regardless of which branch runs
        metabolites = [each.strip() for each in left + right]

        if not Done:

            id_to_smiles = {}
            # read the ModelSEED ID -> SMILES lookup once, outside the loop
            with open('/kb/module/modelSeed_ID_SMILES.csv', 'r') as data:
                for line in data.readlines():
                    id = line.split(',')[0]
                    smiles = line.split(',')[1].rstrip()
                    id_to_smiles[id] = smiles

            # Check to see if cache data already exists for these compounds
            for molecule in metabolites:

                moldir = molecule

                cache_data = {'Metabolites': moldir}
                cache_id = caching.get_cache_id(ctx['token'], cache_data)
                result = caching.download_cache_string(ctx['token'], cache_id)
                if not result or not result.strip():
                    print('None')
                else:
                    print('Some')

            for molecule in metabolites:

                moldir = molecule
                if not os.path.exists(moldir):
                    os.mkdir(moldir)

                initial_structure_dir = moldir + '/initial_structure'
                if not os.path.exists(initial_structure_dir):
                    os.mkdir(initial_structure_dir)

                md_structure_dir = moldir + '/md'
                if not os.path.exists(md_structure_dir):
                    os.mkdir(md_structure_dir)

                dft_structure_dir = moldir + '/dft'
                if not os.path.exists(dft_structure_dir):
                    os.mkdir(dft_structure_dir)

                inchifile_str = initial_structure_dir + '/' + moldir + '.smiles'
                with open(inchifile_str, 'w+') as f:
                    f.write(id_to_smiles[moldir])

            os.system(
                'snakemake -p --cores 3 --snakefile snakemake-scripts/final_pipeline.snakemake -w 12000'
            )

        else:
            print('Calculation already known, moving on')

        # Build KBase Output. Should output entire /simulation directory and build a CompoundSet with Mol2 Files

        #result_directory = '/kb/module/snakemake-scripts'
        result_directory = '/kb/module/dft'

        #-----------------------------------------------------------------------------------------------------------------
        # Extract the information from the results file

        df = pd.DataFrame()
        G = pd.DataFrame()
        for molecule in metabolites:

            moldir = molecule

            result_file = moldir + '_properties.dat'

            # Grab the third last line of the properties file

            with open('dft/' + result_file, 'r') as f:
                all_lines = f.readlines()
                length = len(all_lines)
                d = all_lines[(length - 3)]
                d = d.split()
                df = df.append(d)

            # Save the last number of the line to a new file.
            with open('dft/' + moldir + '_extracted_properties.dat', 'w') as f:

                string = str(df.iloc[-1].values)
                string = string.replace("['", "")
                string = string.replace("']", "")
                f.write(string)
                G_calc = float(string)

            # Cache The Results

            cache_data = {'Metabolite': moldir}
            result = {"Metabolite": moldir, "Free_Energy": G_calc}

            cache_id = CachingUtils.get_cache_id(ctx['token'], cache_data)

            CachingUtils.upload_to_cache(ctx['token'], cache_id, result)
        # -----------------------------------------------------------------------------------------------------------
        # Create Extended Report

        output_files = self._generate_output_file_list(result_directory)

        message = "Reaction free energy for given reaction is " + str(G)

        report_params = {
            'message': message,
            'workspace_id': params['workspace_id'],
            'objects_created': [],
            'file_links': output_files,
            'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4())
        }

        report = KBaseReport(self.callback_url)

        report_info = report.create_extended_report(report_params)

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }


        #END run_CompMolNWChem_Thermo

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_CompMolNWChem_Thermo return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
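

# Hedged sketch (added): the free-energy bookkeeping inside
# run_CompMolNWChem_Thermo reduces to stoichiometry-weighted sums over each side
# of the reaction plus a 268.61 shift per unit of net charge (the long if-chain
# above handles charges from -4 to +4 case by case). Placeholder numbers only.
def _example_reaction_delta_g(stoich_left, g_left, stoich_right, g_right, net_charge):
    g_reactants = sum(s * g for s, g in zip(stoich_left, g_left))
    g_products = sum(s * g for s, g in zip(stoich_right, g_right))
    return (g_products - g_reactants) + 268.61 * net_charge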
class ImportAttributeMappingUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.genapi = GenericsAPI(self.callback_url)

    def import_attribute_mapping_from_staging(self, params):
        """
          import_attribute_mapping_from_staging: wrapper method for
                                    GenericsAPI.file_to_fbamodel_attribute_mapping

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          attribute_mapping_name: output AttributeMapping object name
          workspace_name: workspace name/ID of the object

          return:
          obj_ref: return object reference
        """

        log('--->\nrunning ImportAttributeMappingUtil.import_attribute_mapping_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_attribute_mapping_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        ws_id = params['workspace_id']

        import_attribute_mapping_params = {
            'output_obj_name': params['attribute_mapping_name'],
            'output_ws_id': ws_id,
            'input_file_path': scratch_file_path
        }

        ref = self.genapi.file_to_fbamodel_attribute_mapping(
            import_attribute_mapping_params)

        returnVal = {'obj_ref': ref.get('attribute_mapping_ref')}

        return returnVal

    @staticmethod
    def validate_import_attribute_mapping_from_staging_params(params):
        """
        validate_import_attribute_mapping_from_staging_params:
                    validates params passed to import_attribute_mapping_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_id',
                'attribute_mapping_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_attribute_mapping_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "FBAModelSet Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported FBAModelSet'
            }],
            'workspace_id':
            params['workspace_id'],
            'report_object_name':
            'import_model_attri_mapping_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
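

# Hedged usage sketch (added): importing a TSV attribute mapping for FBAModels
# via GenericsAPI.file_to_fbamodel_attribute_mapping. Concrete names and ids are
# placeholders.
def _example_import_attribute_mapping(config):
    util = ImportAttributeMappingUtil(config)
    params = {
        'staging_file_subdir_path': 'mappings/model_attributes.tsv',  # hypothetical path
        'attribute_mapping_name': 'my_attribute_mapping',
        'workspace_id': 12345,                                        # hypothetical workspace id
    }
    result = util.import_attribute_mapping_from_staging(params)
    return util.generate_report(result['obj_ref'], params)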
Example #14
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {
            'fwd_file': fwd_file,
            'rev_file': rev_file
        }
        return fastq_file_path
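
    # Added note (assumption): with `fastq-dump --split-3 -T`, paired-end runs are
    # expected to land in <tmp_dir>/<sra_name>/1/fastq and .../2/fastq, while
    # single-end runs produce <tmp_dir>/<sra_name>/fastq; this layout is what
    # _check_fastq_dump_result and the renaming above rely on.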


    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean')
           or params.get('insert_size_std_dev')
           or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" is Paried End Reads specific'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads

        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_file_path_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: wrapper method for ReadsUtils.upload_reads

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'),
                                                   returnVal['obj_ref'])
        return returnVal

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: downloads SRA files from a web source, converts them to FASTQ and
                             uploads the reads via ReadsUtils.upload_reads

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                        download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))

            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params:
                    validates params passed to import_sra_from_web method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add must be a list')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs_list: generated workspace object references (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))

            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params, uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        pprint(params)

        tmp_dir = os.path.join(self.scratch, uuid_string)
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):

            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace('OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace('panel-collapse collapse', 'panel-collapse collapse in')

            objects_content += report_template
            base_percentages = ''
            for key, val in reads_data.get('base_percentages').items():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown')

            reads_single_genome = str(reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev', 'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(float(reads_data.get('number_of_duplicates') * 100) / \
                                                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)' \
                .format(str(reads_data.get('number_of_duplicates')),
                        dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(str(reads_data.get('qual_min')),
                                                                         str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.items():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.items():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###', overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###', stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('###TABLE_PANELS_CONTENT###',
                                                          objects_content)
                result_file.write(report_template)
        result_file.close()

        shutil.copytree(os.path.join(os.path.dirname(__file__), 'report_template_sra/bootstrap-3.3.7'),
                        os.path.join(tmp_dir, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(os.path.dirname(__file__), 'report_template_sra/jquery-3.2.1.min.js'),
                    os.path.join(tmp_dir, 'jquery-3.2.1.min.js'))

        matched_files = []
        for root, dirnames, filenames in os.walk(tmp_dir):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print(('Removing ' + gz_file))
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': tmp_dir,
                                                  'pack': 'zip'})['shock_id']
        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Imported Reads'})
        return html_report
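
The class above is typically driven from an SDK app implementation: the constructor takes the module config (callback URL, auth token, scratch directory) and the import methods take a params dict naming the staged SRA file or the source URLs. The sketch below is not part of the original module; it assumes a KBase SDK job environment and uses placeholder file, object and workspace names.

# Usage sketch for ImportSRAUtil: all names and values below are placeholders.
config = {
    'SDK_CALLBACK_URL': 'http://localhost:9999',  # callback URL provided by the SDK runtime
    'KB_AUTH_TOKEN': '<auth token>',              # placeholder auth token
    'scratch': '/kb/module/work/tmp',
}
sra_importer = ImportSRAUtil(config)

# Import a single SRA file from the user's staging area.
staging_params = {
    'staging_file_subdir_path': 'subdir_1/example_reads.sra',  # placeholder staging path
    'sequencing_tech': 'Illumina',
    'name': 'example_reads',
    'workspace_name': 'my_workspace',
}
upload_result = sra_importer.import_sra_from_staging(staging_params)

# generate_report expects a list of object refs plus the uploaded file names.
report = sra_importer.generate_report(
    [upload_result['obj_ref']],
    {'workspace_name': staging_params['workspace_name'],
     'uploaded_files': [staging_params['staging_file_subdir_path']]})

# The web variant takes one dict per URL in sra_urls_to_add.
web_params = {
    'download_type': 'Direct Download',
    'workspace_name': 'my_workspace',
    'sra_urls_to_add': [
        {'file_url': 'https://example.org/example_reads.sra',  # placeholder URL
         'sequencing_tech': 'Illumina',
         'name': 'example_reads_web'},
    ],
}
web_result = sra_importer.import_sra_from_web(web_params)
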
Example #15
class ImportGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='beta')
        self.uploader_utils = UploaderUtil(config)

    def import_gff_fasta_from_staging(self, params):
        """
        import_gff_fasta_from_staging: wrapper method for GenomeFileUtil.fasta_gff_to_genome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both fasta and gff files must be subdirectory file path in staging area
        e.g.
        for file: /data/bulk/user_name/file_name
        staging_file_subdir_path is file_name
        for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
        staging_file_subdir_path is subdir_1/subdir_2/file_name

        optional params:
        scientific_name: proper name for species, key for taxonomy lookup. Defaults to 'unknown_taxon'
        source: source of the data. Defaults to 'User'
        taxon_wsname: workspace where the reference taxons are stored. Defaults to 'ReferenceTaxons'
        taxon_reference: if defined, will try to link the Genome to the specified taxonomy object
        release: release or version of the source data
        genetic_code: genetic code for the organism
        type: 'Reference', 'User upload' or 'Representative'

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        """

        logging.info(
            '--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n'
            + f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_gff_fasta_from_staging_params(params)

        for key in ('fasta_file', 'gff_file'):
            file_path = params[key]
            download_staging_file_params = {
                'staging_file_subdir_path': file_path
            }
            dfu_returnVal = self.dfu.download_staging_file(
                download_staging_file_params)
            params[key] = {'path': dfu_returnVal['copy_file_path']}

        returnVal = self.gfu.fasta_gff_to_genome(params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(download_staging_file_params.get('staging_file_subdir_path'),
        #                                            returnVal['genome_ref'])
        return returnVal

    def validate_import_gff_fasta_from_staging_params(self, params):
        """
        validate_import_gff_fasta_from_staging_params:
                    validates params passed to import_gff_fasta_from_staging method
        """
        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function available
        if str(params["workspace_name"]).isdigit():
            error_msg = '"{}" parameter is a workspace id and workspace name is required'.format(
                params["workspace_name"])
            raise ValueError(error_msg)
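
The sketch below (not part of the original module) shows one way to call ImportGFFFastaUtil; it assumes the same placeholder KBase SDK config as the ImportSRAUtil sketch above. Note that the validator rejects numeric workspace IDs, so a workspace name must be supplied.

# Usage sketch for ImportGFFFastaUtil: all names below are placeholders.
config = {'SDK_CALLBACK_URL': 'http://localhost:9999',
          'KB_AUTH_TOKEN': '<auth token>',
          'scratch': '/kb/module/work/tmp'}
gff_importer = ImportGFFFastaUtil(config)

params = {
    'fasta_file': 'subdir_1/example_assembly.fasta',  # staging subdir path (placeholder)
    'gff_file': 'subdir_1/example_features.gff',      # staging subdir path (placeholder)
    'genome_name': 'example_genome',
    'workspace_name': 'my_workspace',  # must be a workspace name, not a numeric ID
}
result = gff_importer.import_gff_fasta_from_staging(params)
genome_ref = result['genome_ref']
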
Example #16
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)
        self.max_contigs_for_report = 200

    def import_fasta_as_assembly_from_staging(self, params):
        """
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        """
        logging.info(
            '--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        generate_html_report: generate html summary report
        """
        logging.info('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ['<br/><table>\n']
        for key, val in assembly_overview_data.items():
            overview_content.append(f'<tr><td><b>{key}</b></td>')
            overview_content.append(f'<td>{val}</td></tr>\n')
        overview_content.append('</table>')

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', ''.join(overview_content))
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)
        result_file.close()

        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        object_data = self.dfu.get_objects({'object_refs': [obj_ref]})

        report_params = {
            'workspace_name':
            params.get('workspace_name'),
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported Assembly'
            }],
            'report_object_name':
            f'kb_upload_assembly_report_{uuid.uuid4()}'
        }

        num_contigs = object_data['data'][0]['data']['num_contigs']
        if num_contigs > self.max_contigs_for_report:
            report_params['message'] = (
                "The uploaded assembly has too many contigs to display "
                "here. Click on the object for a dedicated viewer")
        else:
            output_html_files = self.generate_html_report(
                obj_ref, object_data, params)
            report_params.update({
                'html_links': output_html_files,
                'direct_html_link_index': 0,
                'html_window_height': 375,
            })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
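
A similar sketch for ImportAssemblyUtil (again not part of the original module, with placeholder names): the importer returns the new Assembly reference, and generate_report only embeds the HTML contig table when the assembly has no more than max_contigs_for_report (200) contigs.

# Usage sketch for ImportAssemblyUtil: all names below are placeholders.
config = {'SDK_CALLBACK_URL': 'http://localhost:9999',
          'KB_AUTH_TOKEN': '<auth token>',
          'scratch': '/kb/module/work/tmp'}
assembly_importer = ImportAssemblyUtil(config)

params = {
    'staging_file_subdir_path': 'subdir_1/example_assembly.fasta',  # placeholder
    'assembly_name': 'example_assembly',
    'workspace_name': 'my_workspace',
}
import_result = assembly_importer.import_fasta_as_assembly_from_staging(params)

# For assemblies with more than 200 contigs the report carries a short message
# instead of the embedded HTML contig table.
report = assembly_importer.generate_report(import_result['obj_ref'], params)
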
Example #17
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'],
                                    'import_GenBank_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='beta')
        self.uploader_utils = UploaderUtil(config)

    def import_genbank_from_staging(self, params):
        '''
          import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          genome_name - becomes the name of the object
          workspace_name - the name of the workspace it gets saved to.
          source - Source of the file typically something like RefSeq or Ensembl

          optional params:
          release - Release or version number of the data,
              e.g. Ensembl has numbered releases of all their data: Release 31
          generate_ids_if_needed - If field used for feature id is not there,
              generate ids (default behavior is raising an exception)
          genetic_code - Genetic code of organism. Overwrites determined GC from
              taxon object
          type - Reference, Representative or User upload

          return:
          genome_ref: return object reference
        '''

        logging.info(
            '--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_genbank_params = params
        import_genbank_params['file'] = file
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)
        """
        Update the workspace object related meta-data for staged file
        """
        #self.uploader_utils.update_staging_service(
        #    download_staging_file_params.get('staging_file_subdir_path'),
        #    returnVal['genome_ref'])
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'genome_name', 'workspace_name',
                'source'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_html_report(self, genome_ref, params):
        """
        generate_html_report: generate html summary report
        """
        logging.info('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        source = genome_info[10].get('Source')
        num_contigs = genome_info[10].get('Number contigs')
        size = genome_info[10].get('Size')
        gc_content = genome_info[10].get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(
            list(genome_data.get('feature_counts', {}).items()))

        genome_overview_data = collections.OrderedDict()

        genome_overview_data['Name'] = '{} ({})'.format(
            genome_name, genome_ref)
        #genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in genome_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        feature_content = str(
            [[str(k), v]
             for k, v in list(genome_data.get('feature_counts', {}).items())
             if k != 'gene'])
        contig_content = str(
            [[str(c), l]
             for c, l in zip(genome_data.get('contig_ids', []),
                             genome_data.get('contig_lengths', []))])
        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_genome.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '*FEATURE_DATA*', feature_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)
        result_file.close()

        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for imported Genome'
        })
        return html_report

    def generate_report(self, genome_ref, params):
        """
        :param genome_ref: reference to the Genome object created by GenomeFileUtil;
                           report warnings and summary data are pulled from this object
        :param params: dict containing workspace_name and staging_file_subdir_path
        :return: dict with report_name and report_ref
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{
            'ref': genome_ref,
            'description': 'Imported Genome'
        }]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_genome_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
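
Finally, a usage sketch for ImportGenbankUtil (not part of the original module; placeholder names). Because import_genbank_from_staging removes staging_file_subdir_path from the params dict it was given, the report params are built separately here.

# Usage sketch for ImportGenbankUtil: all names below are placeholders.
config = {'SDK_CALLBACK_URL': 'http://localhost:9999',
          'KB_AUTH_TOKEN': '<auth token>',
          'scratch': '/kb/module/work/tmp'}
genbank_importer = ImportGenbankUtil(config)

params = {
    'staging_file_subdir_path': 'subdir_1/example_genome.gbk',  # placeholder
    'genome_name': 'example_genome',
    'workspace_name': 'my_workspace',
    'source': 'RefSeq',
}
genome_result = genbank_importer.import_genbank_from_staging(params)

# Only workspace_name is needed for the report; the HTML summary is built
# from the saved Genome object itself.
report = genbank_importer.generate_report(genome_result['genome_ref'],
                                          {'workspace_name': 'my_workspace'})
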