Example #1
class MotifSaver:
    def __init__(self, callback, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def saveMotifSet(self, motifset, params):
        if isinstance(motifset, list):
            logging.info('Saving multiple motifset objects...')
            # TODO: accept lists of constructed motif set object
            # TODO: check if list is a save_objects list or list of motifsets process accordingly
            # TODO: accept list of object names
            # NOTE: currently only the first MotifSet in the list is saved
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset[0],
                    'name': str(uuid.uuid4())
                }]
            })[0]

            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        elif isinstance(motifset, dict):
            logging.info('Saving a single motifset object...')
            # TODO: accept object name
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset,
                    'name': str(uuid.uuid4())
                }]
            })[0]

            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        else:
            raise ValueError(
                'Input to motif saver should be either:\n'
                '1. a list of constructed KBaseGeneRegulation.MotifSet objects (dictionaries), or\n'
                '2. a single KBaseGeneRegulation.MotifSet object (dictionary)')
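
# Illustrative usage sketch: `callback_url`, `scratch_dir`, `motif_set`, and the
# workspace name 'my_workspace' are placeholder values, not names defined above.
#
#   saver = MotifSaver(callback_url, scratch_dir)
#   motif_set_ref = saver.saveMotifSet(motif_set, {'ws_name': 'my_workspace'})
#   # motif_set_ref is a "workspace_id/object_id/version" reference string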
Example #2
    def test_AssemblySet_input(self):

        # Initialize empty data dictionaries and get a DataFileUtil client
        dfu = DataFileUtil(self.callback_url)
        assembly_dict = dict()
        assembly_set_dict = dict()
        dfu_dict = dict()
        dfu_dict_2 = dict()
        # Get workspace id and name
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # FASTA to assembly object
        Fasta_assembly_dict = {
            "path": "/kb/module/work/tmp/NC_021490.fasta",
            "assembly_name": "test_assembly"
        }
        params = {
            "file": Fasta_assembly_dict,
            "workspace_name": wsName,
            "assembly_name": "test_assembly"
        }
        ref = self.getImpl().save_assembly_from_fasta(self.ctx, params)

        # Create assembly data dictionaries
        assembly_dict.update({"label": "assemblySetTest", "ref": ref[0]})
        assembly_set_dict.update({
            "description": " ",
            "items": [assembly_dict]
        })
        # Create DataFileUtil dictionaries
        dfu_dict.update({
            "type": "KBaseSets.AssemblySet",
            "data": assembly_set_dict,
            "name": "Assembly_Test"
        })
        dfu_dict_2.update({'id': ws_id, 'objects': [dfu_dict]})

        # Create assembly set object
        assembly_set_obj = dfu.save_objects(dfu_dict_2)
        assembly_set_ref = [
            str(assembly_set_obj[0][6]) + '/' + str(assembly_set_obj[0][0]) +
            '/' + str(assembly_set_obj[0][4])
        ]

        # Get FASTA
        ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
Example #3
    def UploadFromMdscan(self, callback_url, params):
        """
          :param params: instance of type "UploadmfmdInParams" -> structure:
             parameter "path" of String, parameter "ws_name" of String,
             parameter "obj_name" of String
          :returns: instance of type "UploadOutput" -> structure: parameter
             "obj_ref" of String
          """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMdscan
        print('Extracting motifs')
        motifList = self.parse_mdscan_output(params['path'])
        print(motifList)

        MSO = motifList

        dfu = DataFileUtil(callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #exit("test")
        #END UploadFromMdscan

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMdscan return value ' +
                             'output is not type dict as required.')

        # return the results
        return [output]
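
    # Illustrative usage sketch: the path, workspace, and object names below are
    # placeholders, and `impl` stands for an instance of the class this method
    # belongs to.
    #
    #   params = {'path': '/path/to/mdscan_output.txt',
    #             'ws_name': 'my_workspace',
    #             'obj_name': 'my_motif_set'}
    #   [result] = impl.UploadFromMdscan(callback_url, params)
    #   result['obj_ref']  # "workspace_id/object_id/version" reference to the saved MotifSet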
Example #4
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name,
                     pangenome_name):
    """
    params:
        cb_url         : callback url
        scratch        : folder path to Pangenome object 
        pangenome      : KBaseGenomes.Pangenome like object
        workspace_name : workspace name
        pangenome_name : Pangenome display name
    Returns:
        pangenome_ref: Pangenome workspace reference
        pangenome_info: info on pangenome object
    """
    dfu = DataFileUtil(cb_url)
    meta = {}
    hidden = 0

    # dump pangenome to scratch for upload
    # data_path = os.path.join(scratch, pangenome_name + '.json')
    # json.dump(pangenome, open(data_path, 'w'))

    if isinstance(workspace_name, int) or workspace_name.isdigit():
        workspace_id = workspace_name
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': meta,
            'hidden': hidden
        }]
    }

    info = dfu.save_objects(save_params)[0]

    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))

    return {'pangenome_ref': ref, 'pangenome_info': info}
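
# Illustrative usage sketch: the callback url, scratch path, pangenome dict, and
# names below are placeholders for illustration only.
#
#   result = upload_pangenome(cb_url, '/kb/module/work/tmp', pangenome_dict,
#                             'my_workspace', 'my_pangenome')
#   result['pangenome_ref']   # "workspace_id/object_id/version"
#   result['pangenome_info']  # workspace object info tuple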
Example #5
    def test_metagenome_binned_input(self):

        # Setup
        path = "data/binnedContigs.json"
        ws_path = '/kb/module/work/tmp'
        assembly_path = "data/CCESR16_SPAdes.assembly.fa"
        shutil.copy2(path, ws_path)
        shutil.copy2(assembly_path, ws_path)
        dfu = DataFileUtil(self.callback_url)
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # FASTA to assembly object
        Fasta_assembly_dict = {
            "path": '/kb/module/work/tmp/CCESR16_SPAdes.assembly.fa',
            "assembly_name": "meta_assembly"
        }
        assembly_params = {
            "file": Fasta_assembly_dict,
            "workspace_name": wsName,
            "assembly_name": "test_assembly"
        }
        meta_assembly_ref = self.getImpl().save_assembly_from_fasta(
            self.ctx, assembly_params)[0]

        # Load the binned-contigs JSON, link it to the uploaded assembly, and build the save_objects input
        meta_data = json.load(open(path))
        meta_data['assembly_ref'] = meta_assembly_ref
        meta_dict = [{
            'name': 'Meta_test',
            'type': 'KBaseMetagenomes.BinnedContigs',
            'data': meta_data
        }]

        # Create BinnedContigs object in workspace with save_objects
        binned_obj = dfu.save_objects({'id': ws_id, 'objects': meta_dict})

        binned_obj_info = binned_obj[0]
        binned_obj_ref = str(binned_obj_info[6]) + '/' + str(
            binned_obj_info[0]) + '/' + str(binned_obj_info[4])

        # Get FASTA
        ret = self.getImpl().get_fastas(self.callback_url, [binned_obj_ref])
Example #6
class GenomeInterface:
    def __init__(self, config):
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL
        self.re_api_url = config.re_api_url
        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.taxon_wsname = config.raw['taxon-workspace-name']
        self.scratch = config.raw['scratch']
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    @staticmethod
    def _validate_save_one_genome_params(params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """
        logging.info('start validating save_one_genome params')
        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        logging.info('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                # this means shock is down or not responding.
                logging.error(
                    "Couldn't parse response error content from Shock: " +
                    response.content)
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that handle_property point to shock nodes owned by calling user
        """

        logging.info(
            'start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                logging.info('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        logging.info('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def get_one_genome(self, params):
        """Fetch a genome using WSLargeDataIO and return it as a python dict"""
        logging.info('fetching genome object')

        res = self.ws_large_data.get_objects(params)['data'][0]
        data = json.load(open(res['data_json_file']))
        return data, res['info']
        # return self.dfu.get_objects(params)['data'][0]

    def save_one_genome(self, params):
        logging.info('start saving genome object')
        self._validate_save_one_genome_params(params)
        workspace = params['workspace']
        name = params['name']
        data = params['data']
        # XXX there is no `workspace_datatype` param in the spec
        ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome")
        # XXX there is no `meta` param in the spec
        meta = params.get('meta', {})
        if "AnnotatedMetagenomeAssembly" in ws_datatype:
            if params.get('upgrade') or 'feature_counts' not in data:
                data = self._update_metagenome(data)
        else:
            if params.get('upgrade') or 'feature_counts' not in data:
                data = self._update_genome(data)

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')
        if "AnnotatedMetagenomeAssembly" not in ws_datatype:
            self._check_dna_sequence_in_features(data)
            data['warnings'] = self.validate_genome(data)

        # sort data
        data = GenomeUtils.sort_dict(data)
        # dump genome to scratch for upload
        data_path = os.path.join(self.scratch, name + ".json")
        json.dump(data, open(data_path, 'w'))
        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        save_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': ws_datatype,
                'data_json_file': data_path,
                'name': name,
                'meta': meta,
                'hidden': hidden
            }]
        }
        dfu_oi = self.ws_large_data.save_objects(save_params)[0]
        returnVal = {'info': dfu_oi, 'warnings': data.get('warnings', [])}
        return returnVal
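
    # Illustrative usage sketch: values are placeholders; the optional keys shown
    # ('workspace_datatype', 'meta', 'hidden', 'upgrade') are the ones this
    # method reads from params.
    #
    #   gi = GenomeInterface(config)
    #   result = gi.save_one_genome({'workspace': 'my_workspace',
    #                                'name': 'my_genome',
    #                                'data': genome_dict,
    #                                'hidden': 'true'})
    #   result['info'], result['warnings']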

    @staticmethod
    def determine_tier(source):
        """
        Given a user provided source parameter, assign a source and genome tier
        """
        low_source = source.lower()
        if 'refseq' in low_source:
            if 'reference' in low_source:
                return "RefSeq", ['Reference', 'Representative', 'ExternalDB']
            if 'representative' in low_source:
                return "RefSeq", ['Representative', 'ExternalDB']
            if 'user' in low_source:
                return "RefSeq", ['ExternalDB', 'User']
            return "RefSeq", ['ExternalDB']
        if 'phytozome' in low_source:
            if 'flagship' in source:
                return "Phytozome", [
                    'Reference', 'Representative', 'ExternalDB'
                ]
            return "Phytozome", ['Representative', 'ExternalDB']
        if 'ensembl' in low_source:
            if 'user' in low_source:
                return "Ensembl", ['ExternalDB', 'User']
            return "Ensembl", ['Representative', 'ExternalDB']
        return source, ['User']
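    # e.g. determine_tier("RefSeq Reference")  -> ("RefSeq", ['Reference', 'Representative', 'ExternalDB'])
    #      determine_tier("Ensembl user")      -> ("Ensembl", ['ExternalDB', 'User'])
    #      determine_tier("my own annotation") -> ("my own annotation", ['User'])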

    def _update_metagenome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'
        return genome

    def _update_genome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        # do top level updates
        ontologies_present = defaultdict(dict)  # type: dict
        ontologies_present.update(genome.get('ontologies_present', {}))
        ontology_events = genome.get('ontology_events', [])
        # NOTE: 'genome_tiers' not in Metagenome spec
        if 'genome_tiers' not in genome:
            genome['source'], genome['genome_tiers'] = self.determine_tier(
                genome['source'])
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'

        # If an NCBI taxonomy ID is provided, fetch additional data about the taxon
        # NOTE: Metagenome object does not have a 'taxon_assignments' field
        if 'taxon_assignments' in genome and genome['taxon_assignments'].get(
                'ncbi'):
            tax_id = int(genome['taxon_assignments']['ncbi'])
            GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome)
        else:
            GenomeUtils.set_default_taxon_data(genome)

        if any([
                x not in genome
                for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')
        ]):
            if 'assembly_ref' in genome:
                assembly_data = self.dfu.get_objects({
                    'object_refs': [genome['assembly_ref']],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = assembly_data['gc_content']
                genome["dna_size"] = assembly_data['dna_size']
                genome["md5"] = assembly_data['md5']
                genome["num_contigs"] = assembly_data['num_contigs']

                if assembly_data.get('type'):
                    genome['genome_type'] = assembly_data['type']

            elif 'contigset_ref' in genome:
                contig_data = self.dfu.get_objects({
                    'object_refs': [genome['contigset_ref']],
                    'included': ['contigs/[*]/length', 'md5'],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = None
                genome["dna_size"] = sum(
                    (c['length'] for c in contig_data['contigs']))
                genome["md5"] = contig_data['md5']
                genome["num_contigs"] = len(contig_data['contigs'])

        # NOTE: metagenomes do not have the following fields
        if 'cdss' not in genome:
            genome['cdss'] = []
        if 'mrnas' not in genome:
            genome['mrnas'] = []
        if 'non_coding_features' not in genome:
            genome['non_coding_features'] = []

        # do feature level updates
        retained_features = []
        type_counts = defaultdict(int)
        for field in ('mrnas', 'cdss', 'features'):
            for i, feat in enumerate(genome.get(field, [])):
                if 'function' in feat and not isinstance(feat, list):
                    feat['functions'] = feat['function'].split('; ')
                    del feat['function']
                if 'aliases' in feat:
                    if not feat['aliases']:
                        del feat['aliases']
                    elif not isinstance(feat['aliases'][0], (list, tuple)):
                        feat['aliases'] = [['gene_synonym', x]
                                           for x in feat['aliases']]
                if 'type' in feat:
                    type_counts[feat['type']] += 1
                for ontology, terms in feat.get('ontology_terms', {}).items():
                    for term in terms.values():
                        if isinstance(term, list):
                            continue
                        ontologies_present[ontology][
                            term['id']] = term['term_name']
                        term_evidence = []
                        for ev in term['evidence']:
                            ev['id'] = ontology
                            if "ontology_ref" in term:
                                ev['ontology_ref'] = term["ontology_ref"]
                            if ev not in ontology_events:
                                ontology_events.append(ev)
                            term_evidence.append(ontology_events.index(ev))
                        feat['ontology_terms'][ontology][
                            term['id']] = term_evidence

                # remove deprecated fields
                feat.pop('protein_families', None)
                feat.pop('atomic_regulons', None)
                feat.pop('orthologs', None)
                feat.pop('coexpressed_fids', None)
                feat.pop('publications', None)
                feat.pop('regulon_data', None)
                feat.pop('subsystem_data', None)

                if 'dna_sequence_length' not in feat:
                    feat['dna_sequence_length'] = sum(
                        x[3] for x in feat['location'])

                if 'protein_translation' in feat and 'protein_md5' not in feat:
                    feat['protein_md5'] = hashlib.md5(
                        feat.get('protein_translation',
                                 '').encode('utf8')).hexdigest()

                # split all the stuff lumped together in old versions into the
                # right arrays
                if field == 'features':
                    if feat.get('type', 'gene') == 'gene':
                        if not feat.get('cdss', []):
                            type_counts['non_coding_genes'] += 1
                            genome['non_coding_features'].append(feat)
                        else:
                            retained_features.append(feat)
                    elif feat.get('type', 'gene') == 'CDS':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['cdss'].append(feat)
                    elif feat.get('type', 'gene') == 'mRNA':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['mrnas'].append(feat)

        genome['features'] = retained_features
        if ontology_events:
            genome['ontology_events'] = ontology_events
        if ontologies_present:
            genome['ontologies_present'] = ontologies_present

        type_counts['mRNA'] = len(genome.get('mrnas', []))
        type_counts['CDS'] = len(genome.get('cdss', []))
        type_counts['protein_encoding_gene'] = len(genome['features'])
        type_counts['non_coding_features'] = len(
            genome.get('non_coding_features', []))
        genome['feature_counts'] = type_counts
        return genome

    @staticmethod
    def validate_genome(g):
        """
        Run a series of checks on the genome object and return any warnings
        """

        allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'}

        logging.info('Validating genome object contents')
        warnings = g.get('warnings', [])

        # TODO: Determine whether these checks make any sense for Metagenome
        #       object. Looks like many don't.
        #       Add validations for Metagenome object

        # this will fire for some annotation methods like PROKKA
        if g.get('domain') == "Bacteria" and len(g.get('cdss', [])) != len(
                g['features']):
            warnings.append(
                "For prokaryotes, CDS array should generally be the"
                " same length as the Features array.")

        if g.get('domain') == "Eukaryota" and len(g.get(
                'features', [])) == len(g.get('cdss', [])):
            warnings.append(
                "For Eukaryotes, CDS array should not be the same "
                "length as the Features array due to RNA splicing.")

        if g.get('molecule_type') not in {"DNA", 'ds-DNA'}:
            if g.get('domain', '') not in {'Virus', 'Viroid'} and \
                            g['molecule_type'] not in {"DNA", 'ds-DNA'}:
                warnings.append("Genome molecule_type {} is not expected "
                                "for domain {}.".format(
                                    g['molecule_type'], g.get('domain', '')))

        if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers:
            warnings.append("Undefined terms in genome_tiers: " +
                            ", ".join(set(g['genome_tiers']) - allowed_tiers))
        assignments = g.get('taxon_assignments', {})
        if 'ncbi' not in assignments or ('taxon_ref' in g and g['taxon_ref']
                                         == "ReferenceTaxons/unknown_taxon"):
            warnings.append('Unable to determine organism taxonomy')

        GenomeInterface.handle_large_genomes(g)
        return warnings

    @staticmethod
    def handle_large_genomes(g):
        """Determines the size of various feature arrays and starts removing the dna_sequence if
        the genome is getting too big to store in the workspace"""
        def _get_size(obj):
            return sys.getsizeof(json.dumps(obj))

        # seems pretty unnecessary...
        def sizeof_fmt(num):
            for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
                if abs(num) < 1024.0:
                    return "%3.1f %sB" % (num, unit)
                num /= 1024.0
            return "%.1f %sB" % (num, 'Yi')

        feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss')
        master_key_sizes = dict()
        # Set want_full_breakdown to True to see a breakdown of the sizes.
        # Keeping it off as a flag makes standard uploads run faster.
        want_full_breakdown = False
        for x in feature_lists:
            if x in g:
                need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE
                if need_to_remove_dna_sequence or want_full_breakdown:
                    feature_type_dict_keys = dict()
                    for feature in g[x]:
                        for feature_key in list(feature.keys()):
                            if feature_key == "dna_sequence" and need_to_remove_dna_sequence:
                                # NOTE: should this get stored somewhere?
                                del (feature["dna_sequence"])
                            else:
                                if feature_key not in feature_type_dict_keys:
                                    feature_type_dict_keys[feature_key] = 0
                                feature_type_dict_keys[
                                    feature_key] += sys.getsizeof(
                                        feature[feature_key])
                    for feature_key in feature_type_dict_keys:
                        feature_type_dict_keys[feature_key] = sizeof_fmt(
                            feature_type_dict_keys[feature_key])
                    master_key_sizes[x] = feature_type_dict_keys
                print(f"{x}: {sizeof_fmt(_get_size(g[x]))}")
        total_size = _get_size(g)
        print(f"Total size {sizeof_fmt(total_size)} ")
        if want_full_breakdown:
            print(
                f"Here is the breakdown of the sizes of the feature list elements: "
                f"{str(master_key_sizes)}")
        if total_size > MAX_GENOME_SIZE:
            print(
                f"Here is the breakdown of the sizes of the feature list elements: "
                f"{str(master_key_sizes)}")
            raise ValueError(
                f"This genome size of {sizeof_fmt(total_size)} exceeds the maximum "
                f"permitted size of {sizeof_fmt(MAX_GENOME_SIZE)}.\n"
                f"Here is the breakdown for feature lists and their respective "
                f"sizes:\n{master_key_sizes}")
Example #7
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code to match this style
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']

        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        pass
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to KBase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref

        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        # 2)  Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf(
            VCFUtilsParams)

        if vcf_index is not None:
            logging.info("vcf compressed :" + str(vcf_compressed))
            logging.info("vcf index :" + str(vcf_index))
            logging.info("vcf strain ids :" + str(vcf_strain_ids))
        else:
            raise ValueError(
                "No result obtained after compression and indexing step")

        # Get strain info
        # TODO: Remove hard coded stuff
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step genomic_indexes will be
        # added to this json before it is saved as Variation object

        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref

        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)
        # Append sample information
        if sample_attribute_ref:
            variation_object_data[
                'sample_attribute_ref'] = sample_attribute_ref
        else:
            raise ValueError(f'sample attribute ref not found')
        if strains:
            variation_object_data['strains'] = strains
        else:
            raise ValueError(f'strains not found')
        if 'sample_set_ref' in params:
            variation_object_data['sample_set_ref'] = params['sample_set_ref']
        else:
            raise ValueError(f'sample_set_ref not found in params')

        # 4)
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id":
            variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref

        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information needed to save
        # the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        #  TODO: Take out the vcf_handle stuff not needed

        variation_object_data['genomic_indexes'] = jbrowse_report[
            'genomic_indexes']

        var_obj = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(
            var_obj[4])
        print(var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        #
        workspace = params['workspace_name']
        created_objects = []
        created_objects.append({
            "ref": var_obj_ref,
            "description": "Variation Object"
        })
        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)

        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]
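
    # Illustrative usage sketch: values are placeholders. Note that in addition
    # to the parameters listed in the docstring, the code above also reads
    # 'sample_set_ref' and 'sample_attribute_name' from params.
    #
    #   params = {'workspace_name': 'my_workspace',
    #             'genome_or_assembly_ref': '1/2/3',
    #             'vcf_staging_file_path': 'my_variants.vcf.gz',
    #             'variation_object_name': 'my_variation',
    #             'sample_set_ref': '4/5/6',
    #             'sample_attribute_name': 'my_sample_attributes'}
    #   [report] = util.save_variation_from_vcf(ctx, params)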

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf

        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)

        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
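
    # Illustrative usage sketch: '7/8/9' is a placeholder Variation object
    # reference; the returned dict carries a 'shock_id' per the docstring above.
    #
    #   [result] = util.export_variation_as_vcf(ctx, {'input_var_ref': '7/8/9'})
    #   result['shock_id']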

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a Variant Call Format (VCF)
        file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)

        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]
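
    # Illustrative usage sketch: values are placeholders.
    #
    #   [result] = util.get_variation_as_vcf(ctx, {'variation_ref': '7/8/9',
    #                                               'filename': 'my_variants.vcf'})
    #   result['path'], result['variation_name']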

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #8
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.scratch = scratch
        self.callback_url = callback_url
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        version = float(reader.metadata['fileformat'][4:6])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end-record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        with (gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())
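            # e.g. "##fileformat=VCFv4.2" gives tokens[1] == "VCFv4.2" and vcf_version == 4.2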

            return vcf_version

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up directories for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is used only to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version not present in the file, the fileformat line is malformed, '
                             'or the version is below 4.0. The fileformat line must be the first line '
                             'of the VCF file and use proper syntax. Check the VCF file specifications: '
                             'https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                        f.close()
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0")
                        f.close()

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # if the VCF file is < v4.1 and valid, vcftools produces no '[info]' lines,
            # so indexing validator_output above raises an IndexError
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                    f.close()
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0")
                    f.close()

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path 
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating sample attribute mapping file.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()

                for line in lines:
                    if line.startswith("#CHROM"):
                        header = line.lstrip().split("\t")

                        try:
                            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                                attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                                for i in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[i])
                                # attribute_mapping_handle.write("\n")

                                attribute_mapping_handle.write("label\t\t\t")
                                for j in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[j])
                                # attribute_mapping_handle.write("\n")
                        except IOError:
                            print("Could not write to file:", sample_attribute_mapping_file)

        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf


        if ('genome_ref' in params):
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if ('assembly_ref' in params):
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids =  self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []

        contig_infos = self.vcf_info['contigs']


        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs
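
    # An illustrative element of the returned list (placeholder values),
    # matching the contig_info spec above:
    #
    #   {'contig_id': 'Chr01', 'totalvariants': 1024,
    #    'passvariants': 1000, 'length': 4641652}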
   

    def _bgzip_vcf(self, vcf_filepath):

        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]
        
        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()        
        
        bgzip_file_path = vcf_filepath + ".gz"
        print (bgzip_file_path)
          
        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):
 
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]       
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()
         
        index_file_path = bgzip_filepath + ".tbi"
     
        return index_file_path
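
    # For reference, the two helpers above are thin wrappers around the same
    # command-line calls one would run by hand (paths are placeholders):
    #
    #   bgzip /path/to/variants.vcf             # -> /path/to/variants.vcf.gz
    #   tabix -p vcf /path/to/variants.vcf.gz   # -> /path/to/variants.vcf.gz.tbi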

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param contigs_info: previously constructed contig variation data
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] = params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object is blank, cannot save to workspace!')
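
    # Note on the return value: save_objects() hands back the workspace object_info
    # tuple documented above, so a caller that needs a "wsid/objid/version" style
    # reference (as other savers in this codebase build it) could do, roughly:
    #
    #   var_obj_info = self._save_var_obj(params, var)
    #   var_ref = '{}/{}/{}'.format(var_obj_info[6], var_obj_info[0], var_obj_info[4])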

    def _validate_sample_attribute_ref(self, params):

        #params["sample_attribute_ref"] = ''  #just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")   #hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]


class FastaToAssembly:

    def __init__(self, callback_url, scratch, ws_url):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs, {assembly_data["dna_size"]} bp')
        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        with open(os.path.join(self.scratch, "example.json"), 'w') as json_out:
            json.dump(assembly_object_to_save, json_out)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        fasta_file_handle_info['handle'] = fasta_file_handle_info['handle']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            info = self.ws.get_object_info3({'objects':[{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {min_contig_length} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})
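
    # Sketch of how the FileToShockOutput above gets consumed (see
    # build_assembly_object): only the embedded handle is needed. The path and
    # variable names below are placeholders.
    #
    #   handle_info = self.save_fasta_file_to_shock('/path/to/contigs.fa')
    #   hid = handle_info['handle']['hid']   # stored on the assembly as 'fasta_handle_ref'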

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application '
                                 'specified a file (' + params['file']['path'] + ') that is missing. '
                                 'Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')


    @staticmethod
    def validate_params(params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
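
    # A minimal parameter dict that passes validate_params above might look like
    # this (names and the path are hypothetical; exactly one of 'file', 'shock_id'
    # or 'ftp_url' may be set):
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'assembly_name': 'my_assembly',
    #       'file': {'path': '/kb/module/work/tmp/contigs.fasta'},
    #   }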
Example #10
0
class MatrixUtil:
    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        logging.info('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs = {k: v for k, v in params.items() if "_ref" in k}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs, scale)

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)

    @staticmethod
    def _write_mapping_sheet(file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        report_params = {
            'message':
            '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _process_mapping_sheet(file_path, sheet_name):
        """
        _process_mapping: process mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str')
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping
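
    # Rough illustration of _process_mapping_sheet (contents invented): the data
    # rows of a two-column sheet collapse into a {first_col: second_col} dict.
    #
    #   rows after the header: [['color', 'red'], ['shape', 'round']]
    #   returned mapping:      {'color': 'red', 'shape': 'round'}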

    def _process_attribute_mapping_sheet(self, file_path, sheet_name,
                                         matrix_name, workspace_id):
        """
        _process_attribute_mapping_sheet: process attribute_mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = f'{matrix_name}_{sheet_name}'
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_attribute_mapping_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.attr_util.file_to_attribute_mapping(
                import_attribute_mapping_params)

            return ref.get('attribute_mapping_ref')

    @staticmethod
    def _file_to_df(file_path):
        logging.info('start parsing file content to data frame')

        try:
            df = pd.read_excel(file_path, sheet_name='data', index_col=0)

        except XLRDError:
            try:
                df = pd.read_excel(file_path, index_col=0)
                logging.warning(
                    'WARNING: A sheet named "data" was not found in the attached file,'
                    ' proceeding with the first sheet as the data sheet.')

            except XLRDError:

                try:
                    # let pandas infer the delimiter itself (sep=None requires the python engine)
                    df = pd.read_csv(file_path, sep=None, engine='python', index_col=0)
                except Exception:
                    raise ValueError(
                        'Cannot parse file. Please provide a valid tsv, excel or csv file'
                    )

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        # fill NA with "None" so that they are properly represented as nulls in the KBase Object
        df = df.where((pd.notnull(df)), None)

        return df

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        logging.info('Start reading and converting excel file data')
        data = refs

        df = self._file_to_df(file_path)

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        data.update({'data': matrix_data})
        data.update(
            self._get_axis_attributes('col', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))
        data.update(
            self._get_axis_attributes('row', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data['attributes'] = {}
        data['search_attributes'] = []
        for k, v in metadata.items():
            k = k.strip()
            v = v.strip()
            if k in TYPE_ATTRIBUTES:
                data[k] = v
            else:
                data['attributes'][k] = v
                data['search_attributes'].append(" | ".join((k, v)))

        return data

    def _get_axis_attributes(self, axis, matrix_data, refs, file_path,
                             matrix_name, workspace_id):
        """Get the row/col_attributemapping and mapping of ids, validating as needed"""
        # Parameter specified mappings should take precedence over tabs in excel so only process
        # if attributemapping_ref is missing:
        attr_data = {}

        if refs.get(f'{axis}_attributemapping_ref'):
            attributemapping_ref = refs[f'{axis}_attributemapping_ref']
        else:
            attributemapping_ref = self._process_attribute_mapping_sheet(
                file_path, f'{axis}_attribute_mapping', matrix_name,
                workspace_id)

        if attributemapping_ref:
            attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref

        # col/row_mappings may not be supplied
        id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping')
        if id_mapping:
            attr_data[f'{axis}_mapping'] = id_mapping
        # if no mapping, axis ids must match the attribute mapping
        elif attributemapping_ref:
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            axis_ids = matrix_data[f'{axis}_ids']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                # just gen the IDs in this matrix
                attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return attr_data

    @staticmethod
    def _build_header_str(attribute_names):  #not going to be used

        header_str = ''
        width = 100.0 / len(attribute_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for attribute_name in attribute_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(attribute_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, attributemapping_data,
                        row_ids):  #not going to be used

        logging.info('Start building html replacement')

        attribute_names = [
            attributes.get('attribute')
            for attributes in attributemapping_data.get('attributes')
        ]

        header_str = self._build_header_str(attribute_names)

        table_str = ''

        instances = attributemapping_data.get('instances')

        for feature_id, attribute_id in row_mapping.items():
            if feature_id in row_ids:
                feature_instances = instances.get(attribute_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_instance in feature_instances:
                    table_str += '<td>{}</td>'.format(feature_instance)
                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str,
                                     table_str):  #generate search html report

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'kbase_icon.png'), output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'search_icon.png'), output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        logging.info('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name':
            'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _filter_value_data(value_data, remove_ids, dimension):
        """Filters a value matrix based on column or row ids"""
        def _norm_id(_id):
            return _id.replace(" ", "_")

        val_df = pd.DataFrame(value_data['values'],
                              index=value_data['row_ids'],
                              columns=value_data['col_ids'],
                              dtype='object')

        if dimension == 'row':
            filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=0,
                                           errors='ignore')
        elif dimension == 'col':
            filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=1,
                                           errors='ignore')
        else:
            raise ValueError('Unexpected dimension: {}'.format(dimension))

        filtered_value_data = {
            "values": filtered_df.values.tolist(),
            "col_ids": list(filtered_df.columns),
            "row_ids": list(filtered_df.index),
        }

        return filtered_value_data
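
    # Tiny worked example of _filter_value_data (values are made up): dropping row
    # 'r2' from a 2x2 matrix removes that row everywhere and keeps col_ids intact.
    #
    #   value_data = {'row_ids': ['r1', 'r2'], 'col_ids': ['c1', 'c2'],
    #                 'values': [[1, 2], [3, 4]]}
    #   _filter_value_data(value_data, ['r2'], 'row')
    #   # -> {'values': [[1, 2]], 'col_ids': ['c1', 'c2'], 'row_ids': ['r1']}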

    def _standardize_df(self, df, with_mean=True, with_std=True):

        logging.info("Standardizing matrix data")

        df.fillna(0, inplace=True)

        x_train = df.values

        scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                              with_std=with_std).fit(x_train)

        standardized_values = scaler.transform(x_train)

        standardize_df = pd.DataFrame(index=df.index,
                                      columns=df.columns,
                                      data=standardized_values)

        return standardize_df
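
    # What StandardScaler does above, per column j (assuming both flags stay on):
    #   z[i][j] = (x[i][j] - mean_j) / std_j
    # i.e. each column is centered to mean 0 and scaled to unit variance; NaNs were
    # already replaced with 0 before fitting.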

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]

    def standardize_matrix(self, params):
        """
        standardize a matrix
        """

        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        new_matrix_name = params.get('new_matrix_name')
        with_mean = params.get('with_mean', 1)
        with_std = params.get('with_std', 1)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [input_matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        input_matrix_name = input_matrix_info[1]
        input_matrix_data = input_matrix_obj['data']

        if not new_matrix_name:
            current_time = time.localtime()
            new_matrix_name = input_matrix_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_matrix_ref
        }).get('data_matrix')
        df = pd.read_json(data_matrix)

        standardize_df = self._standardize_df(df, with_mean, with_std)

        new_matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': standardize_df.values.tolist()
        }

        input_matrix_data['data'] = new_matrix_data

        logging.info("Saving new standardized matrix object")
        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": input_matrix_info[2],
                "data": input_matrix_data,
                "name": new_matrix_name
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_matrix_obj_ref,
            'description': 'Standardized Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def filter_matrix(self, params):  #not going to be used
        """
        filter_matrix: create sub-matrix based on input feature_ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        feature_ids: string of feature ids that result matrix contains
        filtered_matrix_name: name of newly created filtered matrix object
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        remove_ids = params.get('remove_ids')
        dimension = params.get('dimension')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        remove_ids = [x.strip() for x in remove_ids.split(',')]
        filtered_value_data = self._filter_value_data(value_data, remove_ids,
                                                      dimension)

        # if the matrix has changed shape, update the mappings
        if len(filtered_value_data['row_ids']) < len(
                matrix_data['data']['row_ids']):
            if matrix_data.get('row_mapping'):
                matrix_data['row_mapping'] = {
                    k: matrix_data['row_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }
            if matrix_data.get('feature_mapping'):
                matrix_data['feature_mapping'] = {
                    k: matrix_data['feature_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }

        if len(filtered_value_data['col_ids']) < len(
                matrix_data['data']['col_ids']):
            if matrix_data.get('col_mapping'):
                matrix_data['col_mapping'] = {
                    k: matrix_data['col_mapping'][k]
                    for k in filtered_value_data['col_ids']
                }
        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(matrix_type),
            'obj_name':
            filtered_matrix_name,
            'data':
            matrix_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def search_matrix(self, params):  #not going to be used
        """
        search_matrix: generate a HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_attributemapping_ref = matrix_data.get('row_attributemapping_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_attributemapping_ref):
            raise ValueError(
                'Matrix object is missing either row_mapping or row_attributemapping_ref'
            )

        attributemapping_data = self.dfu.get_objects(
            {"object_refs": [row_attributemapping_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     attributemapping_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (obj_type, file_path, workspace_name, matrix_name, refs,
         scale) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)
        data['scale'] = scale
        if params.get('description'):
            data['description'] = params['description']

        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: select the generics data to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        and only data is needed
                        generics_module should be
                        {'data': 'FloatMatrix2D'}
        """
        logging.info('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.data_util.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'instance_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'instance_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            logging.warning('Missing key [data]')

        obj_data.update(obj_data.get('attributes', {}))  # flatten for printing
        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
Example #11
0
class AttributesUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """

        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]

        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id":
            output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):

        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping misses [{}] instances'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data
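    # A worked sketch of how _check_and_append_am_data merges two attribute
    # mappings (hypothetical minimal data, with `util` a hypothetical instance
    # of this class): attributes are concatenated, per-instance value lists are
    # extended, and any other top-level keys carry over from the original.
    #
    # >>> old = {'ontology_mapping_method': 'User Curation',
    # ...        'attributes': [{'attribute': 'height', 'source': 'upload'}],
    # ...        'instances': {'s1': ['1.8'], 's2': ['1.6']}}
    # >>> append = {'attributes': [{'attribute': 'weight', 'source': 'upload'}],
    # ...           'instances': {'s1': ['75'], 's2': ['60']}}
    # >>> merged = util._check_and_append_am_data(old, append)
    # >>> merged['instances']
    # {'s1': ['1.8', '75'], 's2': ['1.6', '60']}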

    def _am_data_to_df(self, data):
        """
        Converts attribute mapping object data to a dataframe
        """

        attributes = pd.DataFrame(data['attributes'])
        # rename returns a new frame, so assign it back to keep the prettified
        # column names
        attributes = attributes.rename(columns=lambda x: x.replace("ont", "ontology").
                                       capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find an 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        logging.info(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping
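    # A sketch of the tabular layout _df_to_am_obj expects from the uploaded
    # file (all values are hypothetical; S1/S2 are the instance columns):
    #
    #   Attribute | Attribute Ontology ID | Unit  | Unit Ontology ID | S1  | S2
    #   height    | <ontology id>         | meter | <ontology id>    | 1.8 | 1.6
    #   weight    |                       | kg    |                  | 75  | 60
    #
    # Columns matching "[Uu]nit|[Aa]ttribute" become the attribute metadata
    # records, and the remaining columns become per-instance value lists in
    # 'instances'.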

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Name', 'Assay Name' and {isa_df.columns[0]}"
            )
        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\nSee the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the Term Accession is a web link, only grab the last bit
                # Similarly, sometimes the number is prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        ":").unique()
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Test to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied attributes and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an attribute mapping or cluster set object to a TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an attribute mapping or cluster set object to an Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
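# A minimal sketch of driving the export helpers above, assuming the enclosing
# utility class has been instantiated as `attr_util` and that '1234/5/6' points
# to an AttributeMapping or ClusterSet object (both values are placeholders).
example_export_params = {
    'input_ref': '1234/5/6',
    'destination_dir': '/kb/module/work/tmp',
}
# obj_name, files = attr_util.to_tsv(example_export_params)    # writes <name>.tsv
# obj_name, files = attr_util.to_excel(example_export_params)  # writes <name>.xlsx
# shock = attr_util.export(files['file_path'], obj_name, example_export_params['input_ref'])
# shock['shock_id'] can then be returned to the export API caller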
Example #12
class PDBUtil:

    def _validate_import_pdb_file_params(self, params):
        """
        _validate_import_pdb_file_params:
            validates params passed to import_model_pdb_file method
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file(
                {'shock_id': params['input_shock_id'],
                 'file_path': self.scratch}).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file(
                        {'staging_file_subdir_path': params.get('input_staging_file_path')}
                        ).get('copy_file_path')
        else:
            error_msg = "Must supply either an input_shock_id, an input_file_path, "
            error_msg += "or an input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get('structure_name')

    def _file_to_data(self, file_path):
        """Extract chain/residue/atom counts and the protein sequence from a
        PDB file using Biopython"""
        # `parser` and `ppb` are module-level Bio.PDB helpers (PDBParser / PPBuilder)
        structure = parser.get_structure("test", file_path)
        model = structure[0]
        chain_no = 0
        res_no = 0
        atom_no = 0
        pp_list = []
        pp_no = 0

        # count chains across all models, then residues/atoms of the first model
        for mdl in structure:
            for chain in mdl:
                chain_no += 1
        for residue in model.get_residues():
            if PDB.is_aa(residue):
                res_no += 1
            for atom in residue.get_atoms():
                atom_no += 1

        # stitch the peptide fragments into a single protein sequence
        for pp in ppb.build_peptides(structure):
            pp_no += 1
            pp_list.append(str(pp.get_sequence()))
        seq = ''.join(pp_list)
        return {
            'name': os.path.basename(file_path),
            'num_chains': chain_no,
            'num_residues': res_no,
            'num_atoms': atom_no,
            'protein': {
                'id': os.path.basename(file_path),
                'sequence': seq,
                'md5': hashlib.md5(seq.encode()).hexdigest()
            },
        }
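    # The module-level helpers used above are assumed to be Biopython objects,
    # e.g. created once near the imports (a sketch, not shown in this snippet):
    #
    #   from Bio import PDB
    #   parser = PDB.PDBParser(QUIET=True)   # QUIET silences parse warnings
    #   ppb = PDB.PPBuilder()                # builds polypeptides for sequences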

    def _get_pdb_shock_id(self, obj_ref):
        """Return the shock id for the PDB file"""
        obj_data = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]['data']
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        # note: despite the variable name this is the *handle* id, which is what
        # gets stored in the object's 'pdb_handle' field
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['handle']['hid']

        return shock_id

    def _generate_html_report(self, header_str, table_str):
        #TODO: make this work with the PDB viewer

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('//HEADER_STR', header_str)
                report_template = report_template.replace('//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Search Matrix App'})

        return html_report

    def _generate_report(self, pdb_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """
        # included as an example. Replace with your own implementation
        # output_html_files = self._generate_html_report(header_str, table_str)

        report_params = {'message': 'You uploaded a PDB file!',
                         #'html_links': output_html_files,
                         #'direct_html_link_index': 0,
                         'objects_created': [{'ref': pdb_obj_ref,
                                              'description': 'Imported PDB'}],
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_pdb_from_staging_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])

    def import_model_pdb_file(self, params):

        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path)
        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        logging.info(data)

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [
                {'type': 'KBaseStructure.ModelProteinStructure',
                 'name': pdb_name,
                 'data': data}]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'structure_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_pdb(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def structure_to_pdb_file(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}
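# A minimal usage sketch for PDBUtil above; the config and file path are
# placeholders that would normally come from the SDK runtime.
example_import_params = {
    'input_file_path': '/kb/module/work/tmp/model.pdb',  # or input_shock_id / input_staging_file_path
    'structure_name': 'my_model_structure',
    'workspace_name': 'my_workspace',
    'description': 'imported for testing',
}
# pdb_util = PDBUtil(config)  # config needs SDK_CALLBACK_URL, scratch, KB_AUTH_TOKEN, handle-service-url
# result = pdb_util.import_model_pdb_file(example_import_params)
# result keys: 'structure_obj_ref', 'report_name', 'report_ref'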
Example #13
class FeatureSetBuilder:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in [
                'diff_expression_ref', 'workspace_name', 'p_cutoff',
                'q_cutoff', 'fold_change_cutoff'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError(
                '"fold_scale_type" parameter must be set to "logarithm", if used'
            )

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list,
                         down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(
            up_feature_set_ref_list, down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{
                'ref': up_feature_set_ref,
                'description': 'Upper FeatureSet Object'
            }]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{
                'ref': down_feature_set_ref,
                'description': 'Lower FeatureSet Object'
            }]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{
                'ref':
                filtered_expression_matrix_ref,
                'description':
                'Filtered ExpressionMatrix Object'
            }]

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list,
                              down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': up_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': down_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<tr><td>Upper_FeatureSet</td></tr>',
                    upper_feature_content)

                report_template = report_template.replace(
                    '<tr><td>Lower_FeatureSet</td></tr>',
                    lower_feature_content)

                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report'
        })
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref,
                                 result_directory, condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory,
                                             diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2(
                {'objects': [{
                    'ref': diff_expression_ref
                }]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({
                            'gene_id': row_id,
                            'log2_fold_change': row_value[0],
                            'p_value': row_value[1],
                            'q_value': row_value[2]
                        })

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name,
                              feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {
            'description': 'Generated FeatureSet from DifferentialExpression',
            'element_ordering': feature_ids,
            'elements': elements
        }

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': feature_set_data,
                'name': feature_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value,
                             comp_q_value, comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition
                                            and q_value_condition
                                            and (float(row_fold_change_cutoff)
                                                 >= comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition
                                              and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
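    # A worked sketch of the cutoff logic above with hypothetical rows from
    # gene_results.csv and p_cutoff=0.05, q_cutoff=0.05, fold_change_cutoff=1:
    #
    #   gene_id  log2_fold_change  p_value  q_value   classified as
    #   g1        2.3              0.01     0.02      up   (fold change >=  1)
    #   g2       -1.7              0.03     0.04      down (fold change <= -1)
    #   g3        0.4              0.01     0.01      neither (|fold change| < 1)
    #   g4        3.0              0.20     0.30      neither (p/q above cutoff)
    #   g5        NA               0.01     0.01      skipped (null value)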

    def _filter_expression_matrix(self,
                                  expression_matrix_ref,
                                  feature_ids,
                                  workspace_name,
                                  filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix',
                        expression_matrix_name):
                filtered_expression_matrix_name = re.sub(
                    '_*[Ee]xpression_*[Mm]atrix',
                    filtered_expression_matrix_suffix, expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                                                  filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {
            'type': expression_matrix_info[2],
            'data': filtered_expression_matrix_data,
            'name': filtered_expression_matrix_name
        }
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data'][
                'diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [
                diff_expression_matrix_ref
            ]

        save_object_params = {'id': workspace_id, 'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(
            dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            try:
                label_string = condition_pair['label_string'][0].strip()
                label_list = [x.strip() for x in label_string.split(',')]
                first_label = label_list[0]
                second_label = label_list[1]
            except IndexError:
                raise IndexError('No selected values for Partial Condition')

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    first_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    second_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': len(ids),
            'structured_query': {
                "$or": [{
                    "feature_id": x
                } for x in ids]
            },
            'sort_by': [['feature_id', True]]
        })['features']

        features_ids = set(
            (feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets',
                      []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']})['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [
                    x for x in base_set['element_ordering']
                    if x not in new_feature_set['elements']
                ]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [
                            x for x in genome_refs
                            if x not in new_feature_set['elements'][element]
                        ]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set[
                    'description'] += 'From FeatureSet {}: {}\n'.format(
                        base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref,
                                                       new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError(
                    'Feature ID {} does not exist in the supplied genome {}'.
                    format(new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: optional; if supplied, must be "logarithm"
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": diff_expression_set_ref
            }]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs, available_condition_labels
         ) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs,
                                        available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [
                        x.strip() for x in label_string.split(',')
                    ]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref, result_directory,
                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'),
                params.get('q_cutoff'), params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get(
                'filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'),
                    up_feature_ids + down_feature_ids,
                    params.get('workspace_name'), "", diff_expr_matrix_ref,
                    filtered_em_name)
                filtered_expression_matrix_ref_list.append(
                    filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string),
                feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(
                up_feature_ids, genome_id, params.get('workspace_name'),
                up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string),
                feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(
                down_feature_ids, genome_id, params.get('workspace_name'),
                down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {
            'result_directory':
            result_directory,
            'up_feature_set_ref_list':
            up_feature_set_ref_list,
            'down_feature_set_ref_list':
            down_feature_set_ref_list,
            'filtered_expression_matrix_ref_list':
            filtered_expression_matrix_ref_list
        }

        report_output = self._generate_report(
            up_feature_set_ref_list, down_feature_set_ref_list,
            filtered_expression_matrix_ref_list, params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
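    # A minimal params sketch for upload_featureset_from_diff_expr, following
    # the docstring above; all references and names are placeholders.
    #
    #   params = {
    #       'diff_expression_ref': '1234/7/1',
    #       'expression_matrix_ref': '1234/8/1',
    #       'p_cutoff': 0.05,
    #       'q_cutoff': 0.05,
    #       'fold_change_cutoff': 1,
    #       'feature_set_suffix': '_feature_set',
    #       'filtered_expression_matrix_suffix': '_filtered_expression_matrix',
    #       'run_all_combinations': 1,   # or supply 'condition_pairs' instead
    #       'workspace_name': 'my_workspace',
    #   }
    #   result = feature_set_builder.upload_featureset_from_diff_expr(params)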

    def filter_matrix_with_fs(self, params):
        self.validate_params(
            params,
            ('feature_set_ref', 'workspace_name', 'expression_matrix_ref',
             'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]})['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids,
            params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{
            'ref': filtered_matrix_ref,
            'description': 'Filtered ExpressionMatrix Object'
        }]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}" \
            .format(len(feature_ids), feature_set_name)

        report_params = {
            'message': message,
            'workspace_name': params['workspace_name'],
            'objects_created': objects_created,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'filtered_expression_matrix_ref': filtered_matrix_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def build_feature_set(self, params):
        self.validate_params(params, {
            'output_feature_set',
            'workspace_name',
        }, {
            'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
            'description'
        })
        feature_sources = ('feature_ids', 'feature_ids_custom',
                           'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError(
                "You must supply at least one feature source: {}".format(
                    ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseCollections.FeatureSet',
                'data': new_feature_set,
                'name': params['output_feature_set']
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])

        objects_created = [{
            'ref': feature_set_obj_ref,
            'description': 'Feature Set'
        }]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {
            'message': message,
            'workspace_name': params['workspace_name'],
            'objects_created': objects_created,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'feature_set_ref': feature_set_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }
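# A minimal params sketch for build_feature_set above; the genome reference,
# feature ids and workspace name are placeholders.
example_build_params = {
    'workspace_name': 'my_workspace',
    'output_feature_set': 'my_feature_set',
    'genome': '1234/9/1',
    'feature_ids': ['gene_1', 'gene_2'],   # and/or 'feature_ids_custom': 'gene_3,gene_4'
    'description': 'hand-picked genes of interest',
}
# builder = FeatureSetBuilder(config)
# result = builder.build_feature_set(example_build_params)
# result keys: 'feature_set_ref', 'report_name', 'report_ref'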
Example #14
class GenomeInterface:
    def __init__(self, config):
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL

        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config.raw['search-url'])
        self.taxon_wsname = config.raw['taxon-workspace-name']
        self.scratch = config.raw['scratch']
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    @staticmethod
    def _validate_save_one_genome_params(params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """

        log('start validating save_one_genome params')

        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        log('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                # this means shock is down or not responding.
                log("Couldn't parse response error content from Shock: " +
                    str(response.content))
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that handle_property point to shock nodes owned by calling user
        """

        log('start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                log('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        log('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be a test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def get_one_genome(self, params):
        """Fetch a genome using WSLargeDataIO and return it as a python dict"""
        log('fetching genome object')

        res = self.ws_large_data.get_objects(params)['data'][0]
        with open(res['data_json_file']) as data_json_fh:
            data = json.load(data_json_fh)
        return data, res['info']
        #return self.dfu.get_objects(params)['data'][0]

    def save_one_genome(self, params):
        log('start saving genome object')

        self._validate_save_one_genome_params(params)

        workspace = params['workspace']
        name = params['name']
        data = params['data']
        if 'meta' in params and params['meta']:
            meta = params['meta']
        else:
            meta = {}
        if params.get('upgrade') or 'feature_counts' not in data:
            data = self._update_genome(data)

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')

        self._check_dna_sequence_in_features(data)
        data['warnings'] = self.validate_genome(data)

        # dump genome to scratch for upload
        data_path = os.path.join(self.scratch, name + ".json")
        with open(data_path, 'w') as data_json_fh:
            json.dump(data, data_json_fh)

        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        save_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data_json_file': data_path,
                'name': name,
                'meta': meta,
                'hidden': hidden
            }]
        }

        dfu_oi = self.ws_large_data.save_objects(save_params)[0]

        returnVal = {'info': dfu_oi, 'warnings': data['warnings']}

        return returnVal

    def old_retrieve_taxon(self, taxon_wsname, scientific_name):
        """
        old_retrieve_taxon: use SOLR to retrieve taxonomy and taxon_reference

        """
        default = ('Unconfirmed Organism: ' + scientific_name,
                   'ReferenceTaxons/unknown_taxon', 'Unknown', 11)
        solr_url = 'http://kbase.us/internal/solr-ci/search/'
        solr_core = 'taxonomy_ci'
        query = '/select?q=scientific_name:"{}"&fl=scientific_name%2Cscientific_lineage%2Ctaxonomy_id%2Cdomain%2Cgenetic_code&rows=5&wt=json'
        match = re.match(r"\S+\s?\S*", scientific_name)
        if not match:
            return default
        res = requests.get(solr_url + solr_core + query.format(match.group(0)))
        results = res.json()['response']['docs']
        if not results:
            return default
        taxonomy = results[0]['scientific_lineage']
        taxon_reference = '{}/{}_taxon'.format(taxon_wsname,
                                               results[0]['taxonomy_id'])
        domain = results[0]['domain']
        genetic_code = results[0]['genetic_code']

        return taxonomy, taxon_reference, domain, genetic_code

    def retrieve_taxon(self, taxon_wsname, scientific_name):
        """
        retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        default = ('Unconfirmed Organism: ' + scientific_name,
                   'ReferenceTaxons/unknown_taxon', 'Unknown', 11)

        def extract_values(search_obj):
            return (search_obj['data']['scientific_lineage'],
                    taxon_wsname + "/" + search_obj['object_name'],
                    search_obj['data']['domain'],
                    search_obj['data'].get('genetic_code', 11))

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }
        objects = self.kbse.search_objects(search_params)['objects']
        if len(objects):
            if len(objects) > 100000:
                raise RuntimeError(
                    "Too many matching taxons returned for {}. "
                    "Potential issue with searchAPI.".format(scientific_name))
            return extract_values(objects[0])
        search_params['match_filter']['lookup_in_keys'] = {
            "aliases": {
                "value": scientific_name
            }
        }
        objects = self.kbse.search_objects(search_params)['objects']
        if len(objects):
            return extract_values(objects[0])
        return default

    @staticmethod
    def determine_tier(source):
        """
        Given a user-provided source parameter, assign a source and genome tier
        """
        low_source = source.lower()
        if 'refseq' in low_source:
            if 'reference' in low_source:
                return "RefSeq", ['Reference', 'Representative', 'ExternalDB']
            if 'representative' in low_source:
                return "RefSeq", ['Representative', 'ExternalDB']
            if 'user' in low_source:
                return "RefSeq", ['ExternalDB', 'User']
            return "RefSeq", ['ExternalDB']
        if 'phytozome' in low_source:
            if 'flagship' in low_source:
                return "Phytozome", [
                    'Reference', 'Representative', 'ExternalDB'
                ]
            return "Phytozome", ['Representative', 'ExternalDB']
        if 'ensembl' in low_source:
            if 'user' in low_source:
                return "Ensembl", ['ExternalDB', 'User']
            return "Ensembl", ['Representative', 'ExternalDB']
        return source, ['User']

    def _update_genome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        # do top level updates
        ontologies_present = defaultdict(dict)
        ontologies_present.update(genome.get('ontologies_present', {}))
        ontology_events = genome.get('ontology_events', [])
        if 'genome_tier' not in genome:
            genome['source'], genome['genome_tiers'] = self.determine_tier(
                genome['source'])
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'
        if 'taxon_ref' not in genome:
            genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
            genome['genetic_code'] = self.retrieve_taxon(
                self.taxon_wsname, genome['scientific_name'])

        if any([
                x not in genome
                for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')
        ]):
            if 'assembly_ref' in genome:
                assembly_data = self.dfu.get_objects({
                    'object_refs': [genome['assembly_ref']],
                    'ignore_errors':
                    0
                })['data'][0]['data']
                genome["gc_content"] = assembly_data['gc_content']
                genome["dna_size"] = assembly_data['dna_size']
                genome["md5"] = assembly_data['md5']
                genome["num_contigs"] = assembly_data['num_contigs']
            elif 'contigset_ref' in genome:
                contig_data = self.dfu.get_objects({
                    'object_refs': [genome['contigset_ref']],
                    'included': ['contigs/[*]/length', 'md5'],
                    'ignore_errors':
                    0
                })['data'][0]['data']
                genome["gc_content"] = None
                genome["dna_size"] = sum(
                    (c['length'] for c in contig_data['contigs']))
                genome["md5"] = contig_data['md5']
                genome["num_contigs"] = len(contig_data['contigs'])

        if 'cdss' not in genome:
            genome['cdss'] = []
        if 'mrnas' not in genome:
            genome['mrnas'] = []
        if 'non_coding_features' not in genome:
            genome['non_coding_features'] = []

        # do feature level updates
        retained_features = []
        type_counts = defaultdict(int)
        for field in ('mrnas', 'cdss', 'features'):
            for i, feat in enumerate(genome.get(field, [])):
                if 'function' in feat and not isinstance(feat['function'], list):
                    feat['functions'] = feat['function'].split('; ')
                    del feat['function']
                if 'aliases' in feat:
                    if not feat['aliases']:
                        del feat['aliases']
                    elif not isinstance(feat['aliases'][0], (list, tuple)):
                        feat['aliases'] = [['gene_synonym', x]
                                           for x in feat['aliases']]
                if 'type' in feat:
                    type_counts[feat['type']] += 1
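                # Fold each term's per-evidence dicts into the genome-level
                # ontology_events list and replace the term value with a list
                # of indices into that list; term id -> name pairs are also
                # recorded in ontologies_present.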
                for ontology, terms in feat.get('ontology_terms', {}).items():
                    for term in terms.values():
                        if isinstance(term, list):
                            continue
                        ontologies_present[ontology][
                            term['id']] = term['term_name']
                        term_evidence = []
                        for ev in term['evidence']:
                            ev['id'] = ontology
                            ev['ontology_ref'] = term["ontology_ref"]
                            if ev not in ontology_events:
                                ontology_events.append(ev)
                            term_evidence.append(ontology_events.index(ev))
                        feat['ontology_terms'][ontology][
                            term['id']] = term_evidence

                # remove deprecated fields
                feat.pop('protein_families', None)
                feat.pop('atomic_regulons', None)
                feat.pop('orthologs', None)
                feat.pop('coexpressed_fids', None)
                feat.pop('publications', None)
                feat.pop('regulon_data', None)
                feat.pop('subsystem_data', None)

                if 'dna_sequence_length' not in feat:
                    feat['dna_sequence_length'] = sum(
                        x[3] for x in feat['location'])

                if 'protein_translation' in feat and 'protein_md5' not in feat:
                    feat['protein_md5'] = hashlib.md5(
                        feat.get('protein_translation',
                                 '').encode('utf8')).hexdigest()

                # split all the stuff lumped together in old versions into the
                # right arrays
                if field == 'features':
                    if feat.get('type', 'gene') == 'gene':
                        if not feat.get('cdss', []):
                            genome['non_coding_features'].append(feat)
                        else:
                            retained_features.append(feat)
                    elif feat.get('type', 'gene') == 'CDS':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['cdss'].append(feat)
                    elif feat.get('type', 'gene') == 'mRNA':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['mrnas'].append(feat)

        genome['features'] = retained_features
        if ontology_events:
            genome['ontology_events'] = ontology_events
        if ontologies_present:
            genome['ontologies_present'] = ontologies_present

        type_counts['mRNA'] = len(genome.get('mrnas', []))
        type_counts['CDS'] = len(genome.get('cdss', []))
        type_counts['protein_encoding_gene'] = len(genome['features'])
        type_counts['non_coding_features'] = len(
            genome.get('non_coding_features', []))
        genome['feature_counts'] = type_counts

        return genome

    @staticmethod
    def validate_genome(g):
        """
        Run a series of checks on the genome object and return any warnings
        """
        def _get_size(obj):
            return sys.getsizeof(json.dumps(obj))

        def sizeof_fmt(num):
            for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
                if abs(num) < 1024.0:
                    return "%3.1f %sB" % (num, unit)
                num /= 1024.0
            return "%.1f %sB" % (num, 'Yi')

        allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'}

        log('Validating genome object contents')
        warnings = g.get('warnings', [])

        # this will fire for some annotation methods like PROKKA
        if g['domain'] == "Bacteria" and len(g.get('cdss', [])) != len(
                g['features']):
            warnings.append(
                "For prokaryotes, CDS array should generally be the"
                " same length as the Features array.")

        if g['domain'] == "Eukaryota" and len(g.get('features', [])) == len(
                g.get('cdss', [])):
            warnings.append(
                "For Eukaryotes, CDS array should not be the same "
                "length as the Features array due to RNA splicing.")

        if "molecule_type" in g and g['molecule_type'] not in {
                "DNA", 'ds-DNA'
        }:
            if g.get('domain', '') not in {'Virus', 'Viroid'} and \
                            g['molecule_type'] not in {"DNA", 'ds-DNA'}:
                warnings.append("Genome molecule_type {} is not expected "
                                "for domain {}.".format(
                                    g['molecule_type'], g.get('domain', '')))

        if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers:
            warnings.append("Undefined terms in genome_tiers: " +
                            ", ".join(set(g['genome_tiers']) - allowed_tiers))
        if g['taxon_ref'] == "ReferenceTaxons/unknown_taxon":
            warnings.append('Unable to determine organism taxonomy')

        #MAX_GENOME_SIZE = 1 #300000000 # UNCOMMENT TO TEST FAILURE MODE. Set to size needed
        feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss')
        master_key_sizes = dict()
        # Set want_full_breakdown to True to see a breakdown of the sizes.
        # Keeping this flag off makes standard uploads run faster.
        want_full_breakdown = False
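        # When the serialized genome exceeds MAX_GENOME_SIZE, per-feature
        # dna_sequence fields are stripped in the loop below; the per-key size
        # breakdown is collected only in that case or when want_full_breakdown
        # is set, since it is what gets reported if the genome is still too large.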
        for x in feature_lists:
            if x in g:
                need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE
                if need_to_remove_dna_sequence or want_full_breakdown:
                    feature_type_dict_keys = dict()
                    for feature in g[x]:
                        for feature_key in list(feature.keys()):
                            if feature_key == "dna_sequence" and need_to_remove_dna_sequence:
                                del feature["dna_sequence"]
                            else:
                                if feature_key not in feature_type_dict_keys:
                                    feature_type_dict_keys[feature_key] = 0
                                feature_type_dict_keys[
                                    feature_key] += sys.getsizeof(
                                        feature[feature_key])
                    for feature_key in feature_type_dict_keys:
                        feature_type_dict_keys[feature_key] = sizeof_fmt(
                            feature_type_dict_keys[feature_key])
                    master_key_sizes[x] = feature_type_dict_keys
                print("{}: {}".format(x, sizeof_fmt(_get_size(g[x]))))
        total_size = _get_size(g)
        print("Total size {} ".format(sizeof_fmt(total_size)))
        if want_full_breakdown:
            print(
                "Here is the breakdown of the sizes of feature lists elements : {}"
                .format(str(master_key_sizes)))
        if total_size > MAX_GENOME_SIZE:
            print(
                "Here is the breakdown of the sizes of feature lists elements : {}"
                .format(str(master_key_sizes)))
            raise ValueError(
                "This genome size of {} exceeds the maximum permitted size of {}.\nHere "
                "is the breakdown for feature lists and their respective sizes:\n{}"
                .format(sizeof_fmt(total_size), sizeof_fmt(MAX_GENOME_SIZE),
                        str(master_key_sizes)))
        return warnings
Example #15
def UploadFrommfmd(callback_url, params):
        """
        :param params: instance of type "UploadmfmdInParams" -> structure:
           parameter "path" of String, parameter "ws_name" of String,
           parameter "obj_name" of String
        :returns: instance of type "UploadOutput" -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFrommfmd
        print('Extracting motifs')
        #motifList = MFU.parse_mfmd_output(params['path'])
        motifList = parse_mfmd_output(params['path'])
        print(motifList)
       
        MSO = motifList
        '''MSO['Condition'] = 'Temp'
        MSO['SequenceSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = MSU.GetBackground()
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0
        
        #MSU.parseMotifList(motifList,MSO)'''
        
        '''params['min_len']=22   #put dummy value for min and max len
        params['max_len']=22
        #MSU.CheckLength(motifList,params['min_len'],params['max_len'])
        #MSU.CheckLength(MSO,params['min_len'],params['max_len'])
        
        
        for motif in MSO['Motifs']:
            print()
            for letter in MSO['Alphabet']:
                if len(motif['PWM'][letter]) != len(motif['Iupac_sequence']):
                    print('CAUGHT PWM ERROR HERE')
                    exit(1)
        if 'absolute_locations' in params:
            for motif in MSO['Motifs']:
                for loc in motif['Motif_Locations']:
                    if loc['sequence_id'] in params['absolute_locations']:
                        loc['sequence_id'] = params['contig']
                        absStart = int(params['start'])
                        loc['start'] = absStart
                        loc['end'] = absStart + loc['end']
        print("test2")'''
        
        dfu = DataFileUtil(callback_url)
        save_objects_params = {
            'id': dfu.ws_name_to_id(params['ws_name']),
            'objects': [{'type': 'KBaseGeneRegulation.MotifSet',
                         'data': MSO, 'name': params['obj_name']}]
        }

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
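        # Workspace object_info tuple: index 0 is the object id, 4 the version
        # and 6 the workspace id, hence the wsid/objid/version reference below.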
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref' : motif_set_ref}
        print(output)

        
        #exit("test")
        #END UploadFrommfmd

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFrommfmd return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #16
class IntegrateAppImpl:
    @staticmethod
    def _validate_params(params, required, optional=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        required = set(required)
        optional = set(optional)
        pkeys = set(params)
        if required - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(required - pkeys)))
        defined_param = required | optional
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _build_figure(self, file_path, figure_matrix):

        # Make figure matrix html file and embed
        file_name = 'integrated_scatterplot_output.html'
        figure_html_path = os.path.join(file_path, file_name)
        output_file(figure_html_path)
        save(grid(figure_matrix))

        return file_name

    def _build_table(self, table_dict, stats_df):

        html_lines = list()
        html_lines.append('<table class="table table-bordered table-striped">')

        header_list = [
            "Enzymes", "Compartments", "Reactions", "EC numbers", "Subsystems"
        ] + self.conditions_ids + ["Mahalanobis distance", "p-value"]

        html_lines.append('<thead>')
        internal_header_line = "</td><td>".join(header_list)
        html_lines.append('<tr><td>' + internal_header_line + '</td></tr>')
        html_lines.append('</thead>')

        html_lines.append("<tbody>")
        print_row = True
        for complex_row in sorted(table_dict.keys()):
            print_row = True
            cpts = ", ".join(sorted(list(table_dict[complex_row])))

            ecs = []
            subsystems = []
            reactions = []
            conditions = []
            mahal_list = []
            pvalue_list = []
            mahalanobis_dist = "0.00"
            pvalue = "0.00"
            for cpt in table_dict[complex_row]:
                for rxn in table_dict[complex_row][cpt]:

                    if (rxn not in reactions):
                        reactions.append(rxn)

                    if (len(conditions) == 0):
                        conditions = table_dict[complex_row][cpt][rxn]

                    if (rxn in self.reactions_data):
                        for ss in self.reactions_data[rxn]['subsystems']:
                            ss = ss.replace("_", " ")
                            ss = ss.replace(" in plants", "")
                            if (ss not in subsystems):
                                subsystems.append(ss)

                        for ec in self.reactions_data[rxn]['ecs']:
                            if (ec not in ecs):
                                ecs.append(ec)

                    str_md = "0.00"
                    str_pv = "0.00"
                    if (rxn + '_' + cpt not in stats_df.index):
                        print("MISSING REACTION: ", complex_row,
                              rxn + "_" + cpt)
                        print_row = False
                    else:
                        str_md = "{0:.2f}".format(
                            stats_df.loc[rxn + '_' + cpt]['mahalanobis'])
                        str_pv = "{0:.2f}".format(stats_df.loc[rxn + '_' +
                                                               cpt]['pvalue'])
                        if (str_pv == "0.00"):
                            str_pv = "{0:.2e}".format(
                                stats_df.loc[rxn + '_' + cpt]['pvalue'])
                        if (mahalanobis_dist != "0.00"
                                and str_md != mahalanobis_dist):
                            print(
                                "WARNING: CHANGING STATS FOR SAME PROTEIN COMPLEXES\n"
                            )
                            print(
                                "===================================================\n\n"
                            )
                            print(complex_row, cpts, rxn, conditions,
                                  stats_df.loc[rxn + '_' + cpt]['mahalanobis'],
                                  mahalanobis_dist, "\n")
                            print(
                                "===================================================\n\n"
                            )

                    mahalanobis_dist = str_md
                    pvalue = str_pv

            reactions = ", ".join(sorted(reactions))
            subsystems = ", ".join(sorted(subsystems))
            ecs = ", ".join(sorted(ecs))

            conditions_strings = list()
            for i in range(len(conditions)):
                conditions[i][0] = "{0:.2f}".format(conditions[i][0])
                conditions_strings.append(" | ".join(conditions[i]))

            # some complexes may have zero features predicted
            if (print_row is True):
                html_lines.append("<tr>")
                internal_row_line = "</td><td>".join(
                    [complex_row, cpts, reactions, ecs, subsystems] +
                    conditions_strings + [mahalanobis_dist, pvalue])
                html_lines.append("<td>" + internal_row_line + "</td>")
                html_lines.append("</tr>")

        html_lines.append("</tbody>")
        html_lines.append("</table>")

        return "\n".join(html_lines)

    def _build_report(self, figure_matrix, table_dict, stats_df,
                      saved_object_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        # Make report directory and copy over files
        report_file_path = os.path.join(self.scratch, self.report_uuid)
        os.mkdir(report_file_path)

        table_html_string = self._build_table(table_dict, stats_df)

        if (len(self.conditions_ids) > 1):
            figure_html_file = self._build_figure(report_file_path,
                                                  figure_matrix)
            output_html_files = self._generate_report_html(
                report_file_path,
                figure_html_file=figure_html_file,
                table_string=table_html_string)
        else:
            output_html_files = self._generate_report_html(
                report_file_path, table_string=table_html_string)

        report_params = {
            'direct_html_link_index':
            0,  #Use to refer to index of 'html_links'
            'workspace_name': workspace_name,
            'report_object_name': 'plant_fba_' + self.report_uuid,
            'objects_created': saved_object_list,
            'html_links': output_html_files
        }

        output = self.kbr.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}

    def _generate_report_html(self,
                              file_path,
                              figure_html_file=None,
                              table_string=None):
        """
            _generate_report: generates the HTML for the upload report
        """
        html_report_list = list()

        ##############################################################
        # Write table html file
        ##############################################################
        # Read in template html
        with open(
                os.path.join(
                    '/kb/module/data', 'app_report_templates',
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and Insert html title
        title_string = "-".join(
            [self.input_params['input_expression_matrix']] +
            self.conditions_ids)
        report_template_string = report_template_string.replace(
            '*TITLE*', title_string)

        # Insert html table
        table_report_string = report_template_string.replace(
            '*TABLES*', table_string)

        # Write html file
        table_html_file = "integrated_table_output.html"
        with open(os.path.join(file_path, table_html_file), 'w') as table_file:
            table_file.write(table_report_string)

        ##############################################################
        # Write summary index.html file
        ##############################################################
        # Begin composing html
        html_lines = list()
        html_lines.append(
            '<h3 style="text-align: center">Integrate Abundances with Metabolism Report</h3>'
        )
        html_lines.append(
            "<p>The \"Integrate Abundances with Metabolism\" app has finished running.</br>"
        )
        html_lines.append("The app integrated the values from the <b>" +
                          self.input_params['input_expression_matrix'] +
                          "</b> ExpressionMatrix")
        html_lines.append(" with the <b>" +
                          self.input_params['input_fbamodel'] +
                          "</b> FBAModel</br>")
        html_lines.append(
            "Specifically, the app integrated the values from these chosen conditions in the ExpressionMatrix: <b>"
            + "</b>, <b>".join(self.conditions_ids) + "</b></br>")
        html_lines.append(
            "The results of the integration are stored in the <b>" +
            self.input_params['output_reaction_matrix'] +
            "</b> ReactionMatrix.</p><br/>")
        html_lines.append(
            'The results of the integration are also tabulated in this <a href="'
            + table_html_file + '" target="_blank">Table</a></br>')

        if (len(self.conditions_ids) > 1):
            html_lines.append(
                'The results of the integration can be also be visualized in these <a href="'
                + figure_html_file + '" target="_blank">Scatterplots</a>')

        # Read in template html
        with open(
                os.path.join('/kb/module/data', 'app_report_templates',
                             'integrate_abundances_report_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Insert html
        summary_report_string = report_template_string.replace(
            '*TEXT*', "\n".join(html_lines))

        summary_html_file = "index.html"
        with open(os.path.join(file_path, summary_html_file),
                  'w') as index_file:
            index_file.write(summary_report_string)

        ##############################################################
        # Upload files and compose html report object
        ##############################################################
        # Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': file_path,
            'pack': 'zip'
        })

        # HTML Link objects
        html_link = dict()
        # Index
        # html_link = {'shock_id' : upload_info['shock_id'],
        #              'name' : summary_html_file,
        #              'label' : 'HTML report for integrate_abundances_with_metabolism app',
        #              'description' : 'HTML report for integrate_abundances_with_metabolism app'}
        # html_report_list.append(html_link)

        if (len(self.conditions_ids) > 1):
            # Figures
            html_link = {
                'shock_id':
                upload_info['shock_id'],
                'name':
                figure_html_file,
                'label':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app',
                'description':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app'
            }
            html_report_list.append(html_link)

        # Table
        html_link = {
            'shock_id':
            upload_info['shock_id'],
            'name':
            table_html_file,
            'label':
            'HTML table generated by Integrate Abundances with Metabolism app',
            'description':
            'HTML table generated by Integrate Abundances with Metabolism app'
        }
        html_report_list.append(html_link)

        return html_report_list

    def _load_fbamodel(self, model_ref):

        model_obj = self.dfu.get_objects({'object_refs':
                                          [model_ref]})['data'][0]
        print("Number of reactions: " +
              str(len(model_obj['data']['modelreactions'])))

        model_reaction_lookup_dict = dict()
        for index in range(len(model_obj['data']['modelreactions'])):
            model_reaction_lookup_dict[model_obj['data']['modelreactions']
                                       [index]['id']] = index

        return [model_obj, model_reaction_lookup_dict]

    def _load_expression_matrix(self, expdata_ref):

        expdata_obj = self.dfu.get_objects({'object_refs':
                                            [expdata_ref]})['data'][0]
        conditions_ids = expdata_obj['data']['data']['col_ids']
        features_ids = expdata_obj['data']['data']['row_ids']

        feature_lookup_dict = dict()
        for index in range(len(features_ids)):
            feature_lookup_dict[features_ids[index]] = index

        condition_lookup_dict = dict()
        for index in range(len(conditions_ids)):
            condition_lookup_dict[conditions_ids[index]] = index

        if (len(self.conditions_ids) == 0):
            self.conditions_ids = conditions_ids

        return [
            expdata_obj, features_ids, feature_lookup_dict,
            condition_lookup_dict
        ]

    def _compile_genome_scores(self, data, conditions_indices):

        Feature_Comparison_Dict = dict()
        for feature_index in range(len(data)):

            scores_dict = dict()
            for condition in self.conditions_ids:
                condition_index = conditions_indices[condition]

                #Retrieve value from 2D matrix
                score = data[feature_index][condition_index]

                #Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                if (str_score == "0.00"):
                    continue

                scores_dict[condition] = score

            #Here we skip features where there aren't enough scores (should be same number of conditions)
            if (len(scores_dict) < len(self.conditions_ids)):
                continue

            for condition in scores_dict:

                if (condition not in Feature_Comparison_Dict):
                    Feature_Comparison_Dict[condition] = list()

                Feature_Comparison_Dict[condition].append(
                    scores_dict[condition])

        return Feature_Comparison_Dict

    def _compile_model_scores_percentiles(self, data):

        # I want to compute percentile rank for each feature under each condition
        # The score_reaction_dict built below is used to "bin" identical scores
        # (to two decimal points, can be changed)

        # First, we iterate through the conditions for computing percentile rank
        # for each condition
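        # Illustrative example of the mid-rank percentile computed below: with
        # binned scores [0.50, 0.50, 1.20, 2.00], bin "0.50" gets
        # (0 + 2/2) / 4 = 0.25, "1.20" gets (2 + 1/2) / 4 = 0.625 and
        # "2.00" gets (3 + 1/2) / 4 = 0.875.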
        model_conditions_score_lists = dict()
        model_conditions_score_pct_dicts = dict()
        for condition_index in range(len(self.conditions_ids)):
            condition = self.conditions_ids[condition_index]

            # For each condition, we "bin" the scores

            score_reaction_dict = dict()
            score_reaction_list = list()
            # The counting of features is done independently because we skip scores of zero
            # (which would affect how the percentile rank distributes)
            n_ftrs = 0
            for reaction_index in range(len(data)):

                # Retrieve value from 2D matrix
                score = data[reaction_index][condition_index]

                # Many reactions are not assigned a score, and instead have a default tiny score
                if (score == float(-sys.maxsize - 1)):
                    continue

                # Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                # I skip the relatively large number of reactions that have a value of zero
                # to prevent the computation of the percentile rank skewing towards zero
                if (str_score == "0.00"):
                    continue

                n_ftrs += 1
                if (str_score not in score_reaction_dict):
                    score_reaction_dict[str_score] = list()
                score_reaction_dict[str_score].append(reaction_index)
                score_reaction_list.append(float(str_score))

            model_conditions_score_lists[condition] = score_reaction_list

            # Then for each condition, we use the binned scores to compute
            # percentile rank
            if (condition not in model_conditions_score_pct_dicts):
                model_conditions_score_pct_dicts[condition] = dict()

            sorted_scores = sorted(score_reaction_dict.keys(), key=float)
            less_than_score_ftrs_count = 0
            for score_index in range(len(sorted_scores)):

                n_score_ftrs = len(
                    score_reaction_dict[sorted_scores[score_index]])
                half_n_score_ftrs = float(n_score_ftrs) * 0.5
                cumulative_n_score_ftrs = float(
                    less_than_score_ftrs_count) + half_n_score_ftrs
                percentile_rank = cumulative_n_score_ftrs / float(n_ftrs)

                less_than_score_ftrs_count += len(
                    score_reaction_dict[sorted_scores[score_index]])
                model_conditions_score_pct_dicts[condition][
                    sorted_scores[score_index]] = percentile_rank

        # This next part of the code is to re-iterate through the data and to compose the dicts
        # that become ColumnDataStores, and also with default values

        # The reaction_percentile_comparison_dict is for the reaction percentile plot
        reaction_percentile_comparison_dict = dict()
        if ('All' not in reaction_percentile_comparison_dict):
            reaction_percentile_comparison_dict['All'] = dict()

        # The reaction_score_comparison_dict works for the genome features plot
        reaction_score_comparison_dict = dict()

        for reaction_index in range(len(data)):

            scores_dict = dict()
            for condition_index in range(len(self.conditions_ids)):
                condition = self.conditions_ids[condition_index]

                #Retrieve value from 2D matrix
                score = data[reaction_index][condition_index]

                #Many reactions are not assigned a score, and instead have a default tiny score
                if (score == float(-sys.maxsize - 1)):
                    continue

                scores_dict[condition] = score

            #Here we skip reactions where there aren't enough scores (should be same number of conditions)
            if (len(scores_dict) < len(self.conditions_ids)):
                continue

            for condition in scores_dict:

                # Collect reaction scores
                if (condition not in reaction_score_comparison_dict):
                    reaction_score_comparison_dict[condition] = list()
                reaction_score_comparison_dict[condition].append(
                    scores_dict[condition])

                # Collect reaction percentiles
                if (condition
                        not in reaction_percentile_comparison_dict['All']):
                    reaction_percentile_comparison_dict['All'][
                        condition] = list()

                #Force into string for easier comparison
                str_score = "{0:.2f}".format(scores_dict[condition])

                #We skip zero scores when computing the percentiles
                #So we have to check for them here
                condition_pct = 0.00
                if (str_score != '0.00'):
                    condition_pct = model_conditions_score_pct_dicts[
                        condition][str_score]
                reaction_percentile_comparison_dict['All'][condition].append(
                    condition_pct)

                if ('reactions'
                        not in reaction_percentile_comparison_dict['All']):
                    reaction_percentile_comparison_dict['All'][
                        'reactions'] = list()
                if(self.reactions_ids[reaction_index] not in \
                       reaction_percentile_comparison_dict['All']['reactions']):
                    reaction_percentile_comparison_dict['All'][
                        'reactions'].append(self.reactions_ids[reaction_index])

                base_rxn = self.reactions_ids[reaction_index].split('_')[0]
                for ss in self.reactions_data[base_rxn]['subsystems']:
                    if (ss not in reaction_percentile_comparison_dict):
                        reaction_percentile_comparison_dict[ss] = dict()
                    if (condition
                            not in reaction_percentile_comparison_dict[ss]):
                        reaction_percentile_comparison_dict[ss][
                            condition] = list()
                    reaction_percentile_comparison_dict[ss][condition].append(
                        condition_pct)

                    if ('reactions'
                            not in reaction_percentile_comparison_dict[ss]):
                        reaction_percentile_comparison_dict[ss][
                            'reactions'] = list()
                    if(self.reactions_ids[reaction_index] not in \
                           reaction_percentile_comparison_dict[ss]['reactions']):
                        reaction_percentile_comparison_dict[ss][
                            'reactions'].append(
                                self.reactions_ids[reaction_index])

            self.mh_reactions_ids.append(self.reactions_ids[reaction_index])

        # We set the default values here at the end of the loop because we don't know
        # how many reactions there will be for each category
        for category in reaction_percentile_comparison_dict:
            for key in ['color', 'size', 'tooltip', 'fill_alpha']:
                reaction_percentile_comparison_dict[category][key] = list()

            for index in range(
                    len(reaction_percentile_comparison_dict[category][
                        self.conditions_ids[0]])):

                reaction_percentile_comparison_dict[category][
                    'fill_alpha'].append(1.0)

                # format string of subsystems for tooltip
                rxn = reaction_percentile_comparison_dict[category][
                    'reactions'][index]
                base_rxn = rxn.split('_')[0]
                ss_string = ", ".join(
                    self.reactions_data[base_rxn]['subsystems'])
                reaction_percentile_comparison_dict[category][
                    'tooltip'].append(rxn + ", " + ss_string)

                if (category == 'All'):

                    reaction_percentile_comparison_dict[category][
                        'color'].append('black')
                    reaction_percentile_comparison_dict[category][
                        'size'].append(6)

                else:

                    reaction_percentile_comparison_dict[category][
                        'color'].append('red')
                    reaction_percentile_comparison_dict[category][
                        'size'].append(8)

        return [
            reaction_score_comparison_dict, reaction_percentile_comparison_dict
        ]

    def _compile_mahalanobis_dist_pvalue(self, data, threshold):

        data_df = pd.DataFrame(data,
                               columns=self.conditions_ids,
                               index=self.mh_reactions_ids)

        # I don't know the math well enough to follow what's going on, but I used
        # the recipe described here:
        # https://www.machinelearningplus.com/statistics/mahalanobis-distance/
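        # In short, the squared Mahalanobis distance computed below is
        # D^2 = (x - mu) Cov^-1 (x - mu)^T, evaluated for every row via the two
        # dot products, with the per-row values read off the diagonal.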

        # Covariance matrix via numpy
        cov_mat = np.cov(data_df.values.T)

        # Inverse covariance matrix via scipy.linalg
        # It won't accept a 1x1 matrix hence the if/else
        if (len(self.conditions_ids) > 1):
            inv_cov_mat = sp.linalg.inv(cov_mat)
        else:
            inv_cov_mat = 1 / cov_mat

        # two terms required, second using dot product
        data_minus_mean = data_df - np.mean(data_df)
        left_term = np.dot(data_minus_mean, inv_cov_mat)

        # dot product
        mahalanobis = np.dot(left_term, data_minus_mean.T)
        data_df['mahalanobis'] = mahalanobis.diagonal()

        # chi-squared p-values with one degree of freedom (two sets of variables)
        data_df['pvalue'] = 1 - sp.stats.chi2.cdf(data_df['mahalanobis'], 1)

        # find the outliers below a given threshold, i.e. p < 0.01
        outliers = data_df.loc[data_df.pvalue < threshold]
        # this is used when you want to just plot the p-values alone
        data_df.index.name = 'reactions'
        outliers.index.name = 'reactions'

        #Need to return the mapping between reactions and the p-values
        return [data_df, outliers]

    def _integrate_abundances(self, model_obj, feature_lookup_dict,
                              expdata_obj, condition_indices):

        reaction_values_matrix = list()
        reactions_ids = list()
        minmax_expscore_dict = dict()
        model_complexes_dict = dict()
        fh = open(self.scratch + '/output.txt', 'w')
        fh2 = open(self.scratch + '/rxn01486.txt', 'w')
        print_data = False
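        # Integration rule applied by the nested loops below: a subunit score
        # is the maximum over its features, a protein complex score is the
        # minimum over its subunits, and the reaction score is the maximum
        # over its protein complexes.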
        for mdlrxn in range(len(model_obj['data']['modelreactions'])):
            mdlrxn_obj = model_obj['data']['modelreactions'][mdlrxn]
            reactions_ids.append(mdlrxn_obj['id'])
            [base_rxn, cpt_id] = mdlrxn_obj['id'].split('_')

            #            if(base_rxn == 'rxn01486' or base_rxn == 'rxn37610'):
            #                print_data=True

            rxndata_row = list()
            for condition in self.conditions_ids:
                if (condition not in minmax_expscore_dict):
                    minmax_expscore_dict[condition] = {
                        'max': -sys.maxsize - 1,
                        'min': sys.maxsize
                    }

                condition_index = condition_indices[condition]

                # Maximal gene expression for a reaction
                reaction_score = ['nan', ""]
                prots_str_list = list()
                for prt in mdlrxn_obj['modelReactionProteins']:

                    # Minimal gene expression for a complex
                    complex_score = ['nan', ""]
                    subs_str_list = list()
                    for sbnt in prt['modelReactionProteinSubunits']:

                        # Maximal gene expression for a subunit
                        subunit_score = ['nan', ""]
                        ftrs_str_list = list()
                        for feature in sbnt['feature_refs']:
                            feature = feature.split('/')[-1]
                            ftrs_str_list.append(feature)
                            feature_index = feature_lookup_dict[feature]

                            ftr_score = expdata_obj['data']['data']['values'][
                                feature_index][condition_index]

                            if (print_data is True):
                                fh2.write(mdlrxn_obj['id'] + ':' + feature +
                                          ':' + str(ftr_score) + '\n')

                            if (ftr_score <
                                    minmax_expscore_dict[condition]['min']):
                                minmax_expscore_dict[condition][
                                    'min'] = ftr_score

                            if (ftr_score >
                                    minmax_expscore_dict[condition]['max']):
                                minmax_expscore_dict[condition][
                                    'max'] = ftr_score

                            # Maximal gene expression for a subunit
                            if (subunit_score[0] == 'nan'
                                    or subunit_score[0] < ftr_score):
                                subunit_score = [ftr_score, feature]

                        if (print_data is True):
                            fh2.write(str(subunit_score) + '\n')

                        ftr_str = "(" + ", ".join(ftrs_str_list) + ")"
                        subs_str_list.append(ftr_str)

                        # Minimal gene expression for a complex
                        if (subunit_score[0] != 'nan'):
                            if (complex_score[0] == 'nan'
                                    or complex_score[0] > subunit_score[0]):
                                complex_score[0] = subunit_score[0]
                                complex_score[1] = subunit_score[1]

                    if (print_data is True):
                        fh2.write(str(complex_score) + '\n')

                    sub_str = "[" + ", ".join(subs_str_list) + "]"
                    prots_str_list.append(sub_str)

                    # Maximal gene expression for a reaction
                    if (complex_score[0] != 'nan'):
                        if (reaction_score[0] == 'nan'
                                or reaction_score[0] < complex_score[0]):
                            reaction_score[0] = complex_score[0]
                            reaction_score[1] = complex_score[1]

                if (reaction_score[0] == 'nan'):
                    reaction_score[0] = float(-sys.maxsize - 1)

                if (print_data is True):
                    fh2.write(condition + ':' + str(reaction_score[0]) + '(' +
                              reaction_score[1] + ')\n')

                #Putting together dict for table
                proteins_string = ', '.join(prots_str_list)
                if (len(prots_str_list) > 0 and proteins_string != "[]"
                        and proteins_string != "[()]"):
                    if (proteins_string not in model_complexes_dict):
                        model_complexes_dict[proteins_string] = dict()
                    if (cpt_id not in model_complexes_dict[proteins_string]):
                        model_complexes_dict[proteins_string][cpt_id] = dict()
                    if (base_rxn not in model_complexes_dict[proteins_string]
                        [cpt_id]):
                        model_complexes_dict[proteins_string][cpt_id][
                            base_rxn] = list()
                    fh.write('\t'.join([
                        condition, proteins_string, cpt_id, base_rxn,
                        str(reaction_score[0]), reaction_score[1], '\n'
                    ]))
                    model_complexes_dict[proteins_string][cpt_id][
                        base_rxn].append(reaction_score)

                rxndata_row.append(reaction_score[0])

            print_data = False

            reaction_values_matrix.append(rxndata_row)

        fh.close()
        fh2.close()

        self.reactions_ids = reactions_ids
        return (reaction_values_matrix, model_complexes_dict)

    def __init__(self, config, ctx, input_params):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)

        self.scratch = config['scratch']
        self.report_uuid = str(uuid.uuid4())

        # There is a bug in the UI that won't let me collect
        # a clean list of conditions, so I have to parse them
        # from a comma-separated string
        if ("input_columns" in input_params
                and input_params["input_columns"] != ""):
            input_params["input_columns"] = \
                input_params["input_columns"].split(',')

        self.input_params = input_params

        # set in _load_expression_matrix()
        self.conditions_ids = list()

        # this is an optional parameter, but restricts the
        # number of chosen columns in the matrix
        if ('input_columns' in input_params
                and len(input_params['input_columns']) > 0):
            self.conditions_ids = input_params['input_columns']

        # set in _integrate_abundances()
        self.reactions_ids = list()

        # set in _compile_model_scores_percentiles
        self.mh_reactions_ids = list()

        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        self.reactions_data = plantseed.fetch_reactions(PS_Roles)

    def integrate_abundances_with_metabolism(self):

        self._validate_params(
            self.input_params, {
                'input_ws', 'input_fbamodel', 'input_expression_matrix',
                'output_reaction_matrix'
            }, {'input_columns'})

        ##############################################################
        # Load model and expression objects
        ##############################################################
        model_ref = self.input_params['input_ws'] + '/' + self.input_params[
            'input_fbamodel']
        [model_obj, reaction_index] = self._load_fbamodel(model_ref)

        # The columns / conditions_ids are set in this function if not set via user parameter
        expression_ref = self.input_params[
            'input_ws'] + '/' + self.input_params['input_expression_matrix']
        [expdata_obj, features_ids, feature_index,
         condition_index] = self._load_expression_matrix(expression_ref)

        ##############################################################
        # Extract expression abundances for use in first scatter plot
        ##############################################################
        feature_comparison_dict = self._compile_genome_scores(
            expdata_obj['data']['data']['values'], condition_index)

        ####################################################################
        # Actually integrate abundances and build new ReactionMatrix object
        ####################################################################
        (reaction_values_matrix,
         model_complexes_dict) = self._integrate_abundances(
             model_obj, feature_index, expdata_obj, condition_index)

        rxndata_obj = {
            'row_ids': self.reactions_ids,
            'col_ids': self.conditions_ids,
            'values': reaction_values_matrix
        }

        ##########################################################################################
        # Extract / organize reaction expression scores for use in first and second scatter plot
        ##########################################################################################
        [reaction_scores_dict, reaction_percentiles_dict
         ] = self._compile_model_scores_percentiles(reaction_values_matrix)

        #############################################################################################################
        # Multi-variate mahalanobis distances computed along with outliers depending on chi-squared p-value of 0.01
        #############################################################################################################
        [mahal_dist_df, outliers] = self._compile_mahalanobis_dist_pvalue(
            reaction_percentiles_dict['All'], 0.01)

        ##############################################################
        # Figure generator
        ##############################################################
        subsystem_select_list = ["None"]
        for category in sorted(list(reaction_percentiles_dict.keys())):
            if (category == 'All'):
                continue
            subsystem_select_list.append(category)

            for rxn_idx in range(
                    len(reaction_percentiles_dict[category]['reactions'])):
                rxn = reaction_percentiles_dict[category]['reactions'][rxn_idx]
                pval = mahal_dist_df.loc[rxn]['pvalue']
                # The fill_alpha adjustment below is currently disabled, so pval is computed but unused
                # reaction_percentiles_dict[category]['fill_alpha'][rxn_idx] = 1-pval

        figure_generator = GenerateFigureImpl()
        figure_grid = figure_generator.generate_figure(
            self.conditions_ids,
            category_select=subsystem_select_list,
            genome_features=feature_comparison_dict,
            reaction_scores=reaction_scores_dict,
            reaction_percentiles=reaction_percentiles_dict)

        ##############################################################
        # Finishing and Saving ReactionMatrix
        ##############################################################
        ReactionMatrix_obj = {
            'type': 'KBaseMatrices.ReactionMatrix',
            'name': self.input_params['output_reaction_matrix'],
            'data': {
                'scale': 'raw',
                'description': 'reaction expression score',
                'fbamodel_ref': model_ref,
                'expression_ref': expression_ref,
                'data': rxndata_obj
            }
        }

        ws_id = self.dfu.ws_name_to_id(self.input_params['input_ws'])
        saved_matrix_dict = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [ReactionMatrix_obj]
        })[0]
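        # save_objects returns a list of object_info tuples; indices 6, 0 and 4 hold the
        # workspace id, object id and version, which form the "wsid/objid/version" reference.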
        saved_matrix_ref = "{}/{}/{}".format(saved_matrix_dict[6],
                                             saved_matrix_dict[0],
                                             saved_matrix_dict[4])
        saved_matrix_desc = "Reaction matrix: " + self.input_params[
            'output_reaction_matrix']

        #####################################################################
        # Building the report with figures, tables, and saved_objects (to be improved)
        # We pass in a dict where each key is a row for the table
        #####################################################################

        output_object_files = list()
        output_object_files.append({
            'ref': saved_matrix_ref,
            'description': saved_matrix_desc
        })

        return self._build_report(figure_grid, model_complexes_dict,
                                  mahal_dist_df, output_object_files,
                                  self.input_params['input_ws'])
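
# Standalone illustrative sketch (not part of IntegrateAppImpl above) of how multivariate
# Mahalanobis distances can be converted to chi-squared p-values to flag outlier reactions,
# mirroring the _compile_mahalanobis_dist_pvalue step; the helper name and data are hypothetical.
import numpy as np
from scipy.stats import chi2


def mahalanobis_outliers(data, alpha=0.01):
    """Return squared Mahalanobis distances, p-values and an outlier mask for each row of data."""
    X = np.asarray(data, dtype=float)
    diff = X - X.mean(axis=0)
    # Pseudo-inverse tolerates a singular covariance matrix
    inv_cov = np.linalg.pinv(np.cov(X, rowvar=False))
    # Squared Mahalanobis distance of each row from the mean
    d_squared = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
    # Under multivariate normality, d_squared follows a chi-squared distribution
    # with degrees of freedom equal to the number of columns
    pvalues = chi2.sf(d_squared, df=X.shape[1])
    return d_squared, pvalues, pvalues < alpha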
Example #17
    def MotifEnsemble(self, ctx, params):
        """
        :param params: instance of type "EnsembleParams" (Internal workflow:
           1. Input - list of motifsets , workspace, threshold consensus 2.
           Download MotifSets -> Utils function 3. Assign motif ids by
           position in list Use refs to identify MSOs internally! Dictionary
           of motifsets key: ref, val set list of match sets: each item in
           the set is a tuple of (ref,index) for each motifset: <- enumerate
           to avoid duplicate for each motif in motifset for each other
           motifset: <- enumerate to avoid duplicate for each motif in other:
           compare(motif1,motif2): if motifs same: search list of sets for
           motif1: if found add  motif2 if not in if not found search list of
           sets for motif2: if found add motif1 else add a new set with
           motif1 + motif2) -> structure: parameter "motifset_refs" of list
           of String, parameter "workspace_name" of String, parameter
           "threshold" of Double, parameter "proportion" of Double
        :returns: instance of type "Ensemble_out" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN MotifEnsemble
        #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)

        dms = DownloadMotifSets()
        MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'],
                                            self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])
        fmu = FastaUtils()
        for i, MSR1 in enumerate(MotifSetDict.keys()):
            for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k, MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l, motif2 in enumerate(
                                MotifSetDict[MSR2]['Motifs']):
                            print(motif1)
                            print(motif2)
                            print(threshold)
                            if fmu.CompareMotifsBP(motif1, motif2, threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m, mset in enumerate(matchSets):
                                    if (MSR1, j) in mset:
                                        found1 = True
                                        index1 = m
                                    if (MSR2, l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1, j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2, l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        # set.union() returns a new set; merge in place instead
                                        matchSets[index1].update(
                                            matchSets[index2])
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append(
                                        set([(MSR1, j), (MSR2, l)]))
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
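        # Keep a match set only if it contains motifs from at least a 'proportion'
        # fraction of the input MotifSets.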
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i, mset in enumerate(matchSets):
            uniqueRefs = set(motif_ref for motif_ref, _ in mset)
            if float(len(uniqueRefs)) / numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))

        ESO = {}
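        # Seed the ensemble's shared metadata (condition, sequence set, alphabet, background)
        # from the first downloaded MotifSet; the inputs are assumed to share these fields.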
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = fmu.merge(matchSets[keep], MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))

        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': ESO,
            'name': 'EnsembleMotifSet'
        }]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        mr = MakeNewReport()
        mr.MakeReport(htmlDir, ESO)

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as error:
            raise ValueError('error uploading HTML file to shock: ' +
                             str(error))

        reportName = 'MotifEnsemble_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by MotifEnsemble'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_link_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'Ensemble motif report'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        out = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END MotifEnsemble

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method MotifEnsemble return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
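
# Standalone illustrative sketch (hypothetical helper and data, not part of the module above)
# of the match-set clustering used in MotifEnsemble: pairwise motif matches are folded into a
# list of sets, and two sets are merged whenever their members are matched to each other.
def cluster_matches(pairs):
    """pairs: iterable of ((ref_a, idx_a), (ref_b, idx_b)) tuples judged to be the same motif."""
    match_sets = []
    for a, b in pairs:
        idx_a = next((i for i, s in enumerate(match_sets) if a in s), -1)
        idx_b = next((i for i, s in enumerate(match_sets) if b in s), -1)
        if idx_a == -1 and idx_b == -1:
            match_sets.append({a, b})  # neither motif seen yet: start a new cluster
        elif idx_a == -1:
            match_sets[idx_b].add(a)
        elif idx_b == -1:
            match_sets[idx_a].add(b)
        elif idx_a != idx_b:
            match_sets[idx_a] |= match_sets[idx_b]  # in-place union, then drop the merged set
            match_sets.pop(idx_b)
    return match_sets

# Example: three pairwise matches collapse into one cluster of four motifs.
# cluster_matches([(('msA', 0), ('msB', 1)), (('msB', 1), ('msC', 2)), (('msA', 0), ('msD', 0))])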
Example #18
class plant_fba:
    '''
    Module Name:
    plant_fba

    Module Description:
    A KBase module: plant_fba
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.1.1"
    GIT_URL = "[email protected]:kbaseapps/plant_fba.git"
    GIT_COMMIT_HASH = "6f0b5af5a458c5158b9f0007399653a256edcd14"

    #BEGIN_CLASS_HEADER

    def convert_search_role(self, role):
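        # Normalizes a functional role string for lookups, e.g. (illustrative)
        # "Alcohol dehydrogenase (EC 1.1.1.1)" -> "alcoholdehydrogenase":
        # whitespace is stripped, everything is lowercased, and the EC tag is removed.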

        searchrole = role

        #Remove spaces
        searchrole = searchrole.strip()
        searchrole = searchrole.replace(' ', '')

        #Make all lowercase
        searchrole = searchrole.lower()

        #Remove EC and parentheses
        searchrole = re.sub(r'\(ec[\d-]+\.[\d-]+\.[\d-]+\.[\d-]+\)', '',
                            searchrole)

        return searchrole

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.shared_folder = config['scratch']
        self.config = config
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def integrate_abundances_with_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "IntegrateAbundancesParams"
           (@optional input_columns) -> structure: parameter "input_ws" of
           String, parameter "input_expression_matrix" of String, parameter
           "input_fbamodel" of String, parameter "input_columns" of String,
           parameter "output_reaction_matrix" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN integrate_abundances_with_metabolism

        app = IntegrateAppImpl(self.config, ctx, input_params)
        output_report = app.integrate_abundances_with_metabolism()

        #END integrate_abundances_with_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method integrate_abundances_with_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def reconstruct_plant_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "ReconstructMetabolismParams"
           -> structure: parameter "input_ws" of String, parameter
           "input_genome" of String, parameter "output_ws" of String,
           parameter "output_fbamodel" of String, parameter "template" of
           String, parameter "template_ws" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN reconstruct_plant_metabolism

        #Compile biochemistry information
        abbrev_cpt_dict = dict()
        cpt_name_dict = dict()
        with open('/kb/module/data/compartments.txt') as fh:
            for line in fh.readlines():
                line = line.strip('\r\n')
                array = line.split('\t')
                abbrev_cpt_dict[array[3]] = array[0]
                cpt_name_dict[array[0]] = array[2]

        # Fetch and parse biochemistry data
        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "reactions.json")) as msd_rxn_fh:
            MSD_reactions = json.load(msd_rxn_fh)
        MSD_reactions_dict = dict()
        for entry in MSD_reactions:
            MSD_reactions_dict[entry['id']] = entry

        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "compounds.json")) as msd_rxn_fh:
            MSD_compounds = json.load(msd_rxn_fh)
        MSD_compounds_dict = dict()
        for entry in MSD_compounds:
            MSD_compounds_dict[entry['id']] = entry

        # Retrieve Template, and compile indexes of roles and complexes
        if ('template_ws' not in input_params
                or input_params['template_ws'] == ''):
            input_params['template_ws'] = 'NewKBaseModelTemplates'

        if ('template' not in input_params or input_params['template'] == ''):
            input_params['template'] = 'PlantModelTemplate'

        template_ref = input_params['template_ws'] + '/' + input_params[
            'template']
        template_obj = self.dfu.get_objects({'object_refs':
                                             [template_ref]})['data'][0]

        searchroles_dict = dict()
        roles_dict = dict()
        for role in template_obj['data']['roles']:
            searchrole = self.convert_search_role(role['name'])
            searchroles_dict[searchrole] = role['id']
            roles_dict[role['id']] = role

        complex_dict = dict()
        for cpx in template_obj['data']['complexes']:
            complex_dict[cpx['id']] = cpx

        #Retrieve Genome annotation as dict
        role_cpt_ftr_dict = dict()
        genome_ref = input_params['input_ws'] + '/' + input_params[
            'input_genome']
        genome_obj = self.dfu.get_objects({'object_refs':
                                           [genome_ref]})['data'][0]
        for feature in genome_obj['data']['features']:
            if ('functions' in feature and len(feature['functions']) > 0):
                for function_comment in feature['functions']:
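                    # Annotation strings may carry compartment hints after '#', e.g.
                    # (illustrative) "Alcohol dehydrogenase (EC 1.1.1.1) # chloroplast".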

                    #Split for comments and retrieve compartments
                    function_cpt_list = function_comment.split("#")
                    for i in range(len(function_cpt_list)):
                        function_cpt_list[i] = function_cpt_list[i].strip()

                    function = function_cpt_list.pop(0)
                    roles = re.split(r"\s*;\s+|\s+[@/]\s+", function)
                    for role in roles:

                        searchrole = self.convert_search_role(role)
                        if (searchrole not in searchroles_dict):
                            continue

                        role_id = searchroles_dict[searchrole]

                        if (role_id not in role_cpt_ftr_dict):
                            role_cpt_ftr_dict[role_id] = dict()

                        #Defaults to cytosol
                        if (len(function_cpt_list) == 0):
                            function_cpt_list.append('cytosol')

                        for cpt in function_cpt_list:
                            abbrev_cpt = cpt
                            if (cpt not in abbrev_cpt_dict):
                                print(
                                    "No compartmental abbreviation found for "
                                    + cpt)
                            else:
                                abbrev_cpt = abbrev_cpt_dict[cpt]

                            if (abbrev_cpt not in role_cpt_ftr_dict[role_id]):
                                role_cpt_ftr_dict[role_id][abbrev_cpt] = dict()

                            role_cpt_ftr_dict[role_id][abbrev_cpt][
                                feature['id']] = 1

        #Default dictionaries for objects needed for a model reaction
        default_mdlcpt_dict = {
            'id': 'u0',
            'label': 'unknown',
            'pH': 7,
            'potential': 0,
            'compartmentIndex': 0,
            'compartment_ref': '~//'
        }

        default_mdlcpd_dict = {
            'id': '',
            'charge': 0,
            'formula': '',
            'name': '',
            'compound_ref': '',
            'modelcompartment_ref': '~/modelcompartments/id/u0'
        }

        default_mdlrxn_dict = {
            'id': '',
            'direction': '',
            'protons': 0,
            'name': '',
            'reaction_ref': '',
            'probability': 0,
            'modelcompartment_ref': '',
            'modelReactionReagents': [],
            'modelReactionProteins': []
        }

        #Lookup dictionaries for compartments and compounds, to avoid duplicating them
        mdlcpts_dict = dict()
        mdlcpds_dict = dict()

        #Reaction complexes for the generated table
        rxncplxs_dict = dict()

        #Create New, but Empty Plant Reconstruction
        new_model_obj = {
            'id': input_params['output_fbamodel'],
            'type': "GenomeScale",
            'source': "KBase",
            'source_id': "PlantSEED_v2",
            'template_ref': template_ref,
            'genome_ref': genome_ref,
            'name': input_params['output_fbamodel'],
            'modelreactions': [],
            'modelcompounds': [],
            'modelcompartments': [],
            'biomasses': [],
            'gapgens': [],
            'gapfillings': []
        }

        for template_rxn in template_obj['data']['reactions']:
            if (template_rxn['type'] == 'gapfilling'):
                continue

            template_rxn_cpt = template_rxn['templatecompartment_ref'].split(
                '/')[-1]

            proteins_list = list()
            prots_str_list = list()
            #complex_ref and source are optional fields
            default_protein_dict = {
                'note': template_rxn['type'],
                'complex_ref': '',
                'modelReactionProteinSubunits': []
            }
            for cpx_ref in template_rxn['templatecomplex_refs']:
                cpx_id = cpx_ref.split('/')[-1]
                model_complex_ref = "~/template/complexes/id/" + cpx_id

                new_protein_dict = copy.deepcopy(default_protein_dict)
                new_protein_dict['complex_ref'] = model_complex_ref

                complex_present = False
                subunits_list = list()
                default_subunit_dict = {
                    'role': '',
                    'triggering': 0,
                    'optionalSubunit': 0,
                    'note': '',
                    'feature_refs': []
                }
                matched_role_dict = dict()

                for cpxrole in complex_dict[cpx_id]['complexroles']:
                    role_id = cpxrole['templaterole_ref'].split('/')[-1]

                    if (role_id in role_cpt_ftr_dict):

                        for role_cpt in role_cpt_ftr_dict[role_id]:
                            role_cpt_present = False
                            if (template_rxn_cpt == role_cpt
                                    and cpxrole['triggering'] == 1):
                                complex_present = True
                                role_cpt_present = True

                            if (role_cpt_present == True):
                                new_subunit_dict = copy.deepcopy(
                                    default_subunit_dict)
                                new_subunit_dict['triggering'] = cpxrole[
                                    'triggering']
                                new_subunit_dict['optionalSubunit'] = cpxrole[
                                    'optional_role']
                                new_subunit_dict['role'] = roles_dict[role_id][
                                    'name']

                                if (len(roles_dict[role_id]['features']) > 0):
                                    new_subunit_dict[
                                        'note'] = 'Features characterized and annotated'
                                else:
                                    #This never happens as of Fall 2019
                                    print("Warning: " +
                                          roles_dict[role_id]['name'] +
                                          " is apparently uncharacterized!")
                                    new_subunit_dict[
                                        'note'] = 'Features uncharacterized but annotated'
                                    pass

                                for ftr in role_cpt_ftr_dict[role_id][
                                        role_cpt]:
                                    feature_ref = "~/genome/features/id/" + ftr
                                    new_subunit_dict['feature_refs'].append(
                                        feature_ref)

                                matched_role_dict[role_id] = 1
                                subunits_list.append(new_subunit_dict)

                    if (role_id not in role_cpt_ftr_dict
                            and template_rxn['type'] == 'universal'):
                        #This should still be added, with zero features to indicate the universality of the role in plant primary metabolism
                        new_subunit_dict = copy.deepcopy(default_subunit_dict)
                        new_subunit_dict['triggering'] = cpxrole['triggering']
                        new_subunit_dict['optionalSubunit'] = cpxrole[
                            'optional_role']
                        new_subunit_dict['role'] = roles_dict[role_id]['name']

                        #Un-necessary, but explicitly stated
                        new_subunit_dict['feature_refs'] = []

                        if (len(roles_dict[role_id]['features']) == 0):
                            new_subunit_dict[
                                'note'] = 'Features uncharacterized and unannotated'
                        else:
                            #As of Fall 2019, this includes two reactions
                            new_subunit_dict[
                                'note'] = "Features characterized but unannotated"
                            print("Missing annotation: ", cpx_id, role_id,
                                  roles_dict[role_id])

                        matched_role_dict[role_id] = 1
                        subunits_list.append(new_subunit_dict)

                if (complex_present == True):
                    #Check to see if members of a detected protein complex are missing
                    #and add them if so, to round off the complex
                    #This will only happen to a complex that is conditional (see above)
                    for cpxrole in complex_dict[cpx_id]['complexroles']:
                        role_id = cpxrole['templaterole_ref'].split('/')[-1]

                        if (role_id not in matched_role_dict):
                            print("Gapfilling complex: ", cpx_id,
                                  roles_dict[role_id])
                            new_subunit_dict = copy.deepcopy(
                                default_subunit_dict)
                            new_subunit_dict['triggering'] = cpxrole[
                                'triggering']
                            new_subunit_dict['optionalSubunit'] = cpxrole[
                                'optional_role']
                            new_subunit_dict[
                                'note'] = "Complex-based-gapfilling"
                            subunits_list.append(new_subunit_dict)

                if (len(subunits_list) > 0):
                    new_protein_dict[
                        'modelReactionProteinSubunits'] = subunits_list

                    #Store features and subunits as complex string for table
                    subs_str_list = list()
                    for subunit in subunits_list:
                        ftrs_str_list = list()
                        for ftr_ref in subunit['feature_refs']:
                            ftr = ftr_ref.split('/')[-1]
                            ftrs_str_list.append(ftr)
                        ftr_str = "(" + ", ".join(ftrs_str_list) + ")"
                        subs_str_list.append(ftr_str)
                    sub_str = "[" + ", ".join(subs_str_list) + "]"
                    prots_str_list.append(sub_str)

                proteins_list.append(new_protein_dict)

            prot_str = ", ".join(prots_str_list)

            #This is important, we need to use role-based annotation to determine whether
            #a reaction should even be added to the model
            if (template_rxn['type'] == 'conditional'
                    and len(proteins_list) == 0):
                continue

            #If the check passes, then, here, we instantiate the actual reaction that goes into the model
            new_mdlrxn_id = template_rxn['id'] + '0'
            new_mdlcpt_id = template_rxn_cpt + '0'
            base_rxn_id = template_rxn['id'].split('_')[0]
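            # The split above assumes template reaction ids of the form
            # '<ModelSEED reaction id>_<compartment>' (e.g. 'rxn00148_c'); the base id
            # keys the ModelSEED biochemistry lookups below.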

            #For table
            rxncplxs_dict[new_mdlrxn_id] = prot_str

            new_mdlrxn_dict = copy.deepcopy(default_mdlrxn_dict)
            new_mdlrxn_dict['id'] = new_mdlrxn_id

            new_mdlrxn_dict['name'] = MSD_reactions_dict[base_rxn_id][
                'abbreviation']
            if (MSD_reactions_dict[base_rxn_id]['abbreviation'] == ""):
                new_mdlrxn_dict['name'] = base_rxn_id

            new_mdlrxn_dict['direction'] = template_rxn['direction']
            new_mdlrxn_dict[
                'reaction_ref'] = '~/template/reactions/id/' + template_rxn[
                    'id']
            new_mdlrxn_dict[
                'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id

            #Here we check and instantiate a new modelcompartment
            if (new_mdlcpt_id not in mdlcpts_dict):
                new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                new_mdlcpt_dict['id'] = new_mdlcpt_id
                new_mdlcpt_dict['label'] = cpt_name_dict[template_rxn_cpt]
                new_mdlcpt_dict[
                    'compartment_ref'] = '~/template/compartments/id/' + template_rxn_cpt
                mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

            #Add Proteins as previously determined
            new_mdlrxn_dict['modelReactionProteins'] = proteins_list

            #Add Reagents
            for template_rgt in template_rxn['templateReactionReagents']:
                template_rgt_cpd_cpt_id = template_rgt[
                    'templatecompcompound_ref'].split('/')[-1]
                (template_rgt_cpd,
                 template_rgt_cpt) = template_rgt_cpd_cpt_id.split('_')

                #Check and add new model compartment
                new_mdlcpt_id = template_rgt_cpt + '0'
                if (new_mdlcpt_id not in mdlcpts_dict):
                    new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                    new_mdlcpt_dict['id'] = new_mdlcpt_id
                    new_mdlcpt_dict['label'] = cpt_name_dict[template_rgt_cpt]
                    new_mdlcpt_dict[
                        'compartment_ref'] = '~/template/compartments/id/' + template_rgt_cpt
                    mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

                #Add new model compounds
                new_mdlcpd_id = template_rgt_cpd_cpt_id + '0'
                base_cpd_id = template_rgt_cpd_cpt_id.split('_')[0]

                if (new_mdlcpd_id not in mdlcpds_dict):
                    new_mdlcpd_dict = copy.deepcopy(default_mdlcpd_dict)
                    new_mdlcpd_dict['id'] = new_mdlcpd_id
                    new_mdlcpd_dict['name'] = MSD_compounds_dict[base_cpd_id][
                        'name']

                    new_mdlcpd_dict['charge'] = float(
                        MSD_compounds_dict[base_cpd_id]['charge'])
                    new_mdlcpd_dict['formula'] = MSD_compounds_dict[
                        base_cpd_id]['formula']
                    if (MSD_compounds_dict[base_cpd_id]['formula'] == ""
                            or MSD_compounds_dict[base_cpd_id]['formula'] is None):
                        print("Formula: ", base_cpd_id,
                              MSD_compounds_dict[base_cpd_id])
                        new_mdlcpd_dict['formula'] = ""

                    new_mdlcpd_dict[
                        'compound_ref'] = '~/template/compounds/id/' + template_rgt_cpd
                    new_mdlcpd_dict[
                        'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id
                    mdlcpds_dict[new_mdlcpd_id] = new_mdlcpd_dict

                new_rgt_dict = {
                    'coefficient': template_rgt['coefficient'],
                    'modelcompound_ref': '~/modelcompounds/id/' + new_mdlcpd_id
                }

                new_mdlrxn_dict['modelReactionReagents'].append(new_rgt_dict)

            new_model_obj['modelreactions'].append(new_mdlrxn_dict)

        #Having populated with list of reactions and biomass (to come), then add all compartments and compounds
        for cpt_id in mdlcpts_dict:
            new_model_obj['modelcompartments'].append(mdlcpts_dict[cpt_id])

        #Last, but key modelcompound is the biomass, need to add it explicitly
        biocpd_id = "cpd11416"
        mdlbiocpd_dict = copy.deepcopy(default_mdlcpd_dict)
        mdlbiocpd_dict['id'] = biocpd_id + '_c0'
        mdlbiocpd_dict['name'] = 'Biomass'
        mdlbiocpd_dict['compound_ref'] = "~/template/compounds/id/" + biocpd_id
        mdlbiocpd_dict['modelcompartment_ref'] = "~/modelcompartments/id/c0"
        mdlcpds_dict[mdlbiocpd_dict['id']] = mdlbiocpd_dict

        for cpd_id in mdlcpds_dict:
            new_model_obj['modelcompounds'].append(mdlcpds_dict[cpd_id])

        default_biomass_dict = {
            'id': 'bio1',
            'name': 'Plant leaf biomass',
            'other': 1,
            'dna': 0,
            'rna': 0,
            'protein': 0,
            'cellwall': 0,
            'lipid': 0,
            'cofactor': 0,
            'energy': 0,
            'biomasscompounds': []
        }

        default_biocpd_dict = {'modelcompound_ref': '', 'coefficient': 0}

        for template_biomass in template_obj['data']['biomasses']:
            new_template_biomass = copy.deepcopy(default_biomass_dict)
            new_template_biomass['id'] = template_biomass['id']
            new_template_biomass['name'] = template_biomass['name']

            for entry in [
                    'dna', 'rna', 'protein', 'cellwall', 'lipid', 'cofactor',
                    'energy', 'other'
            ]:
                new_template_biomass[entry] = template_biomass[entry]

            for template_cpd in template_biomass['templateBiomassComponents']:
                new_biocpd_dict = copy.deepcopy(default_biocpd_dict)
                mdlcpd_id = template_cpd['templatecompcompound_ref'].split(
                    '/')[-1] + '0'
                if (mdlcpd_id not in mdlcpds_dict):
                    print("Missing: ", template_cpd)
                    continue
                new_biocpd_dict[
                    'modelcompound_ref'] = '~/modelcompounds/id/' + mdlcpd_id
                new_biocpd_dict['coefficient'] = template_cpd['coefficient']
                new_template_biomass['biomasscompounds'].append(
                    new_biocpd_dict)

            new_model_obj['biomasses'].append(new_template_biomass)

        print("Saving metabolic reconstruction")
        model_ws_object = {
            'type': 'KBaseFBA.FBAModel',
            'name': input_params['output_fbamodel'],
            'data': new_model_obj
        }

        if ('output_ws' not in input_params
                or input_params['output_ws'] == ''):
            input_params['output_ws'] = input_params['input_ws']

        ws_id = self.dfu.ws_name_to_id(input_params['output_ws'])
        saved_model_list = self.dfu.save_objects({
            'id': ws_id,
            'objects': [model_ws_object]
        })[0]

        #Compose report string
        html_string = "<html><head><title>Reconstruct Plant Metabolism Report</title></head><body>"
        html_string += "<h2>Reconstruct Plant Metabolism Report</h2>"
        html_string += "<p>The \"Reconstruct Plant Metabolism\" app has finished running, "
        html_string += "reconstructing the primary metabolism from the "
        html_string += "enzymatic annotations in " + input_params[
            'input_genome'] + "</p>"
        html_string += "<p>Below we present the table of compartmentalized reactions in the metabolic reconstruction, "
        html_string += "it is similar to what you can see in the FBAModel viewer widget that appears "
        html_string += "below the report, but it has some additional information. Each row in the table is unique "
        html_string += "to each combination of reaction and compartment.</p>"
        html_string += "<p><ul>"
        html_string += "<li><b>Subsystems and Classes:</b> The table contains the metabolic subsystems and "
        html_string += "the general class of metabolism they fall into.</li>"
        html_string += "<li><b>Metabolic functions and EC numbers:</b> The table contains the original enzymatic "
        html_string += "annotation ('Roles') and their EC numbers that were associated with each biochemical reaction.</li>"
        html_string += "<li><b>Complexes:</b> The table contains the genes that were annotated with the metabolic functions. "
        html_string += "These genes that are associated with each reaction can be seen in the FBAModel viewer widget, but here "
        html_string += " one can see how they may be organized into protein complexes. Each set of parentheses '()' "
        html_string += "represents a single protein subunit (which may be the entire enzyme, or part of a large enzymatic "
        html_string += "complex). Each set of square brackets '[]' represents an entire enzyme, regardless of how many "
        html_string += "subunits it consists of. Each reaction may be catalyzed by different enzymes, each in turn composed "
        html_string += "of different subunits. The complexes reflect how the enzymes were curated in <i>Arabidopsis thaliana</i> "
        html_string += " so if any complex is shown to be empty, this means that the enzymatic annotation was not propagated "
        html_string += "from the original Arabidopsis gene. The original Arabidopsis curation also included protein localization "
        html_string += "so if a reaction has empty complexes in some compartments as opposed to others, this is an indication "
        html_string += "that annotation was only propagated for some localized Arabidopsis enzymes, and not others."
        html_string += "</ul></p>"

        # Fetch PlantSEED Data
        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        reactions_data = plantseed.fetch_reactions(PS_Roles)

        table = GenerateTableImpl()
        table_html_string = table.generate_table(reactions_data,
                                                 complexes=rxncplxs_dict)

        with open(
                os.path.join(
                    '/kb/module/data', 'app_report_templates',
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and insert html Title
        report_template_string = report_template_string.replace(
            '*TITLE*', input_params['output_fbamodel'])

        # Insert html table
        table_report_string = report_template_string.replace(
            '*TABLES*', html_string + table_html_string)

        #Make folder for report files
        uuid_string = str(uuid.uuid4())
        report_file_path = os.path.join(self.shared_folder, uuid_string)
        os.mkdir(report_file_path)

        #Write html files
        with open(os.path.join(report_file_path, "index.html"),
                  'w') as index_file:
            index_file.write(table_report_string)

        #Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': report_file_path,
            'pack': 'zip'
        })

        #Prepare report parameters
        report_params = {
            'direct_html_link_index':
            0,  #Use to refer to index of 'html_links'
            'workspace_name': input_params['input_ws'],
            'report_object_name': 'plant_fba_' + uuid_string,
            'objects_created': [],
            'html_links': []
        }

        #Html Link object
        html_link = {
            'shock_id': upload_info['shock_id'],
            'name': 'index.html',
            'label': 'html files',
            'description': 'HTML files'
        }
        report_params['html_links'].append(html_link)

        #Objects created object
        saved_model_ref = "{}/{}/{}".format(saved_model_list[6],
                                            saved_model_list[0],
                                            saved_model_list[4])
        saved_model_desc = "FBAModel: " + input_params['output_fbamodel']
        report_params['objects_created'].append({
            'ref': saved_model_ref,
            'description': saved_model_desc
        })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(
            report_params)

        output_report = dict()
        output_report['report_name'] = report_client_output['name']
        output_report['report_ref'] = report_client_output['ref']

        #END reconstruct_plant_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method reconstruct_plant_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #19
    def run_FamaGenomeProfiling(self, ctx, params):
        """
        Run genome functional profiling module of Fama.
        :param params: instance of type "FamaGenomeProfilingParams"
           (Parameters for genome functional profiling. workspace_name - the
           name of the workspace for input/output genome_refs - references to
           a genome object ref_dataset - the name of Fama reference dataset
           output_result_name - the name of the output DomainAnnotation) ->
           structure: parameter "workspace_name" of String, parameter
           "genome_ref" of list of String, parameter "ref_dataset" of String,
           parameter "output_feature_set_name" of String, parameter
           "output_annotation_name" of String
        :returns: instance of type "ReportResults" (Output report parameters
           report_name - the name of the report object report_ref - the
           reference to the report object) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FamaGenomeProfiling
        # Import protein sequences from input genome_ref
        ws_client = Workspace(self.ws_url)
        input_genome_refs = params['genome_ref']
        fama_reference = params['ref_dataset']
        input_proteins = {}
        name2ref = {}
        for input_genome_ref in input_genome_refs:
            ret = ws_client.get_objects2(
                {'objects': [{
                    'ref': input_genome_ref
                }]})['data'][0]
            obj_data = ret['data']
            obj_name = ret['info'][1]
            obj_type = ret['info'][2].split('.')[1].split('-')[0]
            if obj_type == 'GenomeSet':
                print('GenomeSet data', obj_data)
                genome_refs = []
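                # KBaseSearch.GenomeSet objects keep member refs in an 'elements' mapping,
                # while KBaseSets.GenomeSet objects use an 'items' list; handle both forms.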
                if 'elements' in obj_data:
                    genome_refs = [
                        item['ref'] for item in obj_data['elements'].values()
                    ]
                elif 'items' in obj_data:
                    genome_refs = [item['ref'] for item in obj_data['items']]
                for sub_obj_ref in genome_refs:
                    ret = ws_client.get_objects2(
                        {'objects': [{
                            'ref': sub_obj_ref
                        }]})['data'][0]
                    genome_data = ret['data']
                    genome_name = ret['info'][1]
                    if genome_name in name2ref:
                        raise ServerError(
                            'All input genome names must be unique. Check ' +
                            genome_name)
                    name2ref[genome_name] = sub_obj_ref
                    proteins = genome_proteins_to_fasta(
                        genome_data, self.shared_folder)
                    input_proteins[genome_name] = {}
                    input_proteins[genome_name]['fwd'] = proteins
            elif obj_type == 'Genome':
                if obj_name in name2ref:
                    raise ServerError('All input genome names must be unique')
                name2ref[obj_name] = input_genome_ref
                proteins = genome_proteins_to_fasta(obj_data,
                                                    self.shared_folder)
                input_proteins[obj_name] = {}
                input_proteins[obj_name]['fwd'] = proteins
            else:
                raise ServerError('Incompatible object: ' + input_genome_ref +
                                  ' (' + obj_name + ')')

        self.log('Input sequence files:', str(input_proteins))
        self.log('reference: ', fama_reference)
        # Run Fama
        fama_params = {
            'input_proteins': input_proteins,
            'work_dir': self.shared_folder,
            'reference': fama_reference,
            'ws_name': params['workspace_name'],
            'ws_client': ws_client,
            'featureset_name': params['output_feature_set_name'],
            'annotation_prefix': params['output_annotation_name'],
            'name2ref': name2ref
        }
        fama_output = protein_functional_profiling_pipeline(fama_params)
        objects_created = fama_output['objects_created']

        dfu = DataFileUtil(self.callback_url)
        workspace_id = dfu.ws_name_to_id(params['workspace_name'])

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': fama_output['feature_set_data'],
                'name': params['output_feature_set_name']
            }]
        }

        try:
            dfu_oi = dfu.save_objects(save_object_params)[0]
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception saving feature set')
            self.log(str(dfue))
            raise
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])
        objects_created.append({
            'ref': feature_set_obj_ref,
            'description': 'Filtered genome features'
        })

        self.log('FeatureSet saved to ' + feature_set_obj_ref)

        # Write HTML output to workspace
        message = 'Fama protein functional profiling finished successfully'

        try:
            dfu_output = dfu.file_to_shock(
                {'file_path': fama_output['html_report']})
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise
        self.log('HTML report saved: ' + str(dfu_output))

        html_links = [{
            'shock_id': dfu_output['shock_id'],
            'description': 'HTML report for Fama App',
            'name': 'fama_report.html',
            'label': 'Fama_report'
        }]
        for krona_file in fama_output['krona_charts']:
            try:
                dfu_output = dfu.file_to_shock({'file_path': krona_file})
                html_links.append({
                    'shock_id':
                    dfu_output['shock_id'],
                    'description':
                    'Krona chart for function taxonomy profile',
                    'name':
                    fama_output['krona_charts'][krona_file][0],
                    'label':
                    fama_output['krona_charts'][krona_file][1]
                })
            except ServerError as dfue:
                # not really any way to test this block
                self.log('Logging exception loading results to shock')
                self.log(str(dfue))
                raise

        self.log('Krona chart saved: ' + str(dfu_output))

        # Save report
        report_params = {
            'message': message,
            'objects_created': objects_created,
            'direct_html_link_index': 0,
            'html_links': html_links,
            'file_links': fama_output['report_files'],
            'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name'],
            'html_window_height': 460
        }
        try:
            self.log('Call KBaseReport at ' + str(self.callback_url))
            report = KBaseReport(self.callback_url)
            self.log('Ready to save KBase report: ' + str(report_params))
            report_info = report.create_extended_report(report_params)
        except ServerError as kre:
            # not really any way to test this block
            self.log('Logging exception saving report')
            self.log(str(kre))
            raise

        report_info['report_params'] = report_params
        self.log('KBase report saved: ' + str(report_info))
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_FamaGenomeProfiling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FamaGenomeProfiling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
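
    # Hypothetical example of the params accepted by run_FamaGenomeProfiling above
    # (field names come from the docstring; all values are placeholders):
    # params = {
    #     'workspace_name': 'my_workspace',
    #     'genome_ref': ['12345/6/7'],           # list of Genome or GenomeSet refs
    #     'ref_dataset': 'nitrogen',             # assumed name of a Fama reference dataset
    #     'output_feature_set_name': 'fama_features',
    #     'output_annotation_name': 'fama_annotation',
    # }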
Example #20
class BiomUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_name', 'scale',
                'amplicon_set_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                biom_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                fasta_file
            }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                tsv_file
            }).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]

            mode = 'tsv'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))
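
    # Hypothetical params dict accepted by _process_params above; exactly one of the
    # file-group keys (biom_tsv, biom_fasta, tsv_fasta, tsv) is expected, and file paths
    # refer to the staging area (all values below are placeholders):
    # params = {
    #     'obj_type': 'AmpliconMatrix',        # assumed to be one of self.matrix_types
    #     'matrix_name': 'test_matrix',
    #     'workspace_name': 'my_workspace',
    #     'scale': 'raw',                      # assumed to be one of SCALE_TYPES
    #     'amplicon_set_name': 'test_amplicon_set',
    #     'biom_fasta': {
    #         'biom_file_biom_fasta': 'otu_table.biom',
    #         'fasta_file_biom_fasta': 'otus.fasta',
    #     },
    # }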

    def _retrieve_value(self,
                        biom_metadata_dict,
                        tsv_metadata_df,
                        key,
                        required=False):

        if key in biom_metadata_dict:
            return {k.lower(): v
                    for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {
                    "value": scientific_name
                }
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')

        return taxon_id

    def _fetch_taxon_level(self, taxon_char):

        taxon_level_mapping = {
            'l': 'Life',
            'd': 'Domain',
            'k': 'Kingdom',
            'p': 'Phylum',
            'c': 'Class',
            'o': 'Order',
            'f': 'Family',
            'g': 'Genus',
            's': 'Species'
        }

        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        if isinstance(lineage, str):
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]
        elif lineage is None:
            # Guard against rows with no taxonomy value so the loop below does not fail
            lineage = []

        taxonomy = {'lineage': lineage}

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val

        for item in lineage[::-1]:
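            # Lineage entries are assumed to use prefixed names such as 'g__Escherichia',
            # where the leading character gives the taxonomic level and the tail the name.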
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({
                        'taxon_ref': taxon_ref,
                        'taxon_id': taxon_id,
                        'scientific_name': scientific_name,
                        'taxon_level': taxon_level
                    })
                    break

        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):
        amplicons = dict()

        try:
            logging.info('start parsing TSV file')
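            # sep=None makes pandas sniff the delimiter with the python engine; the
            # inferred delimiter is then read from a private parser attribute and reused.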
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')

        return amplicons

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {
                'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(
                    observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {
                'consensus_sequence': df.loc[observation_id,
                                             'consensus_sequence'],
                'taxonomy': taxonomy
            }

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode,
                                   refs, description, matrix_obj_ref):

        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(
                biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(
                biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(
                tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_set_data, mode: {}'.format(
                    mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(
            matrix_obj_ref_array[0], matrix_obj_ref_array[1])

        return amplicon_set_data
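
    # Hedged sketch of the structure returned by _file_to_amplicon_set_data above
    # (values are hypothetical placeholders):
    #   {
    #       'amplicons': {'OTU_1': {'consensus_sequence': 'ACGT...', 'taxonomy': {...}}},
    #       'description': '...',               # only if a description was given
    #       'reads_set_ref': '...',             # only if present in refs
    #       'amplicon_matrix_ref': '<ws_id>/<obj_id>'   # version dropped from matrix_obj_ref
    #   }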

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide a valid TSV file')
            else:
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found non-numeric values. The matrix may only contain numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field.'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id, metadata_df))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data
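
    # Hedged summary of the amplicon_data dict assembled by _file_to_amplicon_data above
    # (an illustration, not an exhaustive spec): it carries the incoming refs, a 'data'
    # block {'row_ids': [...], 'col_ids': [...], 'values': [[...]]}, an 'attributes' dict
    # (BIOM create_date/generated_by when available), 'search_attributes' rendered as
    # 'key|value' strings, 'scale', an optional 'description', and any row/col attribute
    # mapping refs and mappings produced by get_attribute_mapping.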

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs':
                 [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data
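
    # Note on get_attribute_mapping above (illustrative): when a pre-existing
    # {axis}_attributemapping_ref is supplied, every row/col ID in the matrix must
    # already appear in that mapping's 'instances'; otherwise a new AttributeMapping
    # is built from the BIOM metadata (biom input) or from the metadata columns of
    # the TSV (tsv input), and an identity {id: id} mapping is recorded.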

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'
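
    # Hedged sketch of the KBaseExperiments.AttributeMapping payload built by the two
    # helpers above (field values are hypothetical):
    #   {
    #       'ontology_mapping_method': 'TSV file',          # or 'BIOM file'
    #       'attributes': [{'attribute': 'taxonomy', 'source': 'upload'}],
    #       'instances': {'OTU_1': ['d__Bacteria; ...']}
    #   }
    # save_objects returns the object info tuple; the ref is assembled as
    # info[6]/info[0]/info[4] (workspace id / object id / version).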

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref,
                         new_row_attr_ref, new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }, {
            'ref': amplicon_set_obj_ref,
            'description': 'Imported Amplicon Set'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref':
                new_row_attr_ref,
                'description':
                'Imported Amplicons(Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref':
                new_col_attr_ref,
                'description':
                'Imported Samples(Column) Attribute Mapping'
            })

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):
        logging.info('writing amplicon set data frame to TSV file')
        amplicon_set_obj = self.dfu.get_objects(
            {'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]
                                            })['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects(
            {'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = [
            'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
            'taxonomy_source', 'species_name', 'consensus_sequence'
        ]
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([
                taxonomy, taxon_id, taxon_ref, taxon_level, score,
                taxonomy_source, species_name, consensus_sequence
            ])

        meta_df = pd.DataFrame(meta_values,
                               index=meta_index,
                               columns=meta_columns)

        merged_df = df.merge(meta_df,
                             left_index=True,
                             right_index=True,
                             how='left',
                             validate='one_to_one')

        return merged_df
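
    # Note on _amplicon_set_to_df above (illustrative): the returned dataframe is
    # indexed by OTU id and joins the matrix columns (one per sample/col_id) with the
    # per-amplicon metadata columns taxonomy, taxon_id, taxon_ref, taxon_level, score,
    # taxonomy_source, species_name and consensus_sequence.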

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: name of the workspace the matrix object will be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode,
                                                    refs, matrix_name,
                                                    workspace_id, scale,
                                                    description, metadata_keys)

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(
            biom_file, tsv_file, fasta_file, mode, refs, description,
            matrix_obj_ref)

        logging.info(
            'start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseExperiments.AmpliconSet',
            'obj_name':
            amplicon_set_name,
            'data':
            amplicon_set_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        logging.info(
            'start resaving Matrix object with amplicon set: {}'.format(
                matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(
            workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {
            'matrix_obj_ref': matrix_obj_ref,
            'amplicon_set_obj_ref': amplicon_set_obj_ref
        }

        report_output = self._generate_report(matrix_obj_ref,
                                              amplicon_set_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_amplicon_set_tsv(self, params):
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path':
            result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
示例#21
0
class DataUtil:

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [line.strip().split()[1:] for line in type_desc.split("\n")
                                if line.startswith(f'@{tag}')]

        return constraints
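
    # Illustrative note on _find_constraints above: constraint tags are read from the
    # registered type description, so a hypothetical spec line '@unique data.row_ids'
    # would produce constraints['unique'] == [['data.row_ids']].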

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys"""
        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2({'objects': [{'ref': obj_ref,
                                                       'included': [included]}]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [x.get(keys[2]) for x in ref_data.get(keys[0])]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data
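
    # Hedged examples of the 'value' syntaxes handled by _retrieve_value above
    # (the field names are hypothetical):
    #   'set(a,b)'                   -> the literal list ['a', 'b']
    #   'values(data.col_mapping)'   -> list(data['data']['col_mapping'].values())
    #   'genome_ref:features.[*].id' -> 'id' of each feature in the referenced object
    #   'data.row_ids'               -> list(data['data']['row_ids'])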

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """

        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(" ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [key for key in required_keys if key not in data]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = ['Object {} failed type checking:'.format(params.get('obj_name'))]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append('Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append('Column attribute mapping instances should contain all '
                                     'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append('Row attribute mapping instances should contain all row '
                                     'index from original data')

                error_msg.append('Object field [{}] should contain field [{}]'.format(
                    super_value,
                    subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append('If object field "{}" is present then object field(s) {} should '
                             'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [x['type_def'] for module in GENERICS_MODULES
                     for x in self.wsClient.get_all_type_info(module)]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)
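
    # Hedged usage sketch for fetch_data above (the object reference is a hypothetical
    # placeholder):
    #   data_util = DataUtil(config)
    #   ret = data_util.fetch_data({'obj_ref': '1234/5/6'})
    #   df = pd.read_json(ret['data_matrix'])   # per the docstring, a dataframe in JSON form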

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {'validated': validated,
                'failed_constraints': failed_constraints}

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: name of the workspace the matrix object will be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting validating and saving object data')

        obj_type = params.get('obj_type').split('-')[0]

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({'mod': module_name}).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type,
                                       'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # make sure users with shared object have access to the handle file upon saving
        handle = data.get('sequencing_file_handle')
        if handle:
            output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            logging.info('Downloading consensus sequence file in {}'.format(output_directory))
            self._mkdir_p(output_directory)
            matrix_fasta_file = self.dfu.shock_to_file({
                'handle_id': handle,
                'file_path': self.scratch}).get('file_path')
            logging.info('Saving consensus sequence file to shock: {}'.format(matrix_fasta_file))
            handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file,
                                                'make_handle': True})['handle']['hid']
            data['sequencing_file_handle'] = handle_id

        # cast data
        int_data_names = ['sequencing_quality_filter_cutoff', 'read_length_cutoff']
        for data_name in int_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to int'.format(data_name))
                    data[data_name] = int(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requests {} to be an integer value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        float_data_names = ['barcode_error_rate', 'sequence_error_cutoff', 'clustering_cutoff']
        for data_name in float_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to float'.format(data_name))
                    data[data_name] = float(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requests {} to be a float value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        ws_name_id = params.get('workspace_id')
        workspace_name = params.get('workspace_name')
        if not ws_name_id:
            if not isinstance(workspace_name, int):
                ws_name_id = self.dfu.ws_name_to_id(workspace_name)
            else:
                ws_name_id = workspace_name

        try:
            logging.info('Starting saving object via DataFileUtil')
            info = self.dfu.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data": data,
                    "name": params.get('obj_name')
                }]
            })[0]
        except Exception:
            logging.info('Saving object via DataFileUtil failed')
            logging.info('Starting saving object via WsLargeDataIO')
            data_path = os.path.join(self.scratch,
                                     params.get('obj_name') + "_" + str(uuid.uuid4()) + ".json")
            with open(data_path, 'w') as json_file:
                json.dump(data, json_file)

            info = self.ws_large_data.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data_json_file": data_path,
                    "name": params.get('obj_name')
                }]
            })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
示例#22
0
class PDBUtil:

    # E-value ("Expect value") threshold used to decide which alignments are significant
    E_VALUE_THRESH = 1e-20

    # BLAST sequence identity threshold to determine which pdb structures will be
    # matched to a KBase genome/feature
    B_IDENTITY_THRESH = 0.6

    def _validate_import_pdb_file_params(self, params):
        """
            _validate_import_pdb_file_params:
                validates input params to import_model_pdb_file and import_experiment_pdb_file
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get(
            'structure_name')

    def _model_file_to_data(self, file_path, params):
        """
            _model_file_to_data:
                Do the PDB conversion--parse the model pdb file for creating a pdb data object
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.PDBParser(PERMISSIVE=1)
        pdb1 = file_path
        pp_no = 0
        data = {}

        try:
            structure = parser.get_structure("test", pdb1)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            logging.info(f'PDBParser errored with message: {e}')
            raise
        else:
            ppb = PPBuilder()
            for pp in ppb.build_peptides(structure):
                pp_no += 1

            # logging.info(f'Getting pdb structure data for {structure}!')
            (compound, source) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            model = structure[0]
            protein_data = self._get_proteins_by_structure(
                structure, model.get_id(), file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                data = {
                    'name': structure.header.get('name', ''),
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'compound': compound,
                    'source': source,
                    'proteins': protein_data
                }
            else:
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
                data = {}
        return data, pp_no, params

    def _exp_file_to_data(self, file_path, params):
        """
            _exp_file_to_data:
                Do the PDB conversion--parse the experiment pdb file for creating a pdb data object
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.MMCIFParser()
        cif = file_path
        pp_no = 0
        mmcif_data = None

        try:
            structure = parser.get_structure("PHA-L", cif)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            logging.info(f'MMCIFParser errored with message: {e}')
            raise
        else:
            ppb = PPBuilder()
            for pp in ppb.build_peptides(structure):
                pp_no += 1

            struc_name = structure.header.get('name', '')
            hd = self._upload_to_shock(file_path)

            # logging.info(f'Getting pdb structure data for {structure}!')
            (cpd, src) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            protein_data = self._get_proteins_by_structure(
                structure, model_ids[0], file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                mmcif_data = {
                    'name':
                    struc_name,
                    'head':
                    structure.header.get('head', ''),
                    'rcsb_id':
                    structure.header.get('rcsb_id', ''),
                    'deposition_date':
                    structure.header.get('deposition_date', ''),
                    'release_date':
                    structure.header.get('release_date', ''),
                    'structure_method':
                    structure.header.get('structure_method', ''),
                    'resolution':
                    structure.header.get('resolution', 0.0),
                    'structure_reference':
                    structure.header.get('structure_reference', []),
                    'keywords':
                    structure.header.get('keywords', ''),
                    'author':
                    structure.header.get('author', ''),
                    'compound':
                    cpd,
                    'source':
                    src,
                    'num_models':
                    num_models,
                    'num_chains':
                    num_chains,
                    'num_residues':
                    num_residues,
                    'num_atoms':
                    num_atoms,
                    'num_het_atoms':
                    structure.header.get('num_het_atoms', 0),
                    'num_water_atoms':
                    structure.header.get('num_water_atoms', 0),
                    'num_disordered_atoms':
                    structure.header.get('num_disordered_atoms', 0),
                    'num_disordered_residues':
                    structure.header.get('num_disordered_residues', 0),
                    'pdb_handle':
                    hd,
                    'mmcif_handle':
                    hd,
                    'xml_handle':
                    hd,
                    'proteins':
                    protein_data
                }
            else:
                mmcif_data = {}
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
        return mmcif_data, pp_no, params

    def _match_features(self, params, protein_data):
        """
            _match_features: match the protein_translation in feature_id with chain sequences in
                             protein_data and compute the seq_identity and determine the exact_match
            example (in appdev):
                    genome_obj = '57196/6/1', genome_name = 'Synthetic_bacterium_JCVI_Syn3.0_genome'
                    feature_id = 'JCVISYN3_0004_CDS_1', feature_type = 'CDS' OR
                    feature_id = 'JCVISYN3_0004', feature_type = 'gene'
        """
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            kb_feature_type = ''
            kb_feature_seq = ''
            genome_name = pdb_info['genome_name']
            narr_id = pdb_info['narrative_id']
            feature_id = pdb_info['feature_id']

            logging.info(
                f"Looking up for feature {feature_id} in genome {genome_name}'s features"
            )
            # 1. Get the genome's features and reference
            (gn_ref, kb_genome_features) = self._get_genome_ref_features(
                narr_id, genome_name)
            if not gn_ref:
                logging.info(
                    f"Given genome {genome_name} does not exist in workspace {narr_id}!"
                )
                return protein_data, params

            pdb_info['genome_ref'] = gn_ref
            # 2. Match the genome features with the specified feature_id to obtain feature sequence
            for feat in kb_genome_features:
                if feat['id'] == feature_id:
                    logging.info(
                        f'Found genome feature match for {feature_id}')
                    kb_feature_type = self._get_feature_type(feat)
                    kb_feature_seq = feat.get('protein_translation', '')
                    break

            pdb_info['feature_type'] = kb_feature_type

            # 3. Call self._compute_sequence_identity with the feature sequence and the pdb
            # proteins' translations to get the seq_identity and exact_match
            if kb_feature_seq:
                logging.info(
                    f"Finding seq_identity and exact_match for feature {feature_id}"
                    f" in genome {genome_name}'s features...")
                pdb_chain_ids = []
                pdb_model_ids = []
                pdb_seq_idens = []
                pdb_exact_matches = []
                for prot in protein_data:
                    seq_idens, seq_mats = self._compute_sequence_identity(
                        kb_feature_seq, prot.get('sequence', ''))
                    if seq_idens:
                        seq_idens.sort()
                        max_iden = seq_idens.pop()
                        if max_iden >= self.B_IDENTITY_THRESH:  # get the good matches
                            prot['seq_identity'] = max_iden
                            prot['exact_match'] = 1 if max_iden > 0.99 else 0
                            prot['genome_ref'] = gn_ref
                            prot['feature_id'] = feature_id
                            prot['feature_type'] = kb_feature_type
                            pdb_chain_ids.append(prot['chain_id'])
                            pdb_model_ids.append(str(prot['model_id']))
                            pdb_seq_idens.append(str(prot['seq_identity']))
                            pdb_exact_matches.append(str(prot['exact_match']))

                if pdb_seq_idens:
                    pdb_info['sequence_identities'] = ','.join(pdb_seq_idens)
                if pdb_chain_ids:
                    pdb_info['chain_ids'] = ','.join(pdb_chain_ids)
                if pdb_model_ids:
                    pdb_info['model_ids'] = ','.join(pdb_model_ids)
                if pdb_exact_matches:
                    pdb_info['exact_matches'] = ','.join(pdb_exact_matches)
            else:
                logging.info(
                    f'Found NO feature in genome that matches with {feature_id}'
                )
        else:
            logging.info(
                'No KBase genome/feature object info was given for uploading')

        return protein_data, params
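
    # Note on _match_features above (illustrative): for every protein chain whose best
    # identity clears B_IDENTITY_THRESH, the chain id, model id, seq_identity and
    # exact_match flag are collected and written back into params['pdb_info'] as
    # comma-joined strings ('chain_ids', 'model_ids', 'sequence_identities',
    # 'exact_matches').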

    def _compute_sequence_identity(self, seq1, seq2):
        """
            _compute_sequence_identity: Given two input sequences, do a blast identity check and
                                        then compute and return the matching percentage.
        """
        # Create two sequence files
        Seq1 = SeqRecord(Seq(seq1), id="query_seq")
        Seq2 = SeqRecord(Seq(seq2), id="subject_seq")

        blast_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(blast_dir)
        query_seq = os.path.join(blast_dir, 'seq_qry.fasta')
        subject_seq = os.path.join(blast_dir, 'seq_sbj.fasta')
        SeqIO.write(Seq1, query_seq, "fasta")
        SeqIO.write(Seq2, subject_seq, "fasta")

        # on my laptop: blastp_path = '/Users/qzhang/miniconda3/bin/blastp'
        blastp_path = 'blastp'
        output_file_path = os.path.join(blast_dir, 'blast_output.xml')

        # Build the BLASTp command
        blastp_cmd = [blastp_path]
        blastp_cmd.append('-out')
        blastp_cmd.append(output_file_path)
        blastp_cmd.append('-outfmt')
        blastp_cmd.append('5')
        blastp_cmd.append('-query')
        blastp_cmd.append(query_seq)
        blastp_cmd.append('-subject')
        blastp_cmd.append(subject_seq)

        # Run BLASTp and parse the output as XML and then parse the xml file for identity matches
        exact_matches = []
        idens = []
        try:
            p = subprocess.Popen(blastp_cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
            output, errors = p.communicate()
            if not output:
                logging.info(f'BLASTp returned: {p.returncode}')
                logging.info(f'OK> output: {output}')
            if errors:
                e = subprocess.CalledProcessError(p.returncode,
                                                  blastp_cmd,
                                                  output=output)
                raise e
        except OSError as e:
            logging.info(f'OSError > {e.errno}')
            logging.info(f'OSError > {e.strerror}')
            logging.info(f'OSError > {e.filename}')
        except subprocess.CalledProcessError as e:
            logging.info(f'CalledError > {e.returncode}')
            logging.info(f'CalledError > {e.output}')
        except:
            logging.info(f'Unexpected error > {sys.exc_info()[0]}')
        else:
            with open(output_file_path) as blast_fhd:
                blast_record = NCBIXML.read(blast_fhd)
                if blast_record:
                    logging.info(f'query: {blast_record.query[:100]}')
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            if hsp.expect < self.E_VALUE_THRESH:
                                logging.info('****Alignment****')
                                logging.info(f'sequence: {alignment.title}')
                                logging.info(f'length: {alignment.length}')
                                logging.info(f'e value: {hsp.expect}')
                                logging.info(f'hsp query: {hsp.query}')
                                logging.info(f'hsp match: {hsp.match}')
                                logging.info(f'hsp subject: {hsp.sbjct}')
                                logging.info(
                                    f'hsp identities: {hsp.identities}')
                                logging.info(f'hsp positives: {hsp.positives}')
                                iden = round(hsp.identities / hsp.positives, 6)
                                logging.info(f'identity={iden}')
                                idens.append(iden)
                                if hsp.positives == hsp.identities:
                                    exact_matches.append(alignment.title[:100])
        return idens, exact_matches
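
    # Hedged note on _compute_sequence_identity above: the subprocess call is
    # equivalent to running (file names are the temporary ones built above):
    #   blastp -out blast_output.xml -outfmt 5 -query seq_qry.fasta -subject seq_sbj.fasta
    # and the per-HSP identity recorded here is hsp.identities / hsp.positives, kept
    # only when hsp.expect < E_VALUE_THRESH.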

    def _get_genome_ref_features(self, narr_id, genome_name):
        """
            _get_genome_ref_features: Get the genome reference and features for genome_name
        """
        genome_ref = ''
        genome_features = []
        (genome_info,
         genome_data) = self._get_object_info_data(narr_id, genome_name)
        if genome_info and genome_data:
            genome_ref = '/'.join(
                [str(narr_id),
                 str(genome_info[0]),
                 str(genome_info[4])])
            genome_features = genome_data['features']

        return (genome_ref, genome_features)

    def _get_feature_type(self, feature_obj):
        """
            _get_feature_type: Get the type for the feature object of given feature_obj
        """
        feat_type = feature_obj.get('type', '')
        if not feat_type:
            if feature_obj.get('protein_translation'):
                feat_type = 'gene'
            else:
                feat_type = 'other'

        return feat_type

    def _get_object_info_data(self, narr_id, obj_name):
        """
            _get_object_info_data: Get the object info/data with given obj_name in narrative narr_id
        """
        obj_info = None
        obj_data = None
        if narr_id and obj_name:
            try:
                obj_data_res = self.ws_client.get_objects2(
                    {'objects': [{
                        'wsid': narr_id,
                        'name': obj_name
                    }]})['data'][0]
                obj_info = obj_data_res['info']
                obj_data = obj_data_res['data']
            except:
                logging.info(
                    f'No object with name {obj_name} exists in workspace {narr_id}'
                )
                logging.info(
                    f'Unexpected error occurred while getting object for {obj_name}'
                )
                pass

        return (obj_info, obj_data)

    def _get_atoms_from_structure(self, pdb_structure):
        """
            _get_atoms_from_structure: Given a pdb_structure object, parse atoms into a list of
                                        atoms and return it
        """
        atom_ids = []
        num_atoms = 0
        my_residues = pdb_structure.get_residues()
        for r_ele in my_residues:
            for a_ele in r_ele.get_atoms():
                num_atoms += 1
                atom_ids.append(a_ele.get_id())

        return (num_atoms, atom_ids)

    def _get_residues_from_structure(self, pdb_structure):
        """
            _get_residues_from_structure: Given a pdb_structure object, parse residues into a list
                                          and return it
        """
        res_ids = []
        num_res = 0
        my_res = pdb_structure.get_residues()
        for r_ele in my_res:
            if PDB.is_aa(r_ele):
                num_res += 1
                res_ids.append(r_ele.get_id())

        return (num_res, res_ids)

    def _get_chains_from_structure(self, pdb_structure):
        """
            _get_chains: Given a pdb_structure object, parse chain ids into a list and return it
        """
        chain_ids = []
        num_chains = 0
        my_chains = pdb_structure.get_chains()
        for c_ele in my_chains:
            if (c_ele):
                num_chains += 1
                chain_ids.append(c_ele.get_id())

        return (num_chains, chain_ids)

    def _get_models_from_structure(self, pdb_structure):
        """
            _get_models_from_structure: Given a pdb_structure object, parse model ids into a list
                                        and return it
        """
        model_ids = []
        num_models = 0
        my_models = pdb_structure.get_models()
        for m_ele in my_models:
            if (m_ele):
                num_models += 1
                model_ids.append(m_ele.get_id())

        return (num_models, model_ids)
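
    # Illustrative helper (not part of the original module): it assumes Biopython is
    # imported as PDB (as the use of PDB.is_aa above implies) and that pdb_file_path
    # points to a readable PDB file; it simply drives the four extractors above.
    def _example_structure_summary(self, pdb_file_path):
        """
            _example_structure_summary: hypothetical sketch that parses a PDB file and
                                        summarizes it with the helper methods above
        """
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure('example', pdb_file_path)

        num_models, model_ids = self._get_models_from_structure(structure)
        num_chains, chain_ids = self._get_chains_from_structure(structure)
        num_res, res_ids = self._get_residues_from_structure(structure)
        num_atoms, atom_ids = self._get_atoms_from_structure(structure)

        logging.info(f'{num_models} models, {num_chains} chains, '
                     f'{num_res} residues, {num_atoms} atoms')
        return (num_models, num_chains, num_res, num_atoms)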

    def _get_compound_source(self, structure):
        """
            _get_compound_source: Parse data from given structure for compound and source
        """
        cpd_dict = dict()
        cpd = structure.header.get('compound', {})
        # logging.info(f'Compound:\n {cpd}')
        if cpd and cpd.get('1'):
            cpd_dict = cpd.get('1')

        src_dict = dict()
        src = structure.header.get('source', {})
        # logging.info(f'Source:\n {src}')
        if src and src.get('1'):
            src_dict = src.get('1')

        return (cpd_dict, src_dict)

    def _get_proteins_by_structure(self, pdb_structure, model, file_path):
        """
            _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data
        """
        ppb = PPBuilder()
        protein_data = []

        # Parse for the chain_id and chain sequence
        for c_ele in pdb_structure.get_chains():
            if (c_ele):
                c_ppd_list = []
                for c_ppd in ppb.build_peptides(c_ele):
                    c_pp_seq = str(c_ppd.get_sequence())
                    c_ppd_list.append(c_pp_seq)
                c_seq = ''.join(c_ppd_list)
                protein_data.append({
                    'id':
                    os.path.basename(file_path),
                    'model_id':
                    model,
                    'chain_id':
                    c_ele.get_id(),
                    'sequence':
                    c_seq,
                    'md5':
                    hashlib.md5(c_seq.encode()).hexdigest()
                })

        return protein_data

    def _validate_file(self, file_path):
        """
            _validate_file: Check if file_path is accessable, if yes, return the handle
        """
        try:
            fh = open(file_path, 'r')
        except IOError as e:
            if e.errno == errno.ENOENT:  # No such file or directory
                raise ValueError(f'"{file_path}" does not exist!')
            elif e.errno == errno.EACCES:  # Permission denied
                raise ValueError(f'"{file_path}" cannot be read!')
            else:
                raise ValueError(f'"{e.strerror}" error occurred')
        else:
            fh.close()
            return True

    def _dfu_get_objects(self, obj_ref):
        """
            _dfu_get_objects: call dfu.get_objects to return object data and info
        """
        obj = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]
        return obj['data'], obj['info']

    def _get_pdb_shock_id(self, obj_ref):
        """
            _get_pdb_shock_id: Return the shock id for the PDB file
        """
        obj_data, obj_info = self._dfu_get_objects(obj_ref)
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
            _upload_to_shock: upload target file to shock using DataFileUtil and
                              return the id of the new handle ('hid')
        """
        logging.info(f'Start uploading file to shock: {file_path}')

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        # file_to_shock returns a handle record; keep its handle id, which
        # _get_pdb_shock_id later resolves back to a shock node id
        handle_id = self.dfu.file_to_shock(
            file_to_shock_params)['handle']['hid']

        return handle_id

    def _generate_report_html(self, pdb_name, pdb_path):
        """
            _generate_report_html: generates the HTML for the upload report
        """
        html_report = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory, 'viewer.html')
        new_pdb_path = os.path.join(output_directory,
                                    os.path.basename(pdb_path))
        shutil.copy(pdb_path, new_pdb_path)

        # Fill in template HTML
        with open(
                os.path.join(os.path.dirname(__file__), 'templates',
                             'viewer_template.html')) as report_template_file:
            report_template = report_template_file.read()\
                .replace('*PDB_NAME*', pdb_name)\
                .replace('*PDB_PATH*', os.path.basename(pdb_path))

        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        html_report.append({
            'path': output_directory,
            'name': os.path.basename(result_file_path),
            'description': 'HTML report for PDB upload'
        })

        return html_report

    def _generate_report(self, method_name, pdb_obj_ref, workspace_name,
                         n_poly_pep, pdb_name, pdb_path):
        """
            _generate_report: generate summary report for upload
        """
        output_html_files = self._generate_report_html(pdb_name, pdb_path)

        report_params = {
            'message':
            f'You uploaded a PDB file. {n_poly_pep} polypeptides detected.',
            'html_links':
            output_html_files,
            'direct_html_link_index':
            0,
            'objects_created': [{
                'ref': pdb_obj_ref,
                'description': 'Imported PDB'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            method_name + '_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _validate_batch_import_pdbs_params(self, params):
        """
            _validate_batch_import_pdbs_params:
                validates params passed to batch_import_pdbs method
        """
        # check for required parameters
        for p in [
                'structures_name', 'workspace_name',
                'metadata_staging_file_path'
        ]:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

        # metadata_staging_file_path must be a path in the user's staging area, i.e., carry the staging dir prefix
        if params.get('metadata_staging_file_path', None):
            staging_file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('metadata_staging_file_path')
            }).get('copy_file_path')
            return (staging_file_path, params['workspace_name'],
                    params['structures_name'])
        else:
            error_msg = "Must supply a 'metadata_staging_file_path'"
            raise ValueError(error_msg)

    def _read_file_by_type(self, file_path):
        """
            _read_file_by_type: read the file given by file_path depending on its type,
                               return a DataFrame object
        """
        logging.info(f'Reading input from file: {file_path}...')

        if not self._validate_file(file_path):
            raise ValueError('Input file is invalid or not found!')

        df = None
        file_ext = pathlib.Path(file_path).suffix
        try:  # read the data from file_path depending on its extension
            if 'csv' in file_ext:
                df = pd.read_csv(file_path)
            elif 'tsv' in file_ext:
                df = pd.read_csv(file_path, sep='\t')
            elif 'xls' in file_ext or 'od' in file_ext:
                # handle xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
                df = pd.read_excel(file_path,
                                   index_col=None,
                                   engine='openpyxl')
            else:  # invalid file type
                error_msg = "Invalid input file type, only 'csv/tsv/xlsx' are accepted!"
                raise ValueError(error_msg)
            # strip off the leading and trailing whitespaces of the column names
            df.columns = df.columns.str.strip()
        except (RuntimeError, TypeError, KeyError, ValueError,
                WorkspaceError) as e:
            logging.info(
                f'Reading file {file_path} errored with message: {e}'
            )
            raise
        return df

    def _parse_metadata_file(self, metadata_file_path, ws_id):
        """
            _parse_metadata_file:
                From metadata_file_path, a spreadsheet file, sort out the model_pdb_file_paths,
            exp_pdb_file_paths and the kbase_meta_data

            return: lists model_pdb_file_paths, exp_pdb_file_paths and dict kbase_meta_data
        """
        logging.info(
            f'parsing metadata from input file {metadata_file_path}...')

        required_columns = [
            'Narrative ID', 'Object name (Genome AMA feature set)',
            'Feature ID', 'PDB filename', 'Is model', 'From RCSB'
        ]

        pdb_file_paths = list()
        narrative_ids = list()
        genome_names = list()
        feature_ids = list()

        # df_meta_data is a Panda DataFrame object
        df_meta_data = self._read_file_by_type(metadata_file_path)
        df_col_list = df_meta_data.columns.values.tolist()

        # check if required columns are read in correctly
        for col in required_columns:
            if col not in df_col_list:
                missing_required = f"Required column '{col}' is missing!"
                raise ValueError(missing_required)

        df_indexes = df_meta_data.columns
        for i in range(len(df_meta_data[df_indexes[0]])):
            narr_id = df_meta_data[df_indexes[0]][i]
            if not pd.isna(narr_id):
                narr_id = int(narr_id)
                narrative_ids.append(narr_id)
            else:
                missing_narr_id = "Please fill all the rows in column 'Narrative ID'!"
                raise ValueError(missing_narr_id)

            obj_name = df_meta_data[df_indexes[1]][i]
            if not pd.isna(obj_name):
                genome_names.append(obj_name)
            else:
                missing_obj_name = "Please fill all the rows in column 'Object name'!"
                raise ValueError(missing_obj_name)

            feat_id = df_meta_data[df_indexes[2]][i]
            if not pd.isna(feat_id):
                feature_ids.append(feat_id)
            else:
                missing_feature_id = f"Please fill all the rows in column '{required_columns[2]}'!"
                raise ValueError(missing_feature_id)

            pdb_fn = df_meta_data[df_indexes[3]][
                i]  # pdb_fn does not have staging dir prefix
            if pd.isna(pdb_fn):
                missing_pdb_file = f"Please fill all the rows in column '{required_columns[3]}'!"
                raise ValueError(missing_pdb_file)
            (struct_name, ext) = os.path.splitext(os.path.basename(pdb_fn))

            from_rcsb = df_meta_data[df_indexes[5]][
                i]  # pdb file source, default to 'yes'
            if pd.isna(from_rcsb):
                from_rcsb = 'yes'

            is_model = df_meta_data[df_indexes[4]][i]
            if not pd.isna(is_model):
                pdb_file_paths.append({
                    'file_path':
                    pdb_fn,
                    'structure_name':
                    struct_name,
                    'narrative_id':
                    narr_id,
                    'genome_name':
                    obj_name,
                    'feature_id':
                    feat_id,
                    'is_model':
                    'y' in is_model or 'Y' in is_model,
                    'from_rcsb':
                    'y' in from_rcsb or 'Y' in from_rcsb
                })
            else:
                missing_pdb_md = f"Please fill all the rows in column '{required_columns[4]}'!"
                raise ValueError(missing_pdb_md)

        if not pdb_file_paths:
            error_msg = "No PDB file info is provided!"
            raise ValueError(error_msg)

        return (pdb_file_paths, narrative_ids, genome_names, feature_ids)
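
    # Illustrative helper (not part of the original module): a one-row pandas DataFrame
    # in the layout _parse_metadata_file expects; every value below is hypothetical.
    def _example_metadata_dataframe(self):
        """
            _example_metadata_dataframe: hypothetical sketch of the metadata spreadsheet
                                         layout consumed by _parse_metadata_file
        """
        return pd.DataFrame([{
            'Narrative ID': 12345,
            'Object name (Genome AMA feature set)': 'Example_Genome',
            'Feature ID': 'example_feature_1',
            'PDB filename': 'example_structure.pdb',
            'Is model': 'yes',
            'From RCSB': 'no'
        }])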

    def _generate_batch_report(self, workspace_name, structs_ref, structs_name,
                               pdb_infos, failed_pdbs):
        """
            _generate_batch_report: generate summary report for upload
        """

        output_html_files = self._generate_batch_report_html(
            structs_name, pdb_infos)

        description = (
            f'Imported PDBs into a ProteinStructures object "{structs_ref}", '
            f'named "{structs_name}".')

        if failed_pdbs:
            failed_files = ','.join(failed_pdbs)
            description += f' These files "{failed_files}" failed to load.'

        report_params = {
            'message':
            f'You have uploaded a batch of PDB files into {structs_name}.',
            'html_links':
            output_html_files,
            'direct_html_link_index':
            0,
            'objects_created': [{
                'ref': structs_ref,
                'description': description
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'batch_import_pdb_files_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _write_pdb_htmls(self, output_dir, succ_pdb_infos):
        """
            _write_pdb_htmls: write the batch pdb info as a jQuery DataTable into HTML files
        """

        pdb_html = ''
        srv_domain = urlparse(
            self.shock_url).netloc  # parse url to get the domain portion
        srv_base_url = f'https://{srv_domain}'
        logging.info(f'Get the url for building the anchors: {srv_base_url}')

        dir_name = os.path.dirname(__file__)
        molstar_html_file = os.path.join(dir_name, 'templates',
                                         'molstar_viewer.html')
        molstar_js_file = os.path.join(dir_name, 'templates', 'molstar.js')
        molstar_css_file = os.path.join(dir_name, 'templates', 'molstar.css')
        shutil.copy(molstar_html_file,
                    os.path.join(output_dir, 'molstar_viewer.html'))
        shutil.copy(molstar_js_file, os.path.join(output_dir, 'molstar.js'))
        shutil.copy(molstar_css_file, os.path.join(output_dir, 'molstar.css'))

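        # Build one HTML table row per successfully imported PDB structure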
        for succ_pdb in succ_pdb_infos:
            row_html = '<tr>'
            file_path = succ_pdb['file_path']
            pdb_file_path = succ_pdb[
                'scratch_path']  # This is the scratch path for this pdb file
            new_pdb_path = os.path.join(output_dir,
                                        os.path.basename(file_path))
            shutil.copy(pdb_file_path, new_pdb_path)

            struct_nm = succ_pdb['structure_name'].upper()
            genome_name = succ_pdb['genome_name']
            genome_ref = succ_pdb['genome_ref']
            feat_id = succ_pdb['feature_id']
            feat_type = succ_pdb['feature_type']
            src_rcsb = succ_pdb['from_rcsb']

            pdb_chains = []
            pdb_models = []
            seq_idens = []
            if succ_pdb.get('chain_ids', None):
                pdb_chains = succ_pdb['chain_ids'].split()
            if succ_pdb.get('model_ids', None):
                pdb_models = succ_pdb['model_ids'].split()
            if succ_pdb.get('sequence_identities', None):
                seq_idens = succ_pdb['sequence_identities'].split()

            if src_rcsb:
                row_html += (
                    f'<td>{struct_nm}<a href="https://www.rcsb.org/3d-view/{struct_nm}"'
                    f' target="_blank"> RCSB Structure</a></td>')
            else:
                row_html += (f'<td>{struct_nm}<a href="./molstar_viewer.html"'
                             f' target="_blank"> MolStar Viewer</a></td>')

            row_html += (f'<td><a href="{srv_base_url}/#dataview/{genome_ref}"'
                         f' target="_blank">{genome_name}</a></td>'
                         f'<td>{feat_id}</td><td>{feat_type}</td>')
            row_html += f'<td>{pdb_models}</td>'
            row_html += f'<td>{pdb_chains}</td>'
            row_html += f'<td>{seq_idens}</td>'
            row_html += '</tr>'
            pdb_html += row_html
        return pdb_html

    def _generate_batch_report_html(self, prot_structs_name, succ_pdb_infos):
        """
            _generate_batch_report_html: generates the HTML for the upload report
        """
        html_report = list()

        # Make report directory and copy over uploaded pdb files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)

        # Create the template html file for reporting batch-uploaded pdb files
        batch_html_report_path = os.path.join(output_directory,
                                              'batch_pdb_viewer.html')

        pdb_html = self._write_pdb_htmls(output_directory, succ_pdb_infos)

        # Fetch & fill in detailed info into template HTML
        with open(
                os.path.join(
                    os.path.dirname(__file__), 'templates',
                    'batch_pdb_template.html')) as batch_template_html:
            batch_html_report = batch_template_html.read()\
                .replace('<!--replace this content-->', pdb_html)

        with open(batch_html_report_path, 'w') as html_report_file:
            html_report_file.write(batch_html_report)
        logging.info(
            f'Full batch_html_report has been written to {batch_html_report_path}'
        )

        html_report.append({
            'path': output_directory,
            'name': os.path.basename(batch_html_report_path),
            'description': 'HTML report for PDB upload'
        })

        return html_report

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.user_id = config['USER_ID']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])
        self.ws_client = Workspace(config['workspace-url'])
        self.shock_url = config['shock-url']

    def import_model_pdb_file(self, params, create_report=True):
        """
            import_model_pdb_file: upload an experiment pdb file and convert into a
                                  KBaseStructure.ModelProteinStructure object
        """
        logging.info(
            f'import_model_pdb_file to a pdb data structure with params: {params}'
        )

        # file_path is the pdb file's working area path (after dfu.download_staging_file call)
        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(
            params)

        (data, n_polypeptides,
         params) = self._model_file_to_data(file_path, params)
        if not data:
            logging.info(
                f'PDB file {file_path} import with "Import ModelProteinStructure" failed!'
            )
            return {}, {}

        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            pdb_info['scratch_path'] = file_path
        logging.info(f'Model structure data:{data}')
        return data, pdb_info

    def import_experiment_pdb_file(self, params, create_report=True):
        """
            import_experiment_pdb_file: upload an experiment pdb file and convert into a
                                       KBaseStructure.ExperimentalProteinStructure object
        """
        logging.info(
            f'import_experiment_pdb_file to a pdb structure with params: {params}'
        )

        # file_path is the pdb file's working area path (after dfu.download_staging_file call)
        file_path, workspace_name, mmcif_name = self._validate_import_pdb_file_params(
            params)

        # Parse the experimental pdb file for an experimental data structure
        (data, n_polypeptides,
         params) = self._exp_file_to_data(file_path, params)
        if not data:
            logging.info(
                f'Import {file_path} with "Import ExperimentalProteinStructure" failed!'
            )
            return {}, {}

        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            pdb_info['scratch_path'] = file_path
        logging.info(data)
        return data, pdb_info

    def _export_pdb(self, params):
        """
            _export_pdb: return the shock_id of the uploaded pdb object
        """
        if "input_ref" not in params:
            raise ValueError("'input_ref' not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def _structure_to_pdb_file(self, params):
        """
            _structure_to_pdb_file: get the file path for the given pdb object
        """
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id':
            shock_id,
            'file_path':
            params['destination_dir'],
            'unpack':
            'uncompress'
        })['file_path']

        return {'file_path': file_path}

    def export_pdb_structures(self, params):
        """
            export_pdb_structures: return the shock_ids of the ProteinStructures object
        """
        if 'input_ref' not in params:
            raise ValueError("'input_ref' not in supplied params")

        model_pdbs = []
        exp_pdbs = []
        # shock_ids = []
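        # placeholder loops: per-structure export is not implemented yet; only the
        # shock id of the top-level ProteinStructures object is returned below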
        for m_pdb in model_pdbs:
            pass
        for e_pdb in exp_pdbs:
            pass

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def batch_import_pdbs(self, params):
        """
            batch_import_pdbs: upload two sets of pdb files and create a
                                   KBaseStructure.ProteinStructures object
            required params:
                metadata_staging_file_path: a metafile from the user's staging area that must be a
                    subdirectory file path in staging area,
                    e.g., /data/bulk/user_name/metadata_staging_file_path
                          staging_file_subdir_path is metadata_staging_file_path
                structures_name: name of the ProteinStructures object to be generated
                workspace_name: workspace name that the protein structure(s) will be saved
            return:
                structures_ref: return ProteinStructures object reference
                report_name: name of generated report (if any)
                report_ref: report reference (if any)

            1. call _validate_batch_import_pdbs_params to validate input params
            2. call _parse_metadata to parse for model_pdb_files, exp_pdb_files and kbase_meta_data
            3. call import_model_pdb_file on each entry in model_pdb_paths, and
               call import_experiment_pdb_file on each entry in exp_pdb_paths
            4. assemble the data for a ProteinStructures and save the data object
            5. call _generate_batch_report to generate a report for batch_import_pdbs' result
        """

        (metadata_file_path, workspace_name,
         structures_name) = self._validate_batch_import_pdbs_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name
        params['workspace_id'] = workspace_id

        (pdb_file_paths, narrative_ids, genome_names,
         feature_ids) = self._parse_metadata_file(metadata_file_path,
                                                  workspace_id)

        model_pdb_objects = list()
        exp_pdb_objects = list()
        pdb_infos = list()
        successful_files = list()
        failed_files = list()
        protein_structures = dict()
        total_structures = 0

        pdb_params = {}
        # loop through the list of pdb_file_paths
        for pdb in pdb_file_paths:
            pdb_params['pdb_info'] = pdb
            pdb_params['input_staging_file_path'] = pdb['file_path']
            pdb_params['input_file_path'] = None
            pdb_params['input_shock_id'] = None
            pdb_params['workspace_name'] = workspace_name
            pdb_params['structure_name'] = pdb['structure_name']

            if pdb['is_model']:
                model_pdb_data, pdb_info = self.import_model_pdb_file(
                    pdb_params, False)
                if model_pdb_data:
                    model_pdb_objects.append(model_pdb_data)
                    pdb_infos.append(pdb_info)
                    successful_files.append(pdb['file_path'])
                    total_structures += 1
                else:
                    failed_files.append(pdb['file_path'])
            else:
                exp_pdb_data, pdb_info = self.import_experiment_pdb_file(
                    pdb_params, False)
                if exp_pdb_data:
                    exp_pdb_objects.append(exp_pdb_data)
                    pdb_infos.append(pdb_info)
                    successful_files.append(pdb['file_path'])
                    total_structures += 1
                else:
                    failed_files.append(pdb['file_path'])

        if not model_pdb_objects:
            logging.info("No model pdb structure was created/saved!")
            return {}

        protein_structures['model_structures'] = model_pdb_objects
        protein_structures['experimental_structures'] = exp_pdb_objects
        protein_structures['total_structures'] = total_structures
        protein_structures['description'] = (
            f'Created {total_structures} '
            f'structures in {structures_name}')
        logging.info(
            f'ProteinStructures data structure to be saved:\n{protein_structures}'
        )
        returnVal = {}
        try:
            info = self.dfu.save_objects({
                'id':
                workspace_id,
                'objects': [{
                    'type': 'KBaseStructure.ProteinStructures',
                    'name': structures_name,
                    'data': protein_structures
                }]
            })[0]
        except (RuntimeError, TypeError, KeyError, ValueError,
                WorkspaceError) as e:
            err_msg = f'DFU.save_objects errored with message: {e}'
            logging.info(err_msg)
            raise ValueError(err_msg)

        structs_ref = f"{info[6]}/{info[0]}/{info[4]}"
        returnVal = {'structures_ref': structs_ref}
        report_output = self._generate_batch_report(
            workspace_name, structs_ref, structures_name, pdb_infos,
            failed_files)
        returnVal.update(report_output)
        return returnVal
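    # Illustrative sketch (not from the original code): a hypothetical params payload
    # for batch_import_pdbs; the file, object and workspace names are made up.
    #
    #     params = {
    #         'metadata_staging_file_path': 'pdb_metadata.xlsx',
    #         'structures_name': 'batch_protein_structures',
    #         'workspace_name': 'my_workspace'
    #     }
    #     result = pdb_util.batch_import_pdbs(params)  # pdb_util: an instance of this class
    #     result.get('structures_ref'), result.get('report_name'), result.get('report_ref')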
Example #23
    def test_genome_set_input(self):

        # Setup: copy data file to workspace and get workspace id
        path = "data/TestGenome.json"
        ws_path = '/kb/module/work/tmp'
        shutil.copy2(path, ws_path)
        dfu = DataFileUtil(self.callback_url)
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # Initiate Dictionaries
        genome_dict, genome_set_dict = {}, {}
        dfu_genomeset_dict, dfu_genomeset_dict_2 = {}, {}
        dfu_genome_search_dict, dfu_genome_search_dict_2 = {}, {}

        # Upload genome & genome data dictionary input
        with open(path) as json_file:
            data = json.load(json_file)
        objs1 = [{
            'name': 'genome_test',
            'type': 'KBaseGenomes.Genome',
            'data': data
        }]
        # Create .Genome object in workspace with save_objects
        genome_obj = dfu.save_objects({'id': ws_id, 'objects': objs1})

        # Get .Genome object reference
        genome_info = genome_obj[0]
        genome_ref = str(genome_info[6]) + '/' + str(
            genome_info[0]) + '/' + str(genome_info[4])

        # Create genome object info dictionary
        genome_dict.update({"label": "GenomeSetTest", "ref": genome_ref})

        # Create genome set object dictionary
        genome_set_dict.update({"description": " ", "items": [genome_dict]})

        # Create DataFileUtil dictionaries for genome set data
        dfu_genomeset_dict.update({
            "type": "KBaseSets.GenomeSet",
            "data": genome_set_dict,
            "name": "Genome_Set_Test"
        })
        dfu_genomeset_dict_2.update({
            'id': ws_id,
            'objects': [dfu_genomeset_dict]
        })

        # Lastly, create .GenomeSet object with save_objects and get GenomeSet object reference
        genome_set_obj = dfu.save_objects(dfu_genomeset_dict_2)
        genome_set_info = genome_set_obj[0]
        genome_set_ref = str(genome_set_info[6]) + '/' + str(
            genome_set_info[0]) + '/' + str(genome_set_info[4])

        # Test KBaseSearch.GenomeSet
        genome_set_dict.pop('items', None)
        genome_set_dict['elements'] = {"Set1": genome_dict}

        # Create DataFileUtil dictionaries for KBaseSearch.GenomeSet data
        dfu_genome_search_dict.update({
            "type": "KBaseSearch.GenomeSet",
            "data": genome_set_dict,
            "name": "Genome_Set_Test_2"
        })
        dfu_genome_search_dict_2.update({
            'id': ws_id,
            'objects': [dfu_genome_search_dict]
        })

        # Lastly, create .GenomeSet object with save_objects and get GenomeSet object reference
        search_genome_obj = dfu.save_objects(dfu_genome_search_dict_2)
        search_genome_info = search_genome_obj[0]
        search_set_ref = str(search_genome_info[6]) + '/' + str(
            search_genome_info[0]) + '/' + str(search_genome_info[4])

        # Get FASTAS
        ret = self.getImpl().get_fastas(self.callback_url,
                                        [genome_set_ref, search_set_ref])
Example #24
    def run_MotifSuite(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "motifsuite_seq_input" -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "SS_ref" of String, parameter "promoter_length"
           of Long, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long, parameter "obj_name" of String,
           parameter "prb" of Double, parameter "motif_length" of Long,
           parameter "background" of Long, parameter "mask_repeats" of Long,
           parameter "background_group" of mapping from String to String,
           parameter "threshold" of Double, parameter "proportion" of Double
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_MotifSuite
        report = KBaseReport(self.callback_url)
        mfmd_obj = MotifFindermfmd(self.callback_url)
        homer_obj = MotifFinderHomer(self.callback_url)
        meme_obj = MotifFinderMEME(self.callback_url)
        gibbs_obj = MotifFinderGibbs(self.callback_url)
        ensemble_obj = MotifEnsemble(self.callback_url)
        mdscan_obj = MotifFinderMdscan(self.callback_url)
        sampler_obj = MotifFinderSampler(self.callback_url)

        # Run each motif finder in its own subprocess, one after another
        for finder_obj in (homer_obj, mfmd_obj, meme_obj, gibbs_obj,
                           mdscan_obj, sampler_obj):
            finder_proc = Process(target=finder_obj.DiscoverMotifsFromSequenceSet,
                                  args=(params,))
            finder_proc.start()
            finder_proc.join()

        MSU = MotifSuiteUtil()
        params['motifset_refs'] = MSU.get_obj_refs()
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133','29716/72/134','29716/72/135','29716/72/136']
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133']
        print(params['motifset_refs'])
        #result = ensemble_obj.MotifEnsemble(params)
        #print('Ensemble RESULT:')
        #print(result)


        dms = DownloadMotifSets()
        MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'], self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])
        fmu = FastaUtils()
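        # Group motifs across MotifSets: two motifs whose comparison clears the
        # threshold share a match set; match sets are merged when a pair bridges them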
        for i,MSR1 in enumerate(MotifSetDict.keys()):
            for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k,MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if fmu.CompareMotifsBP(motif1,motif2,threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m,mset in enumerate(matchSets):
                                    if (MSR1,j) in mset:
                                        found1 = True
                                        index1 = m
                                    if (MSR2, l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1,j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2,l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        matchSets[index1].update(matchSets[index2])  # merge in place
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append(set([(MSR1,j),(MSR2,l)]))
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i,mset in enumerate(matchSets):
            uniqueRefs = {}
            for tuple in mset:
                if tuple[0] not in uniqueRefs:
                    uniqueRefs[tuple[0]] = tuple[0]
            if float(len(uniqueRefs.keys()))/numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))

        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = fmu.merge(matchSets[keep],MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        mr = MakeNewReport()
        mr.MakeReport(htmlDir, ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'})
        except Exception as e:
            raise ValueError(f'Error uploading HTML file to shock: {e}')


        reportName = 'MotifSuite_report_' + str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Ensemble Motif Set generated by MotifSuite'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'Ensemble motif report'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        
        #END run_MotifSuite

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_MotifSuite return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
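    # Illustrative sketch (not from the original code): a hypothetical params payload for
    # run_MotifSuite, using the parameter names listed in the docstring; all values and
    # the 'ms_impl' instance are made up.
    #
    #     params = {
    #         'workspace_name': 'my_workspace',
    #         'genome_ref': '12345/6/7',
    #         'SS_ref': '12345/8/1',
    #         'promoter_length': 250,
    #         'motif_min_length': 8,
    #         'motif_max_length': 16,
    #         'obj_name': 'my_motif_set',
    #         'prb': 0.05,
    #         'motif_length': 12,
    #         'background': 0,
    #         'mask_repeats': 1,
    #         'background_group': {},
    #         'threshold': 0.8,
    #         'proportion': 0.5
    #     }
    #     output = ms_impl.run_MotifSuite(ctx, params)[0]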
Example #25
    def MotifEnsemble(self, ctx, params):
        """
        :param params: instance of type "EnsembleParams" (Internal workflow:
           1. Input - list of motifsets , workspace, threshold consensus 2.
           Download MotifSets -> Utils function 3. Assign motif ids by
           position in list Use refs to identify MSOs internally! Dictionary
           of motifsets key: ref, val set list of match sets: each item in
           the set is a tuple of (ref,index) for each motifset: <- enumerate
           to avoid duplicate for each motif in motifset for each other
           motifset: <- enumerate to avoid duplicate for each motif in other:
           compare(motif1,motif2): if motifs same: search list of sets for
           motif1: if found add  motif2 if not in if not found search list of
           sets for motif2: if found add motif1 else add a new set with
           motif1 + motif2) -> structure: parameter "motifset_refs" of list
           of String, parameter "workspace_name" of String, parameter
           "threshold" of Double
        :returns: instance of type "Ensemble_out" -> structure: parameter
           "motifset_ref" of String
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN MotifEnsemble
        #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)

        MotifSetDict = DownloadMotifSet(params['motifset_refs'], self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])

        for i,MSR1 in enumerate(MotifSetDict.keys()):
            for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k,MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if CompareMotifsBP(motif1,motif2,threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m,mset in enumerate(matchSets):
                                    if (MSR1,j) in mset:
                                        found1 = True
                                        index1 = m
                                    if (MSR2, l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1,j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2,l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        matchSets[index1].update(matchSets[index2])  # merge in place
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append(set([(MSR1,j),(MSR2,l)]))
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i,mset in enumerate(matchSets):
            uniqueRefs = {}
            for tuple in mset:
                if tuple[0] not in uniqueRefs:
                    uniqueRefs[tuple[0]] = tuple[0]
            if float(len(uniqueRefs.keys()))/numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))


        #handle duplicates...
        #for i,tuple1 in enumerate(matchSets):
        #    for j,tuple2 in enumerate(matchSets):
        #        if j > i:
        #            if tuple1[0] == tuple2[0]:
                        #handle this....
                        #how...?
                        #merge locations if theyre different
                        #pick one motif by default(p-val)
                        #run motif compare to ensure theyre actually similar enough
        #                print('duplicate')

        #create new MSO
        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = merge(matchSets[keep],MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['id'] = params['workspace_name']
        save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #create report
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        MakeReport(htmlDir,ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'})
        except Exception as e:
            raise ValueError(f'Error uploading HTML file to shock: {e}')



        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'MotifEnsemble_report_' + str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Ensemble Motif Set generated by MotifEnsemble'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'Ensemble motif report'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        out = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        #END MotifEnsemble

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method MotifEnsemble return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
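
# A small, self-contained sketch (not from the original code) of the match-set grouping
# idea used in MotifEnsemble above: motifs from different MotifSets land in the same
# group whenever a pairwise comparison says they match, and two groups are merged when
# a matching pair bridges them. The equality test below is a stand-in for the real
# threshold-based comparison (CompareMotifsBP).
def _demo_match_set_merge():
    motif_sets = {
        'ref_A': ['TTGACA', 'TATAAT'],
        'ref_B': ['TTGACA', 'GGGCGG'],
        'ref_C': ['TATAAT'],
    }

    def motifs_match(m1, m2):
        return m1 == m2  # stand-in for a threshold-based motif comparison

    match_sets = []
    refs = list(motif_sets)
    for i, ref1 in enumerate(refs):
        for j, motif1 in enumerate(motif_sets[ref1]):
            for ref2 in refs[i + 1:]:
                for l, motif2 in enumerate(motif_sets[ref2]):
                    if not motifs_match(motif1, motif2):
                        continue
                    idx1 = next((m for m, s in enumerate(match_sets) if (ref1, j) in s), None)
                    idx2 = next((m for m, s in enumerate(match_sets) if (ref2, l) in s), None)
                    if idx1 is None and idx2 is not None:
                        match_sets[idx2].add((ref1, j))
                    elif idx2 is None and idx1 is not None:
                        match_sets[idx1].add((ref2, l))
                    elif idx1 is not None and idx2 is not None and idx1 != idx2:
                        match_sets[idx1].update(match_sets[idx2])  # merge in place
                        match_sets.pop(idx2)
                    elif idx1 is None and idx2 is None:
                        match_sets.append({(ref1, j), (ref2, l)})
    # Expected grouping here: {('ref_A', 0), ('ref_B', 0)} and {('ref_A', 1), ('ref_C', 0)}
    return match_sets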
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            (float(row_fold_change_cutoff) >=
                                             comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # store the DifferentialExpressionMatrix used for filtering in an
        # ExpressionMatrix field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
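
# Illustrative, self-contained sketch of the up/down cutoff logic used by
# _process_matrix_file above: a gene counts as up- or down-regulated only when
# its p-value, q-value and |log2 fold change| all pass the cutoffs. The CSV
# rows and cutoff values below are made up purely for illustration.
import csv
import io

_toy_csv = io.StringIO(
    "gene_id,log2_fold_change,p_value,q_value\n"
    "geneA,2.5,0.001,0.004\n"
    "geneB,-3.1,0.002,0.009\n"
    "geneC,0.2,0.500,0.700\n"
    "geneD,NA,NA,NA\n"
)

def _toy_filter(csv_handle, p_cutoff=0.05, q_cutoff=0.05, fc_cutoff=1.0):
    up, down = [], []
    null_values = {'NA', 'null', ''}
    for row in csv.DictReader(csv_handle):
        if null_values & {row['p_value'], row['q_value'], row['log2_fold_change']}:
            continue  # skip rows with missing statistics, as the method above does
        if float(row['p_value']) <= p_cutoff and float(row['q_value']) <= q_cutoff:
            fc = float(row['log2_fold_change'])
            if fc >= fc_cutoff:
                up.append(row['gene_id'])
            elif fc <= -fc_cutoff:
                down.append(row['gene_id'])
    return up, down

print(_toy_filter(_toy_csv))  # (['geneA'], ['geneB'])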
    def test_update_taxon_assignments_valid(self):
        """
        Test a valid call to the update_taxon_assignments method.
        """
        taxon_key = str(uuid4())
        taxon_val = str(uuid4())
        taxon_val_new = str(uuid4())
        # Copy the object to test workspace
        dfu = DataFileUtil(self.callbackURL)
        obj_ref = f"{_WORKSPACE_NAME}/{_OBJECT_NAME}"
        result = dfu.get_objects({'object_refs': [obj_ref]})['data'][0]
        obj_data = result['data']
        # Create a user-owned handle in the object and update it
        hs = HandleService(self.handleURL)
        prev_handle_id = obj_data['genbank_handle_ref']
        prev_shock_id = hs.hids_to_handles([prev_handle_id])[0]['id']
        new_handle_id = dfu.own_shock_node({
            'shock_id': prev_shock_id,
            'make_handle': 1
        })['handle']['hid']
        obj_data['genbank_handle_ref'] = new_handle_id
        # Save new object in test workspace
        obj_info = result['info']
        new_obj = {
            'type': obj_info[2],
            'data': obj_data,
            'name': 'GCF_002287175.1'
        }
        test_ws_id = dfu.ws_name_to_id(self.wsName)
        infos = dfu.save_objects({'id': test_ws_id, 'objects': [new_obj]})
        obj_ref = f"{infos[0][6]}/{infos[0][0]}/{infos[0][4]}"
        new_ws_id = infos[0][6]
        new_obj_id = infos[0][0]
        get_obj_params = {
            'wsid': new_ws_id,
            'objid': new_obj_id,
            'included': ['/taxon_assignments']
        }
        # Add a new assignment
        self.serviceImpl.update_taxon_assignments(
            self.ctx, {
                'workspace_id': new_ws_id,
                'object_id': new_obj_id,
                'taxon_assignments': {
                    taxon_key: taxon_val
                }
            })
        # Fetch the object and check the mapping
        obj = self.wsClient.get_objects2({'objects':
                                          [get_obj_params]})['data'][0]['data']
        self.assertTrue(taxon_key in obj['taxon_assignments'])
        self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val)
        # Update the assignment we just added
        self.serviceImpl.update_taxon_assignments(
            self.ctx, {
                'workspace_id': new_ws_id,
                'object_id': new_obj_id,
                'taxon_assignments': {
                    taxon_key: taxon_val_new
                }
            })
        # Fetch the object and check the mapping
        obj = self.wsClient.get_objects2({'objects':
                                          [get_obj_params]})['data'][0]['data']
        self.assertTrue(taxon_key in obj['taxon_assignments'])
        self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val_new)
        # Remove the assignment we just added
        self.serviceImpl.update_taxon_assignments(
            self.ctx, {
                'workspace_id': new_ws_id,
                'object_id': new_obj_id,
                'remove_assignments': [taxon_key]
            })
        # Fetch the object and check the mapping
        obj = self.wsClient.get_objects2({'objects':
                                          [get_obj_params]})['data'][0]['data']
        self.assertTrue(taxon_key not in obj['taxon_assignments'])
        self.assertEqual(obj['taxon_assignments'].get(taxon_key), None)
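
# Illustrative sketch of the 'workspace_id/object_id/version' reference strings
# assembled throughout these snippets from the object info tuple returned by
# save_objects / get_object_info3 (index 6 = workspace id, index 0 = object id,
# index 4 = version). The tuple values below are made up for illustration.
def _to_ref(obj_info):
    """Format a workspace object info tuple as 'ws_id/obj_id/version'."""
    return "{}/{}/{}".format(obj_info[6], obj_info[0], obj_info[4])

_fake_info = [12, 'MyObject', 'KBaseCollections.FeatureSet-4.0',
              '2020-01-01T00:00:00+0000', 3, 'someuser', 4567,
              'some_workspace', 'abc123', 100, {}]
print(_to_ref(_fake_info))  # 4567/12/3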
Example #28
0
class DataUtil:
    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

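        # e.g. _find_between('KBaseMatrices.ExpressionMatrix-1.1', r'\.', r'\-')
        # would return 'ExpressionMatrix' (illustrative call, mirroring the usage in save_object)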
        return re.search('{}(.*){}'.format(start, end), s).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [
                line.strip().split()[1:] for line in type_desc.split("\n")
                if line.startswith(f'@{tag}')
            ]

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys"""
        contains_constraints = constraints.get('contains')
        # e.g. [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'],
        #       ['values(row_mapping)', 'row_attributemapping_ref:instances'],
        #       ['values(col_mapping)', 'col_attributemapping_ref:instances']]

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]  # e.g. ['row_mapping']
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)
        # the '@contains' constraints whose referenced keys are all missing from
        # the data have now been removed; the other constraint categories are untouched
        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        # 'data' is the raw matrix object, e.g.
        # {'data': {'row_ids': [...], 'col_ids': [...], 'values': [...]},
        #  'row_mapping': {...}, 'row_attributemapping_ref': '44071/19/157', ...}
        # 'value' is a path expression such as 'data.row_ids'
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith(
                'values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()

            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{
                        'ref': obj_ref,
                        'included': [included]
                    }]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [
                            x.get(keys[2]) for x in ref_data.get(keys[0])
                        ]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(
            retrieve_data[:20]))

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """
        # constraints has the shape produced by _find_constraints/_filter_constraints,
        # e.g. {'contains': [...], 'rowsum': [], 'unique': [['data.row_ids'], ['data.col_ids']],
        #       'conditionally_required': [['row_attributemapping_ref', 'row_mapping'], ...]}
        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')  # e.g. [['data.row_ids'], ['data.col_ids']]
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        # e.g. [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'], ...]
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(
                    " ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        # e.g. [['row_attributemapping_ref', 'row_mapping'], ['col_attributemapping_ref', 'col_mapping']]
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [
                    key for key in required_keys if key not in data
                ]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = [
            'Object {} failed type checking:'.format(params.get('obj_name'))
        ]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append(
                'Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append(
                        'Column attribute mapping instances should contain all '
                        'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append(
                        'Row attribute mapping instances should contain all row '
                        'index from original data')

                error_msg.append(
                    'Object field [{}] should contain field [{}]'.format(
                        super_value, subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append(
                'If object field "{}" is present then object field(s) {} should '
                'also be present. Object is missing {}'.format(*failure))

        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [
            x['type_def'] for module in GENERICS_MODULES
            for x in self.wsClient.get_all_type_info(module)
        ]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))
        # generics_service.fetch_data returns e.g.
        # {'data_matrix': '{"Sample1": {"GG_OTU_1": 0.0, "GG_OTU_2": 5.0, ...}, ...}'}
        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {
            'validated': validated,
            'failed_constraints': failed_constraints
        }

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting saving object')

        obj_type = params.get('obj_type')

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({
            'mod': module_name
        }).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type, 'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        workspace_name = params.get('workspace_name')
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        info = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": data,
                "name": params.get('obj_name')
            }]
        })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
Example #29
0
class FileUtil:
    def _validate_import_file_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        # check for required parameters
        for p in ['msa_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params['workspace_name'], params['msa_name']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['shock_id']

        return shock_id

    @staticmethod
    def _infer_seq_type(msa):
        dna_set = {"A", "C", "G", "T", "-"}
        seq_chars = {char for record in msa for char in record.seq}
        if seq_chars - dna_set:
            return "protein"
        else:
            return "dna"

    def _file_to_data(self, file_path, format='fasta'):
        """Do the file conversion"""

        data = {
            'alignment': {},
            'default_row_labels': {},
            'row_order': [],
        }

        msa = AlignIO.read(file_path, format)
        data['alignment_length'] = msa.get_alignment_length()
        data['sequence_type'] = self._infer_seq_type(msa)

        for record in msa:
            data['row_order'].append(record.id)
            data['default_row_labels'][record.id] = record.description
            data['alignment'][record.id] = str(record.seq)

        message = f'A Multiple Sequence Alignment with {len(data["alignment"])} sequences and ' \
                  f'an alignment length of {data["alignment_length"]} was produced'

        return data, message

    def _generate_report(self, msa_ref, workspace_name, message):
        """
        _generate_report: generate summary report for upload
        """
        report_params = {
            'message': message,
            'objects_created': [{
                'ref': msa_ref,
                'description': 'Imported MSA'
            }],
            'workspace_name': workspace_name,
            'report_object_name': f'import_msa_file_{uuid.uuid4()}'
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _get_object(self, params):
        ret = self.dfu.get_objects({'object_refs':
                                    [params['input_ref']]})['data'][0]
        obj_name = ret['info'][1]
        obj_data = ret['data']
        return obj_name, obj_data

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)

    def import_fasta_file(self, params):

        file_path, workspace_name, msa_name = self._validate_import_file_params(
            params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data, message = self._file_to_data(file_path,
                                           params.get('file_format', 'fasta'))
        data['description'] = params.get('description', '')

        info = self.dfu.save_objects({
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseTrees.MSA',
                'name': msa_name,
                'data': data
            }]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'msa_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name, message)

        returnVal.update(report_output)

        return returnVal

    def msa_to_file(self, params, file_type='fasta'):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        obj_name, obj_data = self._get_object(params)
        keys = obj_data.get('row_order', obj_data['alignment'].keys())
        row_labels = obj_data.get('default_row_labels', {})
        file_path = os.path.join(self.scratch, f'{obj_name}.{file_type}')
        seq_type = generic_protein if obj_data.get(
            'sequence_type') == "protein" else generic_dna

        msa = MultipleSeqAlignment([
            SeqRecord(Seq(obj_data['alignment'][key], seq_type),
                      id=key,
                      description=row_labels.get(key, '')) for key in keys
        ])
        AlignIO.write(msa, file_path, file_type)

        return {'file_path': file_path}

    def msa_to_clustal_file(self, params):
        raise NotImplementedError

    def export_file(self, params, file_type='fasta'):
        params['destination_dir'] = os.path.join(self.scratch,
                                                 str(uuid.uuid4()))
        os.mkdir(params['destination_dir'])

        file_path = self.msa_to_file(params, file_type)['file_path']

        return {'shock_id': self._upload_to_shock(file_path)}
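
# Illustrative sketch of the sequence-type heuristic used by FileUtil._infer_seq_type:
# an alignment is called "protein" as soon as any residue falls outside {A, C, G, T, -}.
# Plain strings stand in for Biopython records here; the sequences are made up.
def _infer_seq_type_from_strings(sequences):
    dna_set = {"A", "C", "G", "T", "-"}
    seq_chars = {char for seq in sequences for char in seq}
    return "protein" if seq_chars - dna_set else "dna"

print(_infer_seq_type_from_strings(["ACGT-ACGT", "ACG--CGT"]))  # dna
print(_infer_seq_type_from_strings(["MKV-LLA", "MKV-ILA"]))     # protein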
Example #30
0
class GFFUtils2:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        #self.shared_folder = "/kb/module/work"
        self.ws_url = config['workspace-url']

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        outfile = os.path.join(self.genome_dir, 'out.gff')
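        # sort the GFF by contig (-k1,1) and numeric start position (-k4,4n),
        # keeping '#' header lines first, so the file can be bgzip-compressed here
        # and tabix-indexed later in annotate_GWAS_results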
        sortcmd = f'(grep ^"#"  {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            out, err = p.communicate()
            o.close()

        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        out2, err2 = bgzip.communicate()

        outfile += '.gz'

        return outfile

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start'])+int(feature['location'][0]['length'])

                    metainfo = "ID="+feature['feature_id']

                    if feature['function']:
                        metainfo += ';FUNCTION='+feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr'+str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0'+str(contig_id)]) + start
                                except KeyError as e:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError(e)

                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('

                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'

                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                                           'KBase\tgene\t' + \
                                           str(feature['location'][0]['start']) + '\t' + \
                                           str(end) + '\t.\t' + \
                                           str(feature['location'][0]['strand']) + '\t' + \
                                           str(global_pos) + '\t' + \
                                           str(metainfo) + '\n'
                    f.write(constructed_gff_line)
            f.close()
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        queryinfo = queryresult[8].split(';')
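        # queryresult[8] is the GFF attribute column written by _construct_gff_from_json,
        # e.g. 'ID=<gene_id>;FUNCTION=<annotation>'; [3:] strips the leading 'ID='
        # and [9:] strips the leading 'FUNCTION='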
        if len(queryinfo) >= 2:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension

    def find_gene_info(self, row):
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500

                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)

                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        #association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]['data']['data']
        association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]
        association_results = association_obj['data']["association_details"][0]["association_results"]
        result = "CHR\tSNP\tPOS\tP\tBP\n"
        for variation in association_results:
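            # variation fields: [0]=CHR, [1]=SNP id, [2]=POS, [3]=P value;
            # the BP column simply reuses the POS value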
            if float(variation[3]) > float(p_value):
                continue
            result += str(variation[0]) + "\t"
            result += str(variation[1]) + "\t"
            result += str(variation[2]) + "\t"
            result += str(variation[3]) + "\t"
            result += str(variation[2]) + "\n"
        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1:
            file1.write(result)
        return filepath

    def build_featureset(self, filepath, genome_ref, description, workspace_name, association_name, prefix):
        element_ordering = list()
        elements = dict()
        # "GENEID"/"NEIGHBORGENE" skip the header row; "NA" skips SNPs with no annotated gene.
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]
        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                # fields[5] is GENEID and fields[6] is NEIGHBORGENE in the annotated results file
                condition1 = fields[5] not in skip_words
                condition2 = fields[5] not in elements
                condition3 = fields[6] not in skip_words
                condition4 = fields[6] not in elements
                if condition1 and condition2:
                    element_ordering.append(fields[5])
                    elements[fields[5]] = [genome_ref]
                if condition3 and condition4:
                    element_ordering.append(fields[6])
                    elements[fields[6]] = [genome_ref]
        featureset = dict()
        featureset['description'] = description
        featureset['element_ordering'] = element_ordering
        featureset['elements'] = elements
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({'id': ws_id,
                                           'objects': [{'type': 'KBaseCollections.FeatureSet',
                                                        'data': featureset,
                                                        'name': featureset_obj_name}]})[0]
        obj_ref = "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])
        return obj_ref
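    # For reference, the object passed to DataFileUtil.save_objects above has roughly
    # this shape (a sketch based on build_featureset; the gene IDs and genome reference
    # are hypothetical placeholders, not values from a real run):
    #
    #   {
    #       'description': 'Gene list for GWAS results of trait ...',
    #       'element_ordering': ['gene_1', 'gene_2'],
    #       'elements': {
    #           'gene_1': ['123/4/5'],   # each gene maps to [genome_ref]
    #           'gene_2': ['123/4/5']
    #       }
    #   }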


   
    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name, prefix, p_value):

        # TODO: Send the output file to the GFF-prep function instead of hardcoding the path
        # TODO: Remove hard-coded values and create a new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        if not os.path.isdir(self.genome_dir):
            os.mkdir(self.genome_dir)
        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        if not os.path.exists(sorted_gff_path):
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']
            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
                #'sort_by': [['feature_id', True]]
            })['features']

            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])[0]['data']['assembly_ref']

            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref
            }])[0]['data']['contigs']

            contig_ids = list(assembly_contigs.keys())
            contig_ids.sort()

            contig_base_lengths = {}
            prev_length = 0

            for contig in contig_ids:
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']

            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(genome_features, gff_file, contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]

        gwas_results_file = self.get_gwas_result_file(association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')

        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
           gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..'))
        fname = 'final_' + association_name
        new_results_path = os.path.join(new_results_path, fname)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)
        description = "Gene list for GWAS results of trait " + association_name

        featureset_obj = self.build_featureset(new_results_path, genome_ref, description,
                                               workspace_name, association_name, prefix)

        return featureset_obj
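
# A minimal, self-contained sketch of the lookup pattern used by find_gene_info above:
# first look for a gene interval that overlaps the SNP position, and only if nothing
# overlaps, fall back to a +/- 500 bp neighborhood. The toy gene table, column names
# and helper below are hypothetical illustrations, not the module's tabix helpers.
import pandas as pd

genes = pd.DataFrame({
    'CHR':   ['chr1', 'chr1', 'chr2'],
    'START': [100,    2000,   50],
    'END':   [900,    2600,   700],
    'ID':    ['geneA', 'geneB', 'geneC'],
})

def lookup_gene(chrom, pos, window=500):
    """Return (gene_id, neighbor_flag) for a SNP, mimicking the two-step fallback."""
    on_chrom = genes[genes['CHR'] == chrom]
    # Step 1: a gene interval that directly overlaps the SNP position.
    hit = on_chrom[(on_chrom['START'] <= pos) & (on_chrom['END'] >= pos)]
    if not hit.empty:
        return hit.iloc[0]['ID'], False
    # Step 2: widen to a +/- window neighborhood around the SNP.
    near = on_chrom[(on_chrom['END'] >= max(pos - window, 0)) &
                    (on_chrom['START'] <= pos + window)]
    if not near.empty:
        return near.iloc[0]['ID'], True
    return 'NA', False

print(lookup_gene('chr1', 500))    # ('geneA', False) -> SNP falls inside geneA
print(lookup_gene('chr1', 1600))   # ('geneB', True)  -> no overlap, geneB within 500 bp
print(lookup_gene('chr2', 5000))   # ('NA', False)    -> no gene nearby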
Example #31
class ImportAttributeMappingUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.genapi = GenericsAPI(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_attribute_mapping_from_staging(self, params):
        """
          import_attribute_mapping_from_staging: wrapper method for
                                    fba_tools.tsv_file_to_attribute_mapping

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          attribute_mapping_name: output conditionSet object name
          workspace_name: workspace name/ID of the object

          return:
          obj_ref: return object reference
        """

        log('--->\nrunning ImportAttributeMappingUtil.import_attribute_mapping_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_attribute_mapping_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        import_attribute_mapping_params = {
            'output_obj_name': params['attribute_mapping_name'],
            'output_ws_id': ws_id,
            'input_file_path': scratch_file_path
        }

        ref = self.genapi.file_to_attribute_mapping(
            import_attribute_mapping_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'),
            ref.get('attribute_mapping_ref'))
        returnVal = {'obj_ref': ref.get('attribute_mapping_ref')}

        return returnVal

    @staticmethod
    def validate_import_attribute_mapping_from_staging_params(params):
        """
        validate_import_attribute_mapping_from_staging_params:
                    validates params passed to import_attribute_mapping_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name',
                'attribute_mapping_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference (return of
                                                       import_attribute_mapping_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Attribute Mapping Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported Attribute Mapping'
            }],
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
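
# A minimal usage sketch for the class above, assuming it runs inside a KBase SDK app
# where SDK_CALLBACK_URL and KB_AUTH_TOKEN are available in the environment and the
# staging area already contains the TSV file. The workspace name, staging path and
# object name are hypothetical placeholders; UploaderUtil may require additional
# config keys in a real deployment.
import os

config = {
    'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
    'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
}
importer = ImportAttributeMappingUtil(config)

params = {
    'staging_file_subdir_path': 'subdir_1/attribute_mapping.tsv',  # hypothetical staging file
    'workspace_name': 'my_workspace',                              # hypothetical workspace
    'attribute_mapping_name': 'my_attribute_mapping',
}
result = importer.import_attribute_mapping_from_staging(params)
report = importer.generate_report(result['obj_ref'], params)
print(report['report_name'], report['report_ref'])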