Example #1
class GenomeInterface:
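    """Load, update, validate, and save KBaseGenomes.Genome workspace objects."""
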
    def __init__(self, config):
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL
        self.re_api_url = config.re_api_url
        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.taxon_wsname = config.raw['taxon-workspace-name']
        self.scratch = config.raw['scratch']
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    @staticmethod
    def _validate_save_one_genome_params(params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """
        logging.info('start validating save_one_genome params')
        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        logging.info('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                # this means shock is down or not responding.
                logging.error(
                    "Couldn't parse response error content from Shock: " +
                    response.text)
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that handle_property points to shock nodes owned by the calling user
        """

        logging.info(
            'start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                logging.info('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        logging.info('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be a test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def get_one_genome(self, params):
        """Fetch a genome using WSLargeDataIO and return it as a python dict"""
        logging.info('fetching genome object')

        res = self.ws_large_data.get_objects(params)['data'][0]
        with open(res['data_json_file']) as f:
            data = json.load(f)
        return data, res['info']
        # return self.dfu.get_objects(params)['data'][0]

    def save_one_genome(self, params):
        logging.info('start saving genome object')
        self._validate_save_one_genome_params(params)
        workspace = params['workspace']
        name = params['name']
        data = params['data']
        # XXX there is no `workspace_datatype` param in the spec
        ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome")
        # XXX there is no `meta` param in the spec
        meta = params.get('meta', {})
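        # Metagenome assemblies take a lighter update path; full genomes also
        # get taxon resolution, assembly-derived stats, and feature fix-ups.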
        if "AnnotatedMetagenomeAssembly" in ws_datatype:
            if params.get('upgrade') or 'feature_counts' not in data:
                data = self._update_metagenome(data)
        else:
            if params.get('upgrade') or 'feature_counts' not in data:
                data = self._update_genome(data)

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')
        if "AnnotatedMetagenomeAssembly" not in ws_datatype:
            self._check_dna_sequence_in_features(data)
            data['warnings'] = self.validate_genome(data)

        # sort data
        data = GenomeUtils.sort_dict(data)
        # dump genome to scratch for upload
        data_path = os.path.join(self.scratch, name + ".json")
        with open(data_path, 'w') as f:
            json.dump(data, f)
        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        save_params = {
            'id': workspace_id,
            'objects': [{
                'type': ws_datatype,
                'data_json_file': data_path,
                'name': name,
                'meta': meta,
                'hidden': hidden
            }]
        }
        dfu_oi = self.ws_large_data.save_objects(save_params)[0]
        returnVal = {'info': dfu_oi, 'warnings': data.get('warnings', [])}
        return returnVal

    @staticmethod
    def determine_tier(source):
        """
        Given a user provided source parameter, assign a source and genome tier
        """
        low_source = source.lower()
        if 'refseq' in low_source:
            if 'reference' in low_source:
                return "RefSeq", ['Reference', 'Representative', 'ExternalDB']
            if 'representative' in low_source:
                return "RefSeq", ['Representative', 'ExternalDB']
            if 'user' in low_source:
                return "RefSeq", ['ExternalDB', 'User']
            return "RefSeq", ['ExternalDB']
        if 'phytozome' in low_source:
            if 'flagship' in low_source:
                return "Phytozome", [
                    'Reference', 'Representative', 'ExternalDB'
                ]
            return "Phytozome", ['Representative', 'ExternalDB']
        if 'ensembl' in low_source:
            if 'user' in low_source:
                return "Ensembl", ['ExternalDB', 'User']
            return "Ensembl", ['Representative', 'ExternalDB']
        return source, ['User']
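
    # Illustrative mappings (hypothetical inputs):
    #   determine_tier("RefSeq reference genome")
    #       -> ("RefSeq", ['Reference', 'Representative', 'ExternalDB'])
    #   determine_tier("Ensembl user") -> ("Ensembl", ['ExternalDB', 'User'])
    #   determine_tier("my lab")       -> ("my lab", ['User'])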

    def _update_metagenome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'
        return genome

    def _update_genome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        # do top level updates
        ontologies_present = defaultdict(dict)  # type: dict
        ontologies_present.update(genome.get('ontologies_present', {}))
        ontology_events = genome.get('ontology_events', [])
        # NOTE: 'genome_tiers' not in Metagenome spec
        if 'genome_tiers' not in genome:
            genome['source'], genome['genome_tiers'] = self.determine_tier(
                genome['source'])
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'

        # If an NCBI taxonomy ID is provided, fetch additional data about the taxon
        # NOTE: Metagenome object does not have a 'taxon_assignments' field
        if 'taxon_assignments' in genome and genome['taxon_assignments'].get(
                'ncbi'):
            tax_id = int(genome['taxon_assignments']['ncbi'])
            GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome)
        else:
            GenomeUtils.set_default_taxon_data(genome)

        if any(x not in genome
               for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')):
            if 'assembly_ref' in genome:
                assembly_data = self.dfu.get_objects({
                    'object_refs': [genome['assembly_ref']],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = assembly_data['gc_content']
                genome["dna_size"] = assembly_data['dna_size']
                genome["md5"] = assembly_data['md5']
                genome["num_contigs"] = assembly_data['num_contigs']

                if assembly_data.get('type'):
                    genome['genome_type'] = assembly_data['type']

            elif 'contigset_ref' in genome:
                contig_data = self.dfu.get_objects({
                    'object_refs': [genome['contigset_ref']],
                    'included': ['contigs/[*]/length', 'md5'],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = None
                genome["dna_size"] = sum(
                    (c['length'] for c in contig_data['contigs']))
                genome["md5"] = contig_data['md5']
                genome["num_contigs"] = len(contig_data['contigs'])

        # NOTE: metagenomes do not have the following fields
        if 'cdss' not in genome:
            genome['cdss'] = []
        if 'mrnas' not in genome:
            genome['mrnas'] = []
        if 'non_coding_features' not in genome:
            genome['non_coding_features'] = []

        # do feature level updates
        retained_features = []
        type_counts = defaultdict(int)
        for field in ('mrnas', 'cdss', 'features'):
            for i, feat in enumerate(genome.get(field, [])):
                if 'function' in feat and not isinstance(feat['function'], list):
                    feat['functions'] = feat['function'].split('; ')
                    del feat['function']
                if 'aliases' in feat:
                    if not feat['aliases']:
                        del feat['aliases']
                    elif not isinstance(feat['aliases'][0], (list, tuple)):
                        feat['aliases'] = [['gene_synonym', x]
                                           for x in feat['aliases']]
                if 'type' in feat:
                    type_counts[feat['type']] += 1
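                # Fold per-term ontology data into the genome-level registries:
                # ontologies_present maps term id -> term name, and each term's
                # value is rewritten as a list of indexes into ontology_events.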
                for ontology, terms in feat.get('ontology_terms', {}).items():
                    for term in terms.values():
                        if isinstance(term, list):
                            continue
                        ontologies_present[ontology][
                            term['id']] = term['term_name']
                        term_evidence = []
                        for ev in term['evidence']:
                            ev['id'] = ontology
                            if "ontology_ref" in term:
                                ev['ontology_ref'] = term["ontology_ref"]
                            if ev not in ontology_events:
                                ontology_events.append(ev)
                            term_evidence.append(ontology_events.index(ev))
                        feat['ontology_terms'][ontology][
                            term['id']] = term_evidence

                # remove deprecated fields
                feat.pop('protein_families', None)
                feat.pop('atomic_regulons', None)
                feat.pop('orthologs', None)
                feat.pop('coexpressed_fids', None)
                feat.pop('publications', None)
                feat.pop('regulon_data', None)
                feat.pop('subsystem_data', None)

                if 'dna_sequence_length' not in feat:
                    feat['dna_sequence_length'] = sum(
                        x[3] for x in feat['location'])

                if 'protein_translation' in feat and 'protein_md5' not in feat:
                    feat['protein_md5'] = hashlib.md5(
                        feat.get('protein_translation',
                                 '').encode('utf8')).hexdigest()

                # split all the stuff lumped together in old versions into the
                # right arrays
                if field == 'features':
                    if feat.get('type', 'gene') == 'gene':
                        if not feat.get('cdss', []):
                            type_counts['non_coding_genes'] += 1
                            genome['non_coding_features'].append(feat)
                        else:
                            retained_features.append(feat)
                    elif feat.get('type', 'gene') == 'CDS':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['cdss'].append(feat)
                    elif feat.get('type', 'gene') == 'mRNA':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['mrnas'].append(feat)

        genome['features'] = retained_features
        if ontology_events:
            genome['ontology_events'] = ontology_events
        if ontologies_present:
            genome['ontologies_present'] = ontologies_present

        type_counts['mRNA'] = len(genome.get('mrnas', []))
        type_counts['CDS'] = len(genome.get('cdss', []))
        type_counts['protein_encoding_gene'] = len(genome['features'])
        type_counts['non_coding_features'] = len(
            genome.get('non_coding_features', []))
        genome['feature_counts'] = type_counts
        return genome

    @staticmethod
    def validate_genome(g):
        """
        Run a series of checks on the genome object and return any warnings
        """

        allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'}

        logging.info('Validating genome object contents')
        warnings = g.get('warnings', [])

        # TODO: Determine whether these checks make any sense for Metagenome
        #       object. Looks like many don't.
        #       Add validations for Metagenome object

        # this will fire for some annotation methods like PROKKA
        if g.get('domain') == "Bacteria" and len(g.get('cdss', [])) != len(
                g['features']):
            warnings.append(
                "For prokaryotes, CDS array should generally be the"
                " same length as the Features array.")

        if g.get('domain') == "Eukaryota" and len(g.get(
                'features', [])) == len(g.get('cdss', [])):
            warnings.append(
                "For Eukaryotes, CDS array should not be the same "
                "length as the Features array due to RNA splicing.")

        if g.get('molecule_type') not in {"DNA", 'ds-DNA'} and \
                g.get('domain', '') not in {'Virus', 'Viroid'}:
            warnings.append("Genome molecule_type {} is not expected "
                            "for domain {}.".format(
                                g.get('molecule_type'), g.get('domain', '')))

        if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers:
            warnings.append("Undefined terms in genome_tiers: " +
                            ", ".join(set(g['genome_tiers']) - allowed_tiers))
        assignments = g.get('taxon_assignments', {})
        if 'ncbi' not in assignments or ('taxon_ref' in g and g['taxon_ref']
                                         == "ReferenceTaxons/unknown_taxon"):
            warnings.append('Unable to determine organism taxonomy')

        GenomeInterface.handle_large_genomes(g)
        return warnings

    @staticmethod
    def handle_large_genomes(g):
        """Determines the size of various feature arrays and starts removing the dna_sequence if
        the genome is getting too big to store in the workspace"""
        def _get_size(obj):
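            # Proxy for serialized size: getsizeof of the JSON string scales
            # with its length, which is what the workspace size cap measures.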
            return sys.getsizeof(json.dumps(obj))

        # seems pretty unnecessary...
        def sizeof_fmt(num):
            for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
                if abs(num) < 1024.0:
                    return "%3.1f %sB" % (num, unit)
                num /= 1024.0
            return "%.1f %sB" % (num, 'Yi')

        feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss')
        master_key_sizes = dict()
        # Set want_full_breakdown to True to see a breakdown of sizes.
        # Keeping it a flag lets standard uploads run faster.
        want_full_breakdown = False
        for x in feature_lists:
            if x in g:
                need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE
                if need_to_remove_dna_sequence or want_full_breakdown:
                    feature_type_dict_keys = dict()
                    for feature in g[x]:
                        for feature_key in list(feature.keys()):
                            if feature_key == "dna_sequence" and need_to_remove_dna_sequence:
                                # NOTE: should this get stored somewhere?
                                del feature["dna_sequence"]
                            else:
                                if feature_key not in feature_type_dict_keys:
                                    feature_type_dict_keys[feature_key] = 0
                                feature_type_dict_keys[
                                    feature_key] += sys.getsizeof(
                                        feature[feature_key])
                    for feature_key in feature_type_dict_keys:
                        feature_type_dict_keys[feature_key] = sizeof_fmt(
                            feature_type_dict_keys[feature_key])
                    master_key_sizes[x] = feature_type_dict_keys
                print(f"{x}: {sizeof_fmt(_get_size(g[x]))}")
        total_size = _get_size(g)
        print(f"Total size {sizeof_fmt(total_size)} ")
        if want_full_breakdown:
            print(
                f"Here is the breakdown of the sizes of feature list elements: "
                f"{master_key_sizes}")
        if total_size > MAX_GENOME_SIZE:
            print(
                f"Here is the breakdown of the sizes of feature list elements: "
                f"{master_key_sizes}")
            raise ValueError(
                f"This genome's size of {sizeof_fmt(total_size)} exceeds the maximum "
                f"permitted size of {sizeof_fmt(MAX_GENOME_SIZE)}.\n"
                f"Here is the breakdown for feature lists and their respective "
                f"sizes:\n{master_key_sizes}")
Example #2
class GenomeInterface:
    def __init__(self, config):
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL

        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config.raw['search-url'])
        self.taxon_wsname = config.raw['taxon-workspace-name']
        self.scratch = config.raw['scratch']
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    @staticmethod
    def _validate_save_one_genome_params(params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """

        log('start validating save_one_genome params')

        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        log('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                # this means shock is down or not responding.
                log("Couldn't parse response error content from Shock: " +
                    response.text)
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that handle_property points to shock nodes owned by the calling user
        """

        log('start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                log('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        log('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be a test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def get_one_genome(self, params):
        """Fetch a genome using WSLargeDataIO and return it as a python dict"""
        log('fetching genome object')

        res = self.ws_large_data.get_objects(params)['data'][0]
        with open(res['data_json_file']) as f:
            data = json.load(f)
        return data, res['info']
        # return self.dfu.get_objects(params)['data'][0]

    def save_one_genome(self, params):
        log('start saving genome object')

        self._validate_save_one_genome_params(params)

        workspace = params['workspace']
        name = params['name']
        data = params['data']
        meta = params.get('meta') or {}
        if params.get('upgrade') or 'feature_counts' not in data:
            data = self._update_genome(data)

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')

        self._check_dna_sequence_in_features(data)
        data['warnings'] = self.validate_genome(data)

        # dump genome to scratch for upload
        data_path = os.path.join(self.scratch, name + ".json")
        with open(data_path, 'w') as f:
            json.dump(data, f)

        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        save_params = {
            'id': workspace_id,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data_json_file': data_path,
                'name': name,
                'meta': meta,
                'hidden': hidden
            }]
        }

        dfu_oi = self.ws_large_data.save_objects(save_params)[0]

        returnVal = {'info': dfu_oi, 'warnings': data['warnings']}

        return returnVal

    def old_retrieve_taxon(self, taxon_wsname, scientific_name):
        """
        old_retrieve_taxon: use SOLR to retrieve taxonomy and taxon_reference

        """
        default = ('Unconfirmed Organism: ' + scientific_name,
                   'ReferenceTaxons/unknown_taxon', 'Unknown', 11)
        solr_url = 'http://kbase.us/internal/solr-ci/search/'
        solr_core = 'taxonomy_ci'
        query = '/select?q=scientific_name:"{}"&fl=scientific_name%2Cscientific_lineage%2Ctaxonomy_id%2Cdomain%2Cgenetic_code&rows=5&wt=json'
        match = re.match(r"\S+\s?\S*", scientific_name)
        if not match:
            return default
        res = requests.get(solr_url + solr_core + query.format(match.group(0)))
        results = res.json()['response']['docs']
        if not results:
            return default
        taxonomy = results[0]['scientific_lineage']
        taxon_reference = '{}/{}_taxon'.format(taxon_wsname,
                                               results[0]['taxonomy_id'])
        domain = results[0]['domain']
        genetic_code = results[0]['genetic_code']

        return taxonomy, taxon_reference, domain, genetic_code

    def retrieve_taxon(self, taxon_wsname, scientific_name):
        """
        _retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        default = ('Unconfirmed Organism: ' + scientific_name,
                   'ReferenceTaxons/unknown_taxon', 'Unknown', 11)

        def extract_values(search_obj):
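            # Tuple shape mirrors `default`:
            # (taxonomy, taxon_ref, domain, genetic_code).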
            return (search_obj['data']['scientific_lineage'],
                    taxon_wsname + "/" + search_obj['object_name'],
                    search_obj['data']['domain'],
                    search_obj['data'].get('genetic_code', 11))

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {
                        "value": scientific_name
                    }
                },
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }
        objects = self.kbse.search_objects(search_params)['objects']
        if len(objects):
            if len(objects) > 100000:
                raise RuntimeError(
                    "Too many matching taxons returned for {}. "
                    "Potential issue with searchAPI.".format(scientific_name))
            return extract_values(objects[0])
        search_params['match_filter']['lookup_in_keys'] = {
            "aliases": {
                "value": scientific_name
            }
        }
        objects = self.kbse.search_objects(search_params)['objects']
        if len(objects):
            return extract_values(objects[0])
        return default

    @staticmethod
    def determine_tier(source):
        """
        Given a user provided source parameter, assign a source and genome tier
        """
        low_source = source.lower()
        if 'refseq' in low_source:
            if 'reference' in low_source:
                return "RefSeq", ['Reference', 'Representative', 'ExternalDB']
            if 'representative' in low_source:
                return "RefSeq", ['Representative', 'ExternalDB']
            if 'user' in low_source:
                return "RefSeq", ['ExternalDB', 'User']
            return "RefSeq", ['ExternalDB']
        if 'phytozome' in low_source:
            if 'flagship' in low_source:
                return "Phytozome", [
                    'Reference', 'Representative', 'ExternalDB'
                ]
            return "Phytozome", ['Representative', 'ExternalDB']
        if 'ensembl' in low_source:
            if 'user' in low_source:
                return "Ensembl", ['ExternalDB', 'User']
            return "Ensembl", ['Representative', 'ExternalDB']
        return source, ['User']

    def _update_genome(self, genome):
        """Checks for missing required fields and fixes breaking changes"""
        # do top level updates
        ontologies_present = defaultdict(dict)
        ontologies_present.update(genome.get('ontologies_present', {}))
        ontology_events = genome.get('ontology_events', [])
        if 'genome_tiers' not in genome:
            genome['source'], genome['genome_tiers'] = self.determine_tier(
                genome['source'])
        if 'molecule_type' not in genome:
            genome['molecule_type'] = 'Unknown'
        if 'taxon_ref' not in genome:
            genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
                genome['genetic_code'] = self.retrieve_taxon(
                    self.taxon_wsname, genome['scientific_name'])

        if any(x not in genome
               for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')):
            if 'assembly_ref' in genome:
                assembly_data = self.dfu.get_objects({
                    'object_refs': [genome['assembly_ref']],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = assembly_data['gc_content']
                genome["dna_size"] = assembly_data['dna_size']
                genome["md5"] = assembly_data['md5']
                genome["num_contigs"] = assembly_data['num_contigs']
            elif 'contigset_ref' in genome:
                contig_data = self.dfu.get_objects({
                    'object_refs': [genome['contigset_ref']],
                    'included': ['contigs/[*]/length', 'md5'],
                    'ignore_errors': 0
                })['data'][0]['data']
                genome["gc_content"] = None
                genome["dna_size"] = sum(
                    (c['length'] for c in contig_data['contigs']))
                genome["md5"] = contig_data['md5']
                genome["num_contigs"] = len(contig_data['contigs'])

        if 'cdss' not in genome:
            genome['cdss'] = []
        if 'mrnas' not in genome:
            genome['mrnas'] = []
        if 'non_coding_features' not in genome:
            genome['non_coding_features'] = []

        # do feature level updates
        retained_features = []
        type_counts = defaultdict(int)
        for field in ('mrnas', 'cdss', 'features'):
            for i, feat in enumerate(genome.get(field, [])):
                if 'function' in feat and not isinstance(feat['function'], list):
                    feat['functions'] = feat['function'].split('; ')
                    del feat['function']
                if 'aliases' in feat:
                    if not feat['aliases']:
                        del feat['aliases']
                    elif not isinstance(feat['aliases'][0], (list, tuple)):
                        feat['aliases'] = [['gene_synonym', x]
                                           for x in feat['aliases']]
                if 'type' in feat:
                    type_counts[feat['type']] += 1
                for ontology, terms in feat.get('ontology_terms', {}).items():
                    for term in terms.values():
                        if isinstance(term, list):
                            continue
                        ontologies_present[ontology][
                            term['id']] = term['term_name']
                        term_evidence = []
                        for ev in term['evidence']:
                            ev['id'] = ontology
                            if "ontology_ref" in term:
                                ev['ontology_ref'] = term["ontology_ref"]
                            if ev not in ontology_events:
                                ontology_events.append(ev)
                            term_evidence.append(ontology_events.index(ev))
                        feat['ontology_terms'][ontology][
                            term['id']] = term_evidence

                # remove deprecated fields
                feat.pop('protein_families', None)
                feat.pop('atomic_regulons', None)
                feat.pop('orthologs', None)
                feat.pop('coexpressed_fids', None)
                feat.pop('publications', None)
                feat.pop('regulon_data', None)
                feat.pop('subsystem_data', None)

                if 'dna_sequence_length' not in feat:
                    feat['dna_sequence_length'] = sum(
                        x[3] for x in feat['location'])

                if 'protein_translation' in feat and 'protein_md5' not in feat:
                    feat['protein_md5'] = hashlib.md5(
                        feat.get('protein_translation',
                                 '').encode('utf8')).hexdigest()

                # split all the stuff lumped together in old versions into the
                # right arrays
                if field == 'features':
                    if feat.get('type', 'gene') == 'gene':
                        if not feat.get('cdss', []):
                            genome['non_coding_features'].append(feat)
                        else:
                            retained_features.append(feat)
                    elif feat.get('type', 'gene') == 'CDS':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['cdss'].append(feat)
                    elif feat.get('type', 'gene') == 'mRNA':
                        if 'parent_gene' not in feat:
                            feat['parent_gene'] = ''
                        genome['mrnas'].append(feat)

        genome['features'] = retained_features
        if ontology_events:
            genome['ontology_events'] = ontology_events
        if ontologies_present:
            genome['ontologies_present'] = ontologies_present

        type_counts['mRNA'] = len(genome.get('mrnas', []))
        type_counts['CDS'] = len(genome.get('cdss', []))
        type_counts['protein_encoding_gene'] = len(genome['features'])
        type_counts['non_coding_features'] = len(
            genome.get('non_coding_features', []))
        genome['feature_counts'] = type_counts

        return genome

    @staticmethod
    def validate_genome(g):
        """
        Run a series of checks on the genome object and return any warnings
        """
        def _get_size(obj):
            return sys.getsizeof(json.dumps(obj))

        def sizeof_fmt(num):
            for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
                if abs(num) < 1024.0:
                    return "%3.1f %sB" % (num, unit)
                num /= 1024.0
            return "%.1f %sB" % (num, 'Yi')

        allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'}

        log('Validating genome object contents')
        warnings = g.get('warnings', [])

        # this will fire for some annotation methods like PROKKA
        if g['domain'] == "Bacteria" and len(g.get('cdss', [])) != len(
                g['features']):
            warnings.append(
                "For prokaryotes, CDS array should generally be the"
                " same length as the Features array.")

        if g['domain'] == "Eukaryota" and len(g.get('features', [])) == len(
                g.get('cdss', [])):
            warnings.append(
                "For Eukaryotes, CDS array should not be the same "
                "length as the Features array due to RNA splicing.")

        if "molecule_type" in g and g['molecule_type'] not in {
                "DNA", 'ds-DNA'
        }:
            if g.get('domain', '') not in {'Virus', 'Viroid'} and \
                            g['molecule_type'] not in {"DNA", 'ds-DNA'}:
                warnings.append("Genome molecule_type {} is not expected "
                                "for domain {}.".format(
                                    g['molecule_type'], g.get('domain', '')))

        if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers:
            warnings.append("Undefined terms in genome_tiers: " +
                            ", ".join(set(g['genome_tiers']) - allowed_tiers))
        if g.get('taxon_ref') == "ReferenceTaxons/unknown_taxon":
            warnings.append('Unable to determine organism taxonomy')

        # MAX_GENOME_SIZE = 1  # uncomment to test the failure mode; set to the size needed
        feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss')
        master_key_sizes = dict()
        # Set want_full_breakdown to True to see a breakdown of sizes.
        # Keeping it a flag lets standard uploads run faster.
        want_full_breakdown = False
        for x in feature_lists:
            if x in g:
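                # Re-measure on every pass: stripping dna_sequence from an
                # earlier feature list may bring the genome back under the cap.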
                need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE
                if need_to_remove_dna_sequence or want_full_breakdown:
                    feature_type_dict_keys = dict()
                    for feature in g[x]:
                        for feature_key in list(feature.keys()):
                            if feature_key == "dna_sequence" and need_to_remove_dna_sequence:
                                del feature["dna_sequence"]
                            else:
                                if feature_key not in feature_type_dict_keys:
                                    feature_type_dict_keys[feature_key] = 0
                                feature_type_dict_keys[
                                    feature_key] += sys.getsizeof(
                                        feature[feature_key])
                    for feature_key in feature_type_dict_keys:
                        feature_type_dict_keys[feature_key] = sizeof_fmt(
                            feature_type_dict_keys[feature_key])
                    master_key_sizes[x] = feature_type_dict_keys
                print("{}: {}".format(x, sizeof_fmt(_get_size(g[x]))))
        total_size = _get_size(g)
        print("Total size {} ".format(sizeof_fmt(total_size)))
        if want_full_breakdown:
            print(
                "Here is the breakdown of the sizes of feature list elements: {}"
                .format(master_key_sizes))
        if total_size > MAX_GENOME_SIZE:
            print(
                "Here is the breakdown of the sizes of feature list elements: {}"
                .format(master_key_sizes))
            raise ValueError(
                "This genome's size of {} exceeds the maximum permitted size of {}.\n"
                "Here is the breakdown for feature lists and their respective "
                "sizes:\n{}".format(sizeof_fmt(total_size),
                                    sizeof_fmt(MAX_GENOME_SIZE),
                                    master_key_sizes))
        return warnings
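
Example #2 resolves taxonomy through KBaseSearchEngine rather than the Relation Engine used in Example #1. A rough sketch of that lookup in isolation, assuming a GenomeInterface instance `gi` built as in the sketch above (this variant's config would additionally need raw['search-url']); the workspace name and organism below are made up:

taxonomy, taxon_ref, domain, genetic_code = gi.retrieve_taxon(
    'ReferenceTaxons', 'Escherichia coli')
# Falls back to ('Unconfirmed Organism: ...', 'ReferenceTaxons/unknown_taxon',
#                'Unknown', 11) when neither scientific_name nor aliases match.
print(taxon_ref, genetic_code)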