def generate_adeft_terms():
    """Return Terms built from the groundings of Adeft disambiguators.

    Every available Adeft shortform is loaded and each of its groundings
    of the form ``NAMESPACE:ID`` is turned into a synonym Term whose text
    is the shortform itself. Groundings that are ``ungrounded``, that have
    no namespace prefix, or that use an unrecognized namespace are skipped.

    Returns
    -------
    list of Term
        De-duplicated terms sorted by their normalized text.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator

    # One standard-name lookup per supported namespace. Lambdas keep the
    # client lookups lazy so nothing is resolved unless it is needed.
    name_lookups = {
        'HGNC': lambda ref: hgnc_client.get_hgnc_name(ref),
        'GO': lambda ref: go_client.get_go_label(ref),
        'MESH': lambda ref: mesh_client.get_mesh_name(ref),
        'CHEBI': lambda ref: chebi_client.get_chebi_name_from_id(ref),
        # FamPlex IDs double as their standard names
        'FPLX': lambda ref: ref,
        'UP': lambda ref: uniprot_client.get_gene_name(ref),
    }

    unique_args = set()
    for shortform in available_shortforms:
        disambiguator = load_disambiguator(shortform)
        for grounding in disambiguator.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            db_ns, db_id = grounding.split(':', maxsplit=1)
            lookup = name_lookups.get(db_ns)
            if lookup is None:
                logger.warning('Unknown grounding namespace from Adeft: %s'
                               % db_ns)
                continue
            standard_name = lookup(db_id)
            unique_args.add((normalize(shortform), shortform, db_ns, db_id,
                             standard_name, 'synonym', 'adeft'))

    # Only the normalized text is used as the sort key since other fields
    # (e.g., the standard name) may be None and hence not comparable.
    ordered_args = sorted(unique_args, key=lambda args: args[0])
    return [Term(*args) for args in ordered_args]
def get_specific_chebi_id(chebi_ids, name):
    """Return the single most appropriate ChEBI ID for a named entity.

    Parameters
    ----------
    chebi_ids : frozenset
        Candidate ChEBI IDs. NOTE: this function is mainly factored out to
        be able to use caching, which requires a frozenset as input.
    name : str
        The name of the entity that the IDs were assigned to.

    Returns
    -------
    The manually overridden ID for the name if there is one, otherwise the
    ID whose ChEBI name matches the name (ignoring case), otherwise
    whatever chebi_client.get_specific_id returns for the remaining
    candidates.
    """
    # A manual override always wins
    override = manual_chebi_map.get(name)
    if override:
        return override

    # Map every ID to its primary ID to get rid of secondary IDs
    mapped_ids = set(map(chebi_client.get_primary_id, chebi_ids))
    # Occasionally, invalid ChEBI IDs are given that don't have
    # corresponding primary IDs, which we can filter out
    valid_ids = set(filter(lambda cid: cid is not None, mapped_ids))
    # Generic IDs are never useful for grounding
    candidates = valid_ids - generic_chebi_ids

    # Look up all candidate names up front, then see if any of them
    # matches the name of the entity well enough
    named_candidates = [
        (chebi_client.get_chebi_name_from_id(cid), cid)
        for cid in candidates
    ]
    for candidate_name, candidate_id in named_candidates:
        if not candidate_name:
            continue
        if name.lower() == candidate_name.lower():
            return candidate_id

    # Without a name match, distill the IDs down to the most specific one
    # based on the hierarchy
    return chebi_client.get_specific_id(candidates)
def standardize_agent_name(agent, standardize_refs=True):
    """Standardize the name of an Agent based on grounding information.

    The Agent's prioritized grounding (as returned by its get_grounding
    method) determines the new name: a FamPlex ID is used directly as the
    name, while for HGNC, UP, CHEBI, MESH and GO groundings the
    corresponding client is asked for a standard name. If no relevant
    grounding is found, or the name lookup comes back empty, the name is
    not changed.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information. The Agent is modified in place; None is
        accepted and ignored.
    standardize_refs : Optional[bool]
        If True, this function assumes that the Agent's db_refs need to
        be standardized, e.g., HGNC mapped to UP. Default: True

    Returns
    -------
    None
    """
    # We return immediately for None Agents
    if agent is None:
        return

    if standardize_refs:
        agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

    # We next look for prioritized grounding, if missing, we return
    db_ns, db_id = agent.get_grounding()
    if not db_ns or not db_id:
        return

    new_name = None
    # If there's a FamPlex ID, prefer that for the name
    if db_ns == 'FPLX':
        new_name = agent.db_refs['FPLX']
    elif db_ns == 'HGNC':
        # NOTE(review): an earlier comment here said get_grounding returns
        # an HGNC symbol rather than an ID - confirm that get_hgnc_name is
        # the right lookup for the value it returns.
        new_name = hgnc_client.get_hgnc_name(db_id)
    elif db_ns == 'UP':
        # Try for the gene name
        new_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                web_fallback=False)
    elif db_ns == 'CHEBI':
        new_name = \
            chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
    elif db_ns == 'MESH':
        new_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
    elif db_ns == 'GO':
        new_name = go_client.get_go_label(agent.db_refs['GO'])

    # Only overwrite the name when a lookup actually produced one, so a
    # failed lookup (in any namespace, including HGNC) never replaces the
    # existing name with None.
    if new_name:
        agent.name = new_name
def generate_chebi_terms():
    """Return Terms for ChEBI standard names and synonyms.

    Standard names are read from chebi_entries.tsv in the INDRA resources
    folder. Synonyms are read from names_3star.tsv in the same folder if
    it exists there, and otherwise loaded from the ChEBI FTP site.

    Returns
    -------
    list of Term
        Name terms followed by de-duplicated synonym terms.
    """
    entries_path = os.path.join(indra_resources, 'chebi_entries.tsv')
    logger.info('Loading %s' % entries_path)
    terms = []
    for entry in read_csv(entries_path, header=True, delimiter='\t'):
        prefixed_id = 'CHEBI:' + entry['CHEBI_ID']
        std_name = entry['NAME']
        terms.append(Term(normalize(std_name), std_name, 'CHEBI',
                          prefixed_id, std_name, 'name', 'chebi'))
    logger.info('Loaded %d terms' % len(terms))

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    synonyms_path = os.path.join(indra_resources, 'names_3star.tsv')
    if os.path.exists(synonyms_path):
        rows = read_csv(synonyms_path, header=True, delimiter='\t')
    else:
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.'
                    % chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (row for _, row in df.iterrows())

    seen = set()
    for row in rows:
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s' %
                        row['COMPOUND_ID'])
            continue
        prefixed_id = 'CHEBI:%s' % chebi_id
        synonym = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s' % chebi_id)
            continue

        term_args = (normalize(synonym), synonym, 'CHEBI', prefixed_id,
                     chebi_name, 'synonym', 'chebi')
        # Only add each distinct synonym entry once
        if term_args not in seen:
            terms.append(Term(*term_args))
            seen.add(term_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
def generate_famplex_terms(ignore_mappings=False):
    """Return Terms for the entries of the FamPlex grounding map.

    Each row of grounding_map.csv holds an entity text followed by
    alternating namespace/ID columns. Exactly one Term is created per row
    from the first namespace found in this order: FPLX, HGNC, UP, CHEBI,
    GO, MESH. Rows with none of these namespaces are skipped.

    Parameters
    ----------
    ignore_mappings : Optional[bool]
        If True, MESH groundings are kept as MESH even when an entry
        exists for them in mesh_mappings. Default: False

    Returns
    -------
    list of Term
        The terms in the order of the rows they were created from.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        txt = row[0]
        norm_txt = normalize(txt)
        # Pair up the namespace and ID columns, dropping empty entries
        groundings = {db_ns: db_id
                      for db_ns, db_id in zip(row[1::2], row[2::2])
                      if db_ns and db_id}
        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            term = Term(norm_txt, txt, 'FPLX', fplx_id, fplx_id,
                        'assertion', 'famplex')
        elif 'HGNC' in groundings:
            # The grounding map entry is passed to get_hgnc_id to obtain
            # the ID and is itself used as the standard name
            hgnc_symbol = groundings['HGNC']
            term = Term(norm_txt, txt, 'HGNC',
                        hgnc_client.get_hgnc_id(hgnc_symbol), hgnc_symbol,
                        'assertion', 'famplex', '9606')
        elif 'UP' in groundings:
            up_id = groundings['UP']
            db, db_id, name, organism = 'UP', up_id, up_id, None
            if uniprot_client.is_human(up_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(up_id)
                if hgnc_id:
                    # Prefer the HGNC grounding for human proteins
                    name = hgnc_client.get_hgnc_name(hgnc_id)
                    db, db_id = 'HGNC', hgnc_id
                else:
                    logger.warning('No gene name for %s' % up_id)
            # TODO: should we add organism info here?
            term = Term(norm_txt, txt, db, db_id, name,
                        'assertion', 'famplex', organism)
        elif 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The first six characters of the ID are dropped for the
            # name lookup
            name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            term = Term(norm_txt, txt, 'CHEBI', chebi_id, name,
                        'assertion', 'famplex')
        elif 'GO' in groundings:
            go_id = groundings['GO']
            term = Term(norm_txt, txt, 'GO', go_id,
                        go_client.get_go_label(go_id),
                        'assertion', 'famplex')
        elif 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mesh_mapping = mesh_mappings.get(mesh_id)
            if mesh_mapping and not ignore_mappings:
                db, db_id, name = mesh_mapping
            else:
                db, db_id, name = \
                    'MESH', mesh_id, mesh_client.get_mesh_name(mesh_id)
            term = Term(norm_txt, txt, db, db_id, name,
                        'assertion', 'famplex')
        else:
            # TODO: handle HMDB, PUBCHEM, CHEMBL
            continue
        terms.append(term)
    return terms
def generate_chebi_terms():
    """Return Terms for ChEBI standard names and synonyms.

    Standard name terms come from _generate_obo_terms. Synonyms are read
    from names_3star.tsv in the INDRA resources folder if it exists
    there, and otherwise loaded from the ChEBI FTP site.

    Returns
    -------
    list of Term
        OBO-derived terms followed by de-duplicated synonym terms.
    """
    # We can get standard names directly from the OBO
    terms = _generate_obo_terms('chebi', ignore_mappings=True, map_to_ns={})

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    synonyms_path = os.path.join(indra_resources, 'names_3star.tsv')
    if os.path.exists(synonyms_path):
        rows = read_csv(synonyms_path, header=True, delimiter='\t')
    else:
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.'
                    % chebi_url)
        df = pd.read_csv(chebi_url, sep='\t')
        rows = (row for _, row in df.iterrows())

    seen = set()
    for row in rows:
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s' %
                        row['COMPOUND_ID'])
            continue
        synonym = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s' % chebi_id)
            continue
        # We skip entries of the form Glu-Lys with synonyms like EK since
        # these are highly ambiguous with other acronyms, and are unlikely
        # to be used in practice.
        if is_aa_sequence(chebi_name) and re.match(r'(^[A-Z-]+$)', synonym):
            continue

        term_args = (normalize(synonym), synonym, 'CHEBI', chebi_id,
                     chebi_name, 'synonym', 'chebi')
        # Only add each distinct synonym entry once
        if term_args not in seen:
            terms.append(Term(*term_args))
            seen.add(term_args)
    logger.info('Loaded %d terms' % len(terms))
    return terms
def _get_complex_agents(self, complex_id): """Returns a list of agents corresponding to each of the constituents in a SIGNOR complex.""" agents = [] components = self._recursively_lookup_complex(complex_id) for c in components: db_refs = {} if c.startswith('CHEBI'): db_refs['CHEBI'] = c name = chebi_client.get_chebi_name_from_id(c) else: name = uniprot_client.get_gene_name(c) if name is None: db_refs['SIGNOR'] = c else: db_refs['UP'] = c hgnc_id = uniprot_client.get_hgnc_id(c) if hgnc_id: name = hgnc_client.get_hgnc_name(hgnc_id) db_refs['HGNC'] = hgnc_id famplex_key = ('SIGNOR', c) if famplex_key in famplex_map: db_refs['FPLX'] = famplex_map[famplex_key] if not name: # Set agent name to Famplex name if # the Uniprot name is not available name = db_refs['FPLX'] elif not name: # We neither have a Uniprot nor Famplex grounding logger.info('Have neither a Uniprot nor Famplex grounding ' 'for "%s" in complex %s' % (c, complex_id)) if not name: # Set the agent name to the Signor name if neither the # Uniprot nor Famplex names are available name = db_refs['SIGNOR'] assert name is not None agents.append(Agent(name, db_refs=db_refs)) return agents
def update_hmdb_chebi_map():
    """Download the HMDB metabolites dump and write HMDB to ChEBI mappings.

    The HMDB metabolites ZIP file is downloaded into the resource folder
    and its XML content is parsed incrementally. For each metabolite, the
    first accession encountered is taken as the HMDB ID and paired with
    the metabolite's ChEBI ID. Pairs for which no ChEBI name can be
    looked up are reported and skipped. The remaining pairs are written,
    sorted by HMDB ID and preceded by a header row, to hmdb_to_chebi.tsv.
    """
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    urlretrieve(url, fname)

    # Namespace-qualified tags, built once rather than for every event
    metabolite_tag = '{%s}metabolite' % ns['hmdb']
    accession_tag = '{%s}accession' % ns['hmdb']
    chebi_id_tag = '{%s}chebi_id' % ns['hmdb']

    mappings = []
    hmdb_id = None
    chebi_id = None
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                if event == 'start':
                    # A new metabolite begins: reset the collected IDs
                    if elem.tag == metabolite_tag:
                        hmdb_id = None
                        chebi_id = None
                    continue
                # From here on we only deal with 'end' events. The text
                # of an element is only guaranteed to be available once
                # its 'end' event is emitted, so it must not be read on
                # 'start'.
                if elem.tag == accession_tag:
                    # Important: we only look at accession if there's no
                    # HMDB ID yet, otherwise we pick up secondary
                    # accession tags
                    if not hmdb_id:
                        hmdb_id = elem.text
                elif elem.tag == chebi_id_tag:
                    chebi_id = elem.text
                elif elem.tag == metabolite_tag:
                    if hmdb_id and chebi_id:
                        name = chebi_client.get_chebi_name_from_id(chebi_id)
                        if name:
                            mappings.append([hmdb_id, chebi_id])
                        else:
                            print('Likely invalid ChEBI mapping: ',
                                  hmdb_id, chebi_id)
                    # Always release the finished metabolite, including
                    # when its mapping was rejected, to keep memory flat
                    elem.clear()

    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + \
        sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If the name cannot be
        resolved in its namespace, the name is returned as it was given.
    db_refs : dict
        The grounding for the given entity, or None if no grounding
        could be obtained or the namespace is not handled.
    """
    # Stays None unless one of the branches below finds a grounding
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            # If a list of IDs comes back, we arbitrarily take the first
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        # Prefer the HGNC name if the UniProt ID maps to HGNC, otherwise
        # fall back on the UniProt gene name
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This label is replaced with another one before the ID lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, MGI/RGD names are only mapped to UniProt IDs through
    # lookup tables; db_refs stays None if the name is not in the table
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        # The Entrez ID is kept in db_refs whether or not it maps to HGNC
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            # Without an HGNC mapping, the name becomes the Entrez ID
            # prefixed with the letter E
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        # NOTE(review): the looked up name replaces the given name without
        # a check - presumably it can be None for unknown IDs; confirm
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM, TEXT: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
def test_chebi_id_to_name():
    """Check the name looked up for a ChEBI ID given with its prefix."""
    looked_up = chebi_client.get_chebi_name_from_id('CHEBI:63637')
    # The looked up value is the assertion message to ease debugging
    assert 'vemurafenib' == looked_up, looked_up
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to
        database identifiers. An empty dictionary is returned if the URN
        is None, and None is returned if the URN does not match the
        expected URN pattern.
    db_name : str
        The Famplex name, if available; otherwise the GO label if there
        is a GO grounding; otherwise the name found while grounding the
        URN (ChEBI, HGNC or MESH name), or None.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id
            # The HGNC name, if available, becomes the agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ('agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype'):
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is a URL-quoted MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # Collect the keys that may map to Famplex, in increasing order of
    # priority since later matches overwrite earlier ones: first any GO
    # or MESH grounding, ...
    famplex_keys = [(db, db_refs[db]) for db in ('GO', 'MESH')
                    if db in db_refs]
    # ... then the eccode if the urn corresponds to one, ...
    if urn.startswith('urn:agi-enz'):
        famplex_keys.append(('ECCODE', urn.split(':')[2]))
    # ... and finally the Medscan URN itself
    famplex_keys.append(('MEDSCAN', urn))
    for key in famplex_keys:
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
"""This script helps identify entries in PubChem.tsv that systematically
lead to incorrect groundings and should therefore be removed."""

import os
import re
from indra.databases import chebi_client


if __name__ == '__main__':
    # Locate the PubChem resource file relative to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    kb_dir = os.path.join(script_dir, os.pardir, 'src', 'main', 'resources',
                          'org', 'clulab', 'reach', 'kb')
    resource_fname = os.path.join(kb_dir, 'PubChem.tsv')

    # Entity texts consisting of exactly two capital letters
    two_capitals = re.compile(r'^[A-Z][A-Z]$')

    keep_rows = []
    with open(resource_fname, 'r') as fh:
        for row in fh:
            # Rows without a tab are dropped from the file
            if '\t' not in row:
                continue
            txt, pubchem_id = (part.strip() for part in row.split('\t'))
            if two_capitals.match(txt):
                chebi_id = chebi_client.get_chebi_id_from_pubchem(pubchem_id)
                name = chebi_client.get_chebi_name_from_id(chebi_id)
                # Drop the row if the ChEBI name is seven characters long
                # and contains a hyphen
                if name and '-' in name and len(name) == 7:
                    continue
            keep_rows.append(row)

    # Overwrite the resource file with only the rows that were kept
    with open(resource_fname, 'w') as fh:
        fh.writelines(keep_rows)
def test_chebi_id_to_name():
    """Check the offline name lookup for a ChEBI ID given without prefix."""
    looked_up = chebi_client.get_chebi_name_from_id('63637', offline=True)
    # The looked up value is the assertion message to ease debugging
    assert 'vemurafenib' == looked_up, looked_up
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict
        A dictionary with grounding information, mapping databases to
        database identifiers. An empty dictionary is returned if the URN
        is None, and None is returned if the URN does not match the
        expected URN pattern.
    db_name : str
        The Famplex name, if available; otherwise the GO label if there
        is a GO grounding; otherwise the name found while grounding the
        URN (ChEBI, HGNC or MESH name), or None.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            # The grounding is stored with the CHEBI: prefix while the
            # name lookup is given the ID as it was returned
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id
            # The HGNC name, if available, becomes the agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ('agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype'):
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is a URL-quoted MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # Collect the keys that may map to Famplex, in increasing order of
    # priority since later matches overwrite earlier ones: first any GO
    # or MESH grounding, ...
    famplex_keys = [(db, db_refs[db]) for db in ('GO', 'MESH')
                    if db in db_refs]
    # ... then the eccode if the urn corresponds to one, ...
    if urn.startswith('urn:agi-enz'):
        famplex_keys.append(('ECCODE', urn.split(':')[2]))
    # ... and finally the Medscan URN itself
    famplex_keys.append(('MEDSCAN', urn))
    for key in famplex_keys:
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name