def test_standardize_up_isoform():
    """UniProt isoform IDs should standardize like their base protein ID."""
    expected = {'UP': 'Q99490', 'HGNC': '16921', 'EGID': '116986',
                'MESH': 'C485997'}
    mapped = standardize_db_refs({'UP': 'Q99490'})
    assert mapped == expected, mapped
    mapped = standardize_db_refs({'UP': 'Q99490-123'})
    assert mapped == dict(expected, UP='Q99490-123'), mapped
def test_standardize_hgnc_fplx_mesh_bug():
    """Specific-gene and family groundings must not cross-contaminate."""
    gene_refs = standardize_db_refs({'HGNC': '1514'})
    assert gene_refs['UP'] == 'P41180'
    assert 'FPLX' not in gene_refs
    family_refs = standardize_db_refs({'FPLX': 'Calcium_sensing_receptors'})
    assert family_refs['HGNC_GROUP'] == '279'
    assert 'HGNC' not in family_refs
def get_chemical_agent(name, mesh_id, cas_id):
    """Return a grounded chemical Agent from a MeSH ID and optional CAS ID."""
    grounding = {'MESH': mesh_id}
    if cas_id:
        grounding['CAS'] = cas_id
    grounding = standardize_db_refs(grounding)
    assert_valid_db_refs(grounding)
    return Agent(name, db_refs=grounding)
def test_obo_replacements():
    """Obsolete GO terms should be replaced by their primary term."""
    obsolete = ('GO', 'GO:0036442')
    assert bio_ontology.get_node_property(*obsolete, 'obsolete') is True
    assert bio_ontology.get_replacement(*obsolete) == ('GO', 'GO:0008553')
    assert standardize_db_refs({'GO': 'GO:0036442'}).get('GO') == 'GO:0008553'
def test_uniprot_replacements():
    """Obsolete UniProt entries should be replaced by their current entry."""
    obsolete = ('UP', 'A0A059MHB0')
    assert bio_ontology.get_node_property(*obsolete, 'obsolete') is True
    assert bio_ontology.get_replacement(*obsolete) == ('UP', 'C7U1M6')
    assert standardize_db_refs({'UP': 'A0A059MHB0'}).get('UP') == 'C7U1M6'
def parse_context_entry(entry, grounder, sentence=None):
    """Return a dict of context type and object processed from an entry."""
    parsed = re.match(r'(.*): (.*)', entry)
    if parsed is None:
        return None
    ctx_type, ctx_text = parsed.groups()
    if ctx_type not in allowed_contexts:
        logger.warning('Unknown context type %s' % ctx_type)
        return None
    terms = grounder(ctx_text, context=sentence)
    if terms:
        # Ground with the top-scoring term and fetch its standard name
        top = terms[0].term
        refs = standardize_db_refs({top.db: top.id})
        grounded_name = bio_ontology.get_name(top.db, top.id)
    else:
        logger.warning('Could not ground %s context: %s'
                       % (ctx_type, ctx_text))
        refs = {}
        grounded_name = None
    refs['TEXT'] = ctx_text
    ref_context = RefContext(name=grounded_name or ctx_text, db_refs=refs)
    return {allowed_contexts[ctx_type]: ref_context}
def test_mesh_replacements():
    """Superseded MeSH supplementary records should map to their new ID."""
    assert bio_ontology.get_name('MESH', 'D000086382') == 'COVID-19'
    assert bio_ontology.isrel('MESH', 'C000657245', 'MESH', 'D000086382',
                              {'replaced_by'})
    assert bio_ontology.get_replacement('MESH', 'C000657245') == \
        ('MESH', 'D000086382')
    assert standardize_db_refs({'MESH': 'C000657245'}).get('MESH') == \
        'D000086382'
def test_standardize_db_refs_efo_hp_doid():
    """Check EFO/HP/DOID <-> MeSH mappings during standardization."""
    # EFO <-> MESH round trip
    assert standardize_db_refs({'EFO': '0009502'}).get('MESH') == 'D000007'
    assert standardize_db_refs({'MESH': 'D000007'}).get('EFO') == '0009502'
    # HP <-> MESH round trip
    assert standardize_db_refs({'HP': 'HP:0031801'}).get('MESH') == 'D064706'
    assert standardize_db_refs({'MESH': 'D064706'}).get('HP') == 'HP:0031801'
    # Currently there is no one-to-many mapping in the direction towards MeSH
    # (there used to be) if there is again, we should test it here
    #refs = standardize_db_refs({'DOID': 'DOID:0060695'})
    #assert 'MESH' not in refs
    # One-to-many mappings away from MESH
    assert 'DOID' not in standardize_db_refs({'MESH': 'D000071017'})
    assert standardize_db_refs({'DOID': 'DOID:0060495'}).get('MESH') == \
        'D000067208'
    # This is an xrefs-based mapping that isn't in Gilda's resource file
    assert standardize_db_refs({'EFO': '0000694'}).get('MESH') == 'D045169'
def indra_db_refs_from_minerva_refs(refs):
    """Return a standardized INDRA db_refs dict from Minerva references.

    Parameters
    ----------
    refs : iterable of tuple
        An iterable of (namespace, id) pairs in Minerva conventions.

    Returns
    -------
    dict
        A standardized INDRA-style db_refs dict.
    """
    db_refs = {}
    for db_ns, db_id in refs:
        # Map Minerva namespaces to INDRA ones where a mapping exists
        db_ns = minerva_to_indra_map[db_ns] \
            if db_ns in minerva_to_indra_map else db_ns
        # Bug fix: the fixed namespace was previously assigned to a
        # misspelled variable (db_nbs) and then silently discarded, so the
        # dict was keyed on the unfixed namespace.
        db_ns, db_id = fix_id_standards(db_ns, db_id)
        db_refs[db_ns] = db_id
    db_refs = standardize_db_refs(db_refs)
    return db_refs
def test_name_standardize_mesh_other_db():
    """MeSH groundings should standardize names via mapped namespaces."""
    asbestos = Agent('x', db_refs={'MESH': 'D001194'})
    standardize_agent_name(asbestos, True)
    assert asbestos.db_refs['CHEBI'] == 'CHEBI:46661'
    assert asbestos.name == 'asbestos', asbestos.name
    refs = standardize_db_refs({'MESH': 'D000067777'})
    assert refs.get('HGNC') == '3313', refs
    assert refs.get('UP') == 'Q12926', refs
    gene = Agent('x', db_refs=refs)
    standardize_agent_name(gene)
    assert gene.name == 'ELAVL2'
def align_identifiers_urls(indra_groundings, dm_urls):
    """Match identifiers.org URLs against a set of INDRA groundings.

    Parameters
    ----------
    indra_groundings : collection of tuple
        A collection of (db_ns, db_id) tuples to match against.
    dm_urls : iterable of str
        identifiers.org URLs to align.

    Returns
    -------
    list of tuple
        A list of (dm_url, matched identifiers URL or None) pairs; literature
        references (doi/pubmed) are skipped entirely.
    """
    matches = []
    identifiers_prefix = 'https://identifiers.org/'
    for dm_url in dm_urls:
        # We do it this way instead of splitting because of DOIs which have
        # extra slashes
        entity = dm_url[len(identifiers_prefix):]
        db_ns, db_id = entity.split(':', maxsplit=1)
        if db_ns == 'CHEBI':
            db_refs = [
                standardize_db_refs({'CHEBI': '%s:%s' % (db_ns, db_id)})
            ]
        elif db_ns == 'hgnc':
            db_refs = [standardize_db_refs({'HGNC': db_id})]
        elif db_ns == 'hgnc.symbol':
            hgnc_id = hgnc_client.get_current_hgnc_id(db_id)
            db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
        elif db_ns == 'pubchem.compound':
            db_refs = [standardize_db_refs({'PUBCHEM': db_id})]
        elif db_ns == 'uniprot':
            db_refs = [standardize_db_refs({'UP': db_id})]
        elif db_ns == 'bigg.metabolite':
            chebi_ids = bigg_to_chebi.get(db_id)
            if chebi_ids:
                db_refs = [
                    standardize_db_refs({'CHEBI': chebi_id})
                    for chebi_id in chebi_ids
                ]
            else:
                db_refs = [{}]
        elif db_ns == 'ncbigene':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(db_id)
            if hgnc_id:
                db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
            else:
                db_refs = [{}]
        # Skip literature references that aren't entities
        elif db_ns in {'doi', 'pubmed'}:
            continue
        else:
            print('Unhandled namespace %s' % db_ns)
            # Bug fix: this was previously `{}` (a bare dict), inconsistent
            # with the list-of-dicts type every other branch produces; a
            # non-empty dict here would have crashed on `.items()` below.
            db_refs = [{}]
        matched = None
        for db_ref in db_refs:
            for k, v in db_ref.items():
                if (k, v) in indra_groundings:
                    matched = (k, v)
                    break
            # Bug fix: previously only the inner loop broke, so a later
            # candidate could overwrite an already-found match.
            if matched:
                break
        matches.append(
            (dm_url, get_identifiers_url(*matched) if matched else None))
    return matches
def indra_db_refs_from_minerva_refs(refs):
    """Return a standardized INDRA db_refs dict from Minerva references."""
    db_refs = {}
    for ns, id_ in refs:
        ns = minerva_to_indra_map.get(ns, ns)
        ns, id_ = fix_id_standards(ns, id_)
        db_refs[ns] = id_
    # We need some special handling here for issues in the curated maps
    # If we have a specific gene grounding, remove ECCODE grounding since
    # it can incorrectly result in a family interpretation
    if 'HGNC' in db_refs:
        db_refs.pop('ECCODE', None)
    return standardize_db_refs(db_refs)
def standardize_db_refs(db_refs):
    """Return a standardized db refs dict for a given db refs dict.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.

    Returns
    -------
    dict
        The db_refs dict with standardized entries.
    """
    # Bug fix: this wrapper previously called itself, guaranteeing infinite
    # recursion (RecursionError) on any input. It should delegate to the
    # module's default ontology mapper instance.
    # NOTE(review): `default_mapper` is assumed to be the module-level
    # mapper instance defined elsewhere in this file — confirm its name.
    return default_mapper.standardize_db_refs(db_refs)
def _get_db_refs(entity_term):
    """Build a standardized db_refs dict from an entity term's xrefs."""
    # Namespaces that map one-to-one onto an INDRA namespace
    direct_ns = {'uniprot': 'UP', 'hgnc': 'HGNC', 'chebi': 'CHEBI',
                 'pubchem': 'PUBCHEM', 'mesh': 'MESH', 'hmdb': 'HMDB'}
    db_refs = {}
    for xr in entity_term['xrefs']:
        ns = xr['namespace']
        xr_id = xr['id']
        if ns in direct_ns:
            db_refs[direct_ns[ns]] = xr_id
        elif ns == 'pfam':
            mapped = famplex_map.get(('PF', xr_id))
            if mapped:
                db_refs['FPLX'] = mapped
            db_refs['PF'] = xr_id
        elif ns == 'interpro':
            mapped = famplex_map.get(('IP', xr_id))
            if mapped:
                db_refs['FPLX'] = mapped
            db_refs['IP'] = xr_id
        elif ns == 'go':
            # Handle secondary to primary mapping if necessary
            primary = go_client.get_primary_id(xr_id)
            db_refs['GO'] = primary if primary else xr_id
        elif ns == 'simple_chemical':
            if xr_id.startswith('HMDB'):
                db_refs['HMDB'] = xr_id
        # We handle "be" here for compatibility with older versions
        elif ns in ('fplx', 'be'):
            db_refs['FPLX'] = xr_id
        # These name spaces are ignored
        elif ns == 'uaz':
            pass
        else:
            logger.warning('Unhandled xref namespace: %s' % ns)
    db_refs['TEXT'] = entity_term['text']
    return standardize_db_refs(db_refs)
def _add_node(self, agent, uuid=None):
    """Add (or reuse) a node for an agent and return its node id."""
    node_key = agent.name
    existing_id = self._existing_nodes.get(node_key)
    # If the node already exists we only need to record the new uuid on it
    if existing_id is not None:
        existing = [x for x in self._nodes
                    if x['data']['id'] == existing_id][0]
        uuids = existing['data']['uuid_list']
        if uuid not in uuids:
            uuids.append(uuid)
        return existing_id
    db_refs = _get_db_refs(agent)
    node_id = self._get_new_id()
    self._existing_nodes[node_key] = node_id
    node_name = agent.name.replace('_', ' ')
    # Family nodes get expanded into their HGNC member genes
    if 'FPLX' in db_refs:
        expanded_families = bio_ontology.get_children(
            *agent.get_grounding(), ns_filter={'HGNC'})
    else:
        expanded_families = []
    members = {}
    for member in expanded_families:
        member_refs = standardize_db_refs({member[0]: member[1]})
        gene_name = bio_ontology.get_name(*member)
        member_urls = {}
        for ref_ns, ref_id in member_refs.items():
            url = get_identifiers_url(ref_ns, ref_id)
            if url:
                member_urls[ref_ns] = url
        members[gene_name] = {'db_refs': member_urls}
    self._nodes.append({
        'data': {
            'id': node_id,
            'name': node_name,
            'db_refs': db_refs,
            'parent': '',
            'members': members,
            'uuid_list': [uuid],
        }
    })
    return node_id
def _get_db_refs(entity_term, organism_priority=None):
    """Build a standardized db_refs dict from an entity term's xrefs,
    optionally applying organism prioritization to UniProt groundings."""
    # Namespaces that map one-to-one onto an INDRA namespace
    direct_ns = {'hgnc': 'HGNC', 'chebi': 'CHEBI', 'pubchem': 'PUBCHEM',
                 'mesh': 'MESH', 'hmdb': 'HMDB', 'proonto': 'PR'}
    db_refs = {}
    for xr in entity_term['xrefs']:
        ns = xr['namespace']
        xr_id = xr['id']
        if ns == 'uniprot':
            # Note: we add both full protein and protein chain IDs here so
            # that we can apply organism prioritization in a uniform way.
            # Later these will be separated out.
            db_refs['UP'] = xr_id
        elif ns in direct_ns:
            db_refs[direct_ns[ns]] = xr_id
        elif ns == 'pfam':
            mapped = famplex_map.get(('PF', xr_id))
            if mapped:
                db_refs['FPLX'] = mapped
            db_refs['PF'] = xr_id
        elif ns == 'interpro':
            mapped = famplex_map.get(('IP', xr_id))
            if mapped:
                db_refs['FPLX'] = mapped
            db_refs['IP'] = xr_id
        elif ns == 'go':
            # Handle secondary to primary mapping if necessary
            primary = go_client.get_primary_id(xr_id)
            db_refs['GO'] = primary if primary else xr_id
        elif ns == 'simple_chemical':
            if xr_id.startswith('HMDB'):
                db_refs['HMDB'] = xr_id
        # We handle "be" here for compatibility with older versions
        elif ns in ('fplx', 'be'):
            db_refs['FPLX'] = xr_id
        # These name spaces are ignored
        elif ns == 'uaz':
            pass
        else:
            logger.warning('Unhandled xref namespace: %s' % ns)
    db_refs['TEXT'] = entity_term['text']
    if db_refs.get('UP'):
        # If we have a UniProt grounding and a non-default organism priority
        # list, we call the prioritization function
        if organism_priority:
            # These are all the unique groundings in the alt-xrefs list,
            # which redundantly lists the same match multiple times because
            # it enumerates multiple synonyms for organisms redundantly
            unique_altxrefs = {
                (axr['namespace'], axr['id'])
                for axr in entity_term.get('alt-xrefs', [])
            }
            # This returns a single prioritized UniProt ID or None
            prioritized_id = prioritize_organism_grounding(
                db_refs['UP'], unique_altxrefs, organism_priority)
            # If we got an ID, we set the UP grounding to that, otherwise
            # we keep what we already got from the primary xref
            if prioritized_id:
                db_refs['UP'] = prioritized_id
        # Protein chain groundings (ID#chain) are separated out: pop the UP
        # key and put the chain part into the UPPRO namespace instead.
        if '#' in db_refs['UP']:
            full_id = db_refs.pop('UP', None)
            db_refs['UPPRO'] = full_id.split('#')[1]
    return standardize_db_refs(db_refs)
def test_pubchem_mesh():
    """PubChem CIDs should map to the corresponding MeSH record."""
    mapped = standardize_db_refs({'PUBCHEM': '56649450'})
    assert mapped.get('MESH') == 'C585539'
def test_standardize_up_isoform():
    """UniProt isoform IDs should standardize like their base protein ID."""
    base = standardize_db_refs({'UP': 'Q99490'})
    assert base == {'UP': 'Q99490', 'HGNC': '16921'}
    isoform = standardize_db_refs({'UP': 'Q99490-123'})
    assert isoform == {'UP': 'Q99490-123', 'HGNC': '16921'}
def test_standardize_chembl():
    """DrugBank groundings should pick up the mapped ChEMBL ID."""
    mapped = standardize_db_refs({'DRUGBANK': 'DB00305'})
    assert 'CHEMBL' in mapped, mapped
    assert mapped['CHEMBL'] == 'CHEMBL105', mapped