def _get_agent_from_ref(self, ref):
    """Return an Agent constructed from a ``ref`` XML element.

    Parameters
    ----------
    ref : xml.etree.ElementTree.Element
        The element describing the entity. Its ``category`` attribute and
        its ``var`` children named ``name``, ``uid`` and ``raw-text`` are
        read.

    Returns
    -------
    Agent or None
        The Agent for the entity, or None if the element is a collection,
        has no ``name`` var, or no usable name could be determined.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        logger.warning('Skipping collection Agent.')
        return None

    # Without a name tag there is nothing to build an Agent from
    name_tag = ref.find("var/[@name='name']")
    if name_tag is None:
        return None
    name = name_tag.text

    uid_tag = ref.find("var/[@name='uid']")
    uid = uid_tag.text if uid_tag is not None else None

    db_refs = {}
    text_tag = ref.find("var/[@name='raw-text']")
    if text_tag is not None:
        db_refs['TEXT'] = text_tag.text

    # UIDs of the form UP:<mnemonic> are resolved to a UniProt ID, and the
    # corresponding gene name, when available, replaces the extracted name
    if uid is not None and uid.startswith('UP:'):
        up_mnemonic = uid[3:]
        up_id = uniprot_client.get_id_from_mnemonic(up_mnemonic)
        if up_id is not None:
            up_name = uniprot_client.get_gene_name(up_id)
            if up_name is not None:
                name = up_name
            db_refs['UP'] = up_id

    # An empty name tag leaves name as None; skip the entity instead of
    # relying on an assert, which is stripped under -O and crashes otherwise
    if not name:
        return None
    return Agent(name, db_refs=db_refs)
def _get_agent_from_ref(self, ref):
    """Build an Agent from the content of a ``ref`` XML element.

    Parameters
    ----------
    ref : xml.etree.ElementTree.Element
        Element whose ``category`` attribute and ``var`` children named
        ``name``, ``uid`` and ``raw-text`` describe the entity.

    Returns
    -------
    Agent or None
        None is returned for collections, for elements without a ``name``
        var, and when no non-empty name is available; otherwise an Agent
        carrying the collected db_refs.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        logger.warning('Skipping collection Agent.')
        return None

    name_tag = ref.find("var/[@name='name']")
    if name_tag is None:
        return None
    name = name_tag.text

    uid_tag = ref.find("var/[@name='uid']")
    uid = None if uid_tag is None else uid_tag.text

    db_refs = {}
    text_tag = ref.find("var/[@name='raw-text']")
    if text_tag is not None:
        db_refs['TEXT'] = text_tag.text

    # A UID like UP:<mnemonic> is mapped to a UniProt ID; if a gene name is
    # known for that ID it takes precedence over the extracted name
    if uid is not None and uid.startswith('UP:'):
        up_id = uniprot_client.get_id_from_mnemonic(uid[3:])
        if up_id is not None:
            up_name = uniprot_client.get_gene_name(up_id)
            if up_name is not None:
                name = up_name
            db_refs['UP'] = up_id

    # The name tag may exist but be empty, in which case name is None here.
    # Returning None is consistent with the other "no data" paths; an assert
    # would raise (or be skipped entirely under -O).
    if not name:
        return None
    return Agent(name, db_refs=db_refs)
def get_db_refs_by_name(ns, name, node_data):
    """Standardize a name and ground it, given the namespace it comes from.

    Parameters
    ----------
    ns : str
        The namespace in which `name` is to be interpreted.
    name : str
        The entity name as given in that namespace.
    node_data : dict
        Node data, used only in log messages.

    Returns
    -------
    name : str
        The standardized name of the entity; the input name is returned
        when no standard name was looked up.
    db_refs : dict or None
        The grounding of the entity, or None if none could be determined.
    """
    if ns == 'HGNC':
        # The name is assumed to be an HGNC symbol
        current_id = hgnc_client.get_current_hgnc_id(name)
        if not current_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(current_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, current_id[0]))
            current_id = current_id[0]
        name = hgnc_client.get_hgnc_name(current_id)
        grounding = {'HGNC': current_id}
        uniprot_id = _get_up_id(current_id)
        if uniprot_id:
            grounding['UP'] = uniprot_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(current_id)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns in ('UNIPROT', 'UP'):
        # If a mnemonic can be found for the name, it is taken to be a
        # UniProt ID; otherwise the name is tried as a mnemonic itself
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            uniprot_id = name
        else:
            uniprot_id = uniprot_client.get_id_from_mnemonic(name) or None
        if not uniprot_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        grounding = {'UP': uniprot_id}
        linked_hgnc = uniprot_client.get_hgnc_id(uniprot_id)
        if linked_hgnc:
            grounding['HGNC'] = linked_hgnc
            name = hgnc_client.get_hgnc_name(linked_hgnc)
        else:
            name = uniprot_client.get_gene_name(uniprot_id)
        return name, grounding

    if ns == 'FPLX':
        return name, {'FPLX': name}

    if ns in ('GO', 'GOBP', 'GOCC'):
        # This one label is replaced before the GO lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        return go_client.get_go_label(go_id), {'GO': go_id}

    if ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        return mesh_name, {'MESH': mesh_id}

    # MGI and RGD names are grounded through lookup tables to UniProt IDs;
    # names missing from the table are left without grounding
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    if ns == 'MGI':
        mouse_up_id = mouse_lookup.get(name)
        return name, ({'UP': mouse_up_id} if mouse_up_id else None)

    if ns == 'RGD':
        rat_up_id = rat_lookup.get(name)
        return name, ({'UP': rat_up_id} if rat_up_id else None)

    # Selventa families and complexes are mapped to FamPlex where possible
    if ns == 'SFAM':
        grounding = {'SFAM': name}
        famplex_name = bel_to_indra.get(name)
        if famplex_name is not None:
            grounding['FPLX'] = famplex_name
            return famplex_name, grounding
        logger.info('Could not find mapping for BEL/SFAM family: '
                    '%s (%s)' % (name, node_data))
        return name, grounding

    if ns == 'SCOMP':
        grounding = {'SCOMP': name}
        famplex_name = bel_to_indra.get(name)
        if famplex_name is not None:
            grounding['FPLX'] = famplex_name
            return famplex_name, grounding
        logger.info('Could not find mapping for BEL/SCOMP complex: '
                    '%s (%s)' % (name, node_data))
        return name, grounding

    # Entrez gene IDs are mapped to HGNC, and from there to UP and miRBase
    if ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        entrez_hgnc = hgnc_client.get_hgnc_from_entrez(name)
        grounding = {'EGID': name}
        if entrez_hgnc is None:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            return 'E%s' % name, grounding
        grounding['HGNC'] = entrez_hgnc
        name = hgnc_client.get_hgnc_name(entrez_hgnc)
        uniprot_id = hgnc_client.get_uniprot_id(entrez_hgnc)
        if uniprot_id:
            grounding['UP'] = uniprot_id
        else:
            logger.info(
                'HGNC entity %s with HGNC ID %s has no '
                'corresponding Uniprot ID.', name, entrez_hgnc)
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(entrez_hgnc)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        grounding = {'MIRBASE': mirbase_id}
        linked_hgnc = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if linked_hgnc:
            grounding['HGNC'] = linked_hgnc
            name = hgnc_client.get_hgnc_name(linked_hgnc)
        return name, grounding

    if ns == 'CHEBI':
        # BEL's own ChEBI name-to-ID map is consulted first, with INDRA's
        # ChEBI name-to-ID mapping as the fallback
        chebi_id = (chebi_name_id.get(name)
                    or chebi_client.get_chebi_id_from_name(name))
        if chebi_id:
            return name, {'CHEBI': chebi_id}
        logger.info('CHEBI name %s not found in map.' % name)
        return name, None

    # In this namespace the name slot actually holds an ID
    if ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        return (chebi_client.get_chebi_name_from_id(chebi_id),
                {'CHEBI': chebi_id})

    # For these namespaces the name itself serves as the ID
    if ns in ('SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    if ns == 'TAX':
        taxonomy_id = taxonomy_client.get_taxonomy_id(name)
        if taxonomy_id:
            return name, {'TAXONOMY': taxonomy_id}
        logger.info('Could not get taxonomy ID for %s' % name)
        return name, None

    logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, None
def _get_agent(self, participant):
    """Return an Agent built from a participant dict.

    Parameters
    ----------
    participant : dict
        The participant entry. The keys read are ``identifier``,
        ``entity_text`` (a sequence whose first element is used),
        ``entity_type``, ``features`` and ``not_features``.

    Returns
    -------
    Agent or None
        The Agent with its grounding and conditions set, or None if the
        participant is generic without text or has an unhandled entity
        type.
    """
    dbid = participant.get('identifier')
    text = participant.get('entity_text')[0]

    if dbid == 'GENERIC':
        if not text:
            return None
        return Agent(text)

    db_refs = {}
    entity_type = participant.get('entity_type')
    if entity_type in ['protein', 'chemical', 'gene']:
        # TODO: standardize name here
        name = text
        db_refs['TEXT'] = text
        if dbid:
            # partition tolerates identifiers with no colon or with more
            # than one colon, which a plain two-way unpacking of split(':')
            # would reject with a ValueError
            db_name, _, db_id = dbid.partition(':')
            db_name = db_name.lower()
            if db_name == 'uniprot':
                uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                # Only record groundings that were actually resolved
                if uniprot_id:
                    db_refs['UP'] = uniprot_id
            elif db_name == 'pubchem':
                chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                if chebi_id:
                    db_refs['CHEBI'] = chebi_id
            elif db_name == 'hgnc':
                if db_id:
                    db_refs['HGNC'] = db_id
    elif entity_type == 'protein_family':
        name = text
    else:
        # TODO: handle other participant types
        return None

    agent = Agent(name, db_refs=db_refs)

    # Features that hold for the participant
    features = participant.get('features')
    if features:
        for feature in features:
            feature_type = feature.get('feature_type')
            if feature_type == 'modification_feature':
                mc = self._get_mod_condition(feature)
                agent.mods.append(mc)
            elif feature_type == 'binding_feature':
                bc = self._get_bound_condition(feature)
                agent.bound_conditions.append(bc)
            elif feature_type == 'mutation_feature':
                mc = self._get_mut_condition(feature)
                agent.mutations.append(mc)
            elif feature_type == 'location_feature':
                agent.location = feature.get('location')

    # Negated features are read from their own 'not_features' key; reading
    # 'features' here would re-add every positive condition as negated
    not_features = participant.get('not_features')
    if not_features:
        for feature in not_features:
            feature_type = feature.get('feature_type')
            if feature_type == 'modification_feature':
                mc = self._get_mod_condition(feature)
                mc.is_modified = False
                agent.mods.append(mc)
            elif feature_type == 'binding_feature':
                bc = self._get_bound_condition(feature)
                bc.is_bound = False
                agent.bound_conditions.append(bc)
    return agent
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. Only some namespaces
        (SFAM, Entrez) change the name; otherwise the input is returned.
    db_refs : dict or None
        The grounding for the given entity, or None if no grounding could
        be determined. A 2-tuple is returned on every path.
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # If a gene name is found for the name, it is taken to be a UP ID;
        # otherwise the name is tried as a mnemonic
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        # get_mesh_id_name returns an (ID, name) pair; only the ID is used
        # for grounding. Treating the pair itself as the ID would make the
        # "not found" check below ineffective, since a 2-tuple is truthy.
        mesh_id, _ = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            # Return a (name, db_refs) pair like every other failure path;
            # a bare return would hand callers None to unpack
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        # The chebi_name_id map is consulted first, then the ChEBI client
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
def _get_agent(self, participant):
    """Construct an Agent from a participant dict.

    Parameters
    ----------
    participant : dict
        Participant entry from which ``identifier``, ``entity_text`` (first
        element only), ``entity_type``, ``features`` and ``not_features``
        are read.

    Returns
    -------
    Agent or None
        None for a generic participant without text and for entity types
        that are not handled; otherwise the Agent with grounding,
        modification, binding, mutation and location information attached.
    """
    dbid = participant.get('identifier')
    text = participant.get('entity_text')[0]

    if dbid == 'GENERIC':
        return Agent(text) if text else None

    db_refs = {}
    entity_type = participant.get('entity_type')
    if entity_type in ['protein', 'chemical', 'gene']:
        # TODO: standardize name here
        name = text
        db_refs['TEXT'] = text
        if dbid:
            # Split on the first colon only, so an identifier with zero or
            # several colons does not raise on unpacking
            db_name, _, db_id = dbid.partition(':')
            db_name = db_name.lower()
            if db_name == 'uniprot':
                uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                # Failed lookups are not stored as groundings
                if uniprot_id:
                    db_refs['UP'] = uniprot_id
            elif db_name == 'pubchem':
                chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                if chebi_id:
                    db_refs['CHEBI'] = chebi_id
            elif db_name == 'hgnc':
                if db_id:
                    db_refs['HGNC'] = db_id
    elif entity_type == 'protein_family':
        name = text
    else:
        # TODO: handle other participant types
        return None

    agent = Agent(name, db_refs=db_refs)

    # Positive features
    for feature in participant.get('features') or []:
        feature_type = feature.get('feature_type')
        if feature_type == 'modification_feature':
            agent.mods.append(self._get_mod_condition(feature))
        elif feature_type == 'binding_feature':
            agent.bound_conditions.append(self._get_bound_condition(feature))
        elif feature_type == 'mutation_feature':
            agent.mutations.append(self._get_mut_condition(feature))
        elif feature_type == 'location_feature':
            agent.location = feature.get('location')

    # Negated features live under 'not_features'. Reading 'features' again
    # here would duplicate each positive condition with its flag flipped.
    for feature in participant.get('not_features') or []:
        feature_type = feature.get('feature_type')
        if feature_type == 'modification_feature':
            mc = self._get_mod_condition(feature)
            mc.is_modified = False
            agent.mods.append(mc)
        elif feature_type == 'binding_feature':
            bc = self._get_bound_condition(feature)
            bc.is_bound = False
            agent.bound_conditions.append(bc)
    return agent
def _get_agent_from_ref(self, ref):
    """Return an Agent constructed from a ``ref`` XML element.

    Parameters
    ----------
    ref : xml.etree.ElementTree.Element
        The element describing the entity. Its ``category`` attribute and
        its ``var`` children named ``uid`` and ``raw-text`` are read.

    Returns
    -------
    Agent or None
        The Agent named after the grounded entity (or after the raw text
        if no better name is found), or None if the element is a
        collection or no name could be determined at all.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        return None

    # Get the text content of the uid and raw-text tags if available
    uid_tag = ref.find("var/[@name='uid']")
    text_tag = ref.find("var/[@name='raw-text']")
    uid = uid_tag.text if uid_tag is not None and uid_tag.text else None
    raw_text = (text_tag.text
                if text_tag is not None and text_tag.text else None)

    # TODO: factor this out and reuse fix_agents
    db_refs = {}
    # Save raw text if available
    if raw_text:
        db_refs['TEXT'] = raw_text
    # The raw text is the default name. Binding agent_name unconditionally
    # matters: without raw text and without a name from the UID, the check
    # at the end would otherwise hit an unbound local.
    agent_name = raw_text

    # If we have a proper UID then we try to reconstruct an Agent from that
    uid_parts = uid.split(':') if uid is not None else []
    if len(uid_parts) == 2:
        db_ns, db_id = uid_parts
        be_id = famplex_map.get((db_ns, db_id))
        if be_id:
            db_refs[db_ns] = db_id
            db_refs['FPLX'] = be_id
            agent_name = be_id
        elif db_ns in ['UP', 'Uniprot']:
            # The ID part may be a mnemonic; use the mapped ID if there is one
            id_from_mnemonic = uniprot_client.get_id_from_mnemonic(db_id)
            if id_from_mnemonic:
                db_id = id_from_mnemonic
            db_refs['UP'] = db_id
            hgnc_id = uniprot_client.get_hgnc_id(db_id)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
                agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
        elif db_ns == 'NCIT':
            db_refs['NCIT'] = db_id
            target = ncit_map.get(db_id)
            if target:
                db_refs[target[0]] = target[1]
                if target[0] == 'HGNC':
                    up_id = hgnc_client.get_uniprot_id(target[1])
                    agent_name = hgnc_client.get_hgnc_name(target[1])
                    if up_id:
                        db_refs['UP'] = up_id
                elif target[0] == 'UP':
                    agent_name = uniprot_client.get_gene_name(target[1])
                    if agent_name:
                        hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                        if hgnc_id:
                            db_refs['HGNC'] = hgnc_id
        elif db_ns == 'FA':
            db_refs['NXP'] = 'FA:' + db_id
        elif db_ns == 'XFAM':
            db_refs['PF'] = db_id.split('.')[0]
        elif db_ns == 'CHEBI':
            db_refs['CHEBI'] = 'CHEBI:' + db_id
        elif db_ns in ['GO', 'MESH', 'FPLX']:
            db_refs[db_ns] = db_id
        # Handle old BE mappings and add them as FPLX
        elif db_ns == 'BE':
            db_refs['FPLX'] = db_id
        elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
            db_refs[db_ns] = db_id
        else:
            logger.warning('Unknown database name space %s' % db_ns)

    # A lookup above may have replaced the name with an empty value; fall
    # back to the raw text, and give up if that is missing as well
    if not agent_name:
        if raw_text is None:
            return None
        agent_name = raw_text

    return Agent(agent_name, db_refs=db_refs)