def _make_db_refs(self, entrez_id, text_id):
    """Build the grounding for a gene and look up its HGNC symbol.

    Parameters
    ----------
    entrez_id : str
        Entrez gene ID.
    text_id : str or None
        A plain text systematic name, or None if not listed in the
        Biogrid data. The placeholder '-' is treated as missing.

    Returns
    -------
    hgnc_name : str
        The value returned by the HGNC name lookup for the gene.
    db_refs : dict
        db_refs grounding dictionary, used when constructing the Agent
        object. May contain the keys 'TEXT', 'HGNC' and 'UP'.
    """
    grounding = {}
    if text_id is not None and text_id != '-':
        grounding['TEXT'] = text_id

    gene_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
    # The symbol lookup is performed even if no HGNC ID was found
    symbol = hgnc_client.get_hgnc_name(gene_id)
    if gene_id is None:
        return (symbol, grounding)

    grounding['HGNC'] = gene_id
    uniprot_id = hgnc_client.get_uniprot_id(gene_id)
    if uniprot_id is not None:
        grounding['UP'] = uniprot_id
    return (symbol, grounding)
def read_variant_pairs(self):
    """Group the patient's protein changes by gene.

    Returns
    -------
    list of tuple
        One (gene name, list of protein changes) pair per gene, where the
        gene name is looked up from each mutation's 'entrezGeneId' and
        the changes are the corresponding 'proteinChange' values.
    """
    changes_by_gene = defaultdict(list)
    for entry in self.patient.mutations:
        entrez_id = str(entry['entrezGeneId'])
        symbol = get_hgnc_name(get_hgnc_from_entrez(entrez_id))
        changes_by_gene[symbol].append(entry['proteinChange'])
    return list(changes_by_gene.items())
def get_agent(raw_name, entrez_id):
    """Return an Agent grounded to the given raw name and Entrez ID.

    The Agent carries 'TEXT' and 'EGID' groundings, plus 'HGNC' when the
    Entrez ID maps to an HGNC ID. Its name and references are then passed
    through standardize_agent_name before the Agent is returned.
    """
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    grounding = {'TEXT': raw_name, 'EGID': entrez_id}
    gene_id = hgc.get_hgnc_from_entrez(entrez_id)
    if gene_id:
        grounding['HGNC'] = gene_id
    result = Agent(raw_name, db_refs=grounding)
    standardize_agent_name(result, standardize_refs=True)
    return result
def _extract_protein(self, name, gene_id):
    """Return an Agent for a protein given its name and Entrez gene ID.

    The Entrez ID (and the HGNC ID it maps to, if any) is passed through
    standardize_name_db_refs; a non-empty standard name replaces the
    given name.
    """
    raw_refs = {'EGID': gene_id}
    gene_hgnc = hgnc_client.get_hgnc_from_entrez(gene_id)
    if gene_hgnc is not None:
        raw_refs['HGNC'] = gene_hgnc
    standard_name, db_refs = standardize_name_db_refs(raw_refs)
    agent_name = standard_name if standard_name else name
    return Agent(agent_name, db_refs=db_refs)
def get_agent(raw_name, entrez_id):
    """Return an Agent for a gene given its raw name and Entrez ID.

    Parameters
    ----------
    raw_name : str
        The name of the gene as it appeared in the source; always stored
        as the 'TEXT' grounding.
    entrez_id : str
        The Entrez gene ID used to look up an HGNC ID.

    Returns
    -------
    Agent
        An Agent named with the HGNC symbol if one could be found, and
        with the raw name otherwise. A 'UP' grounding is added only when
        a UniProt ID is actually available.
    """
    db_refs = {'TEXT': raw_name}
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    hgnc_id = hgc.get_hgnc_from_entrez(entrez_id)
    name = raw_name
    if hgnc_id is not None:
        up_id = hgc.get_uniprot_id(hgnc_id)
        # Don't store a missing UniProt ID as a None-valued grounding
        if up_id:
            db_refs['UP'] = up_id
        # Keep the raw name if no symbol is available for the HGNC ID
        name = hgc.get_hgnc_name(hgnc_id) or raw_name
    return Agent(name, db_refs=db_refs)
def _extract_protein(self, name, gene_id):
    """Return an Agent for a protein given its name and Entrez gene ID.

    When the Entrez ID maps to an HGNC ID, the Agent is grounded to HGNC
    (and UniProt, if available) and named by the HGNC name lookup;
    otherwise the given name is used with only the Entrez grounding.
    """
    grounding = {'EGID': gene_id}
    hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
    if hgnc_id is None:
        # No HGNC mapping: keep the name we were given
        return Agent(name, db_refs=grounding)

    grounding['HGNC'] = hgnc_id
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        grounding['UP'] = uniprot_id
    # If there is a HGNC ID, we standardize the gene name
    return Agent(hgnc_client.get_hgnc_name(hgnc_id), db_refs=grounding)
def align_identifiers_urls(indra_groundings, dm_urls):
    """Match identifiers.org URLs against a collection of INDRA groundings.

    Parameters
    ----------
    indra_groundings : collection of tuple
        (namespace, ID) pairs; only membership tests are performed on it.
    dm_urls : iterable of str
        URLs of the form 'https://identifiers.org/<namespace>:<id>'.

    Returns
    -------
    list of tuple
        One (dm_url, matched_url) pair per URL that is not a literature
        reference ('doi' / 'pubmed' URLs are skipped). matched_url is the
        result of get_identifiers_url for the first candidate grounding
        found in indra_groundings, or None if there is no match.
    """
    matches = []
    identifiers_prefix = 'https://identifiers.org/'
    for dm_url in dm_urls:
        # We do it this way instead of splitting because of DOIs which have
        # extra slashes
        entity = dm_url[len(identifiers_prefix):]
        db_ns, db_id = entity.split(':', maxsplit=1)
        if db_ns == 'CHEBI':
            db_refs = [
                standardize_db_refs({'CHEBI': '%s:%s' % (db_ns, db_id)})
            ]
        elif db_ns == 'hgnc':
            db_refs = [standardize_db_refs({'HGNC': db_id})]
        elif db_ns == 'hgnc.symbol':
            hgnc_id = hgnc_client.get_current_hgnc_id(db_id)
            db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
        elif db_ns == 'pubchem.compound':
            db_refs = [standardize_db_refs({'PUBCHEM': db_id})]
        elif db_ns == 'uniprot':
            db_refs = [standardize_db_refs({'UP': db_id})]
        elif db_ns == 'bigg.metabolite':
            chebi_ids = bigg_to_chebi.get(db_id)
            if chebi_ids:
                db_refs = [
                    standardize_db_refs({'CHEBI': chebi_id})
                    for chebi_id in chebi_ids
                ]
            else:
                db_refs = [{}]
        elif db_ns == 'ncbigene':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(db_id)
            if hgnc_id:
                db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
            else:
                db_refs = [{}]
        # Skip literature references that aren't entities
        elif db_ns in {'doi', 'pubmed'}:
            continue
        else:
            print('Unhandled namespace %s' % db_ns)
            # Keep the same type (list of dicts) as every other branch
            db_refs = []

        # Take the first candidate grounding that INDRA also has. A nested
        # loop with a plain `break` would only leave the inner loop and let
        # a later candidate overwrite the first match.
        matched = next(
            ((k, v)
             for db_ref in db_refs
             for k, v in db_ref.items()
             if (k, v) in indra_groundings),
            None)
        matches.append(
            (dm_url, get_identifiers_url(*matched) if matched else None))
    return matches
def get_db_refs(egid):
    """Return the HGNC symbol and HGNC/UniProt grounding for an Entrez ID.

    Parameters
    ----------
    egid : str
        Entrez gene ID.

    Returns
    -------
    tuple
        (hgnc_name, {'HGNC': ..., 'UP': ...}) when the HGNC ID, the HGNC
        name and the UniProt ID can all be found; (None, {}) as soon as
        any one of them is missing (the missing piece is logged).
    """
    gene_id = hgnc_client.get_hgnc_from_entrez(egid)
    if not gene_id:
        logger.info("No HGNC ID for Entrez ID: %s" % egid)
        return (None, {})

    symbol = hgnc_client.get_hgnc_name(gene_id)
    if not symbol:
        logger.info("No HGNC name for HGNC ID: %s" % gene_id)
        return (None, {})

    uniprot_id = hgnc_client.get_uniprot_id(gene_id)
    if uniprot_id:
        return (symbol, {'HGNC': gene_id, 'UP': uniprot_id})

    logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                "%s / %s / %s" % (egid, gene_id, symbol))
    return (None, {})
def read_cna_pairs(self):
    """Return (gene name, alteration label) pairs for the patient's CNAs.

    The gene name is looked up from each entry's 'entrezGeneId', and the
    integer 'alteration' value (-2 to 2) is translated into one of the
    labels 'DEL', 'del', 'neu', 'amp' or 'AMP'.
    """
    labels = {-2: 'DEL', -1: 'del', 0: 'neu', 1: 'amp', 2: 'AMP'}
    return [
        (get_hgnc_name(get_hgnc_from_entrez(str(entry['entrezGeneId']))),
         labels[entry['alteration']])
        for entry in self.patient.cnas
    ]
def map_entrez_human(entrez_ids):
    """Return references based on human Entrez gene IDs.

    Each returned dict holds the 'EGID' plus whatever _refs_from_hgnc_id
    provides for the corresponding HGNC ID. Entrez IDs without an HGNC ID
    (a warning is logged) or without HGNC-derived references are left out.
    """
    collected = []
    for entrez_id in entrez_ids:
        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        if hgnc_id is None:
            logger.warning("Could not find HGNC ID for Entrez ID %s" %
                           entrez_id)
        else:
            hgnc_ref = _refs_from_hgnc_id(hgnc_id)
            if hgnc_ref is not None:
                entry = {'EGID': entrez_id}
                entry.update(hgnc_ref)
                collected.append(entry)
    return collected
def test_entrez_hgnc_none():
    """An Entrez ID that does not exist must not map to any HGNC ID."""
    assert hgnc_client.get_hgnc_from_entrez('xxx') is None
def test_entrez_hgnc():
    """Entrez ID 653509 is expected to map to HGNC ID 10798."""
    expected_hgnc_id = '10798'
    assert hgnc_client.get_hgnc_from_entrez('653509') == expected_hgnc_id
def _extract_protein(self, name, gene_id):
    """Return a standard Agent for a protein given its Entrez gene ID.

    The Agent is grounded to the Entrez ID and, when a mapping exists, to
    the corresponding HGNC ID; naming and standardization are delegated
    to get_standard_agent.
    """
    mapped_hgnc = hgnc_client.get_hgnc_from_entrez(gene_id)
    if mapped_hgnc is None:
        grounding = {'EGID': gene_id}
    else:
        grounding = {'EGID': gene_id, 'HGNC': mapped_hgnc}
    return get_standard_agent(name, db_refs=grounding)
def _make_agent(self, hprd_id, refseq_id=None): if hprd_id is None or hprd_id is nan: return None # Get the basic info (HGNC name/symbol, Entrez ID) from the # ID mappings dataframe try: egid = self.id_df.loc[hprd_id].EGID except KeyError: logger.info('HPRD ID %s not found in mappings table.' % hprd_id) return None if not egid: logger.info('No Entrez ID for HPRD ID %s' % hprd_id) return None # Get the HGNC ID hgnc_id = hgnc_client.get_hgnc_from_entrez(egid) # If we couldn't get an HGNC ID for the Entrez ID, this means that # the Entrez ID has been discontinued or replaced. if not hgnc_id: self.no_hgnc_for_egid.append(egid) return None # Get the (possibly updated) HGNC Symbol hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) assert hgnc_name is not None # See if we can get a Uniprot ID from the HGNC symbol--if there is # a RefSeq ID we wil also try to use it to get an isoform specific # UP ID, but we will have this one to fall back on. But if we can't # get one here, then we skip the Statement up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id) if not up_id_from_hgnc: self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id)) return None # If we have provided the RefSeq ID, it's because we need to make # sure that we are getting the right isoform-specific ID (for sequence # positions of PTMs). Here we try to get the Uniprot ID from the # Refseq->UP mappings in the protmapper.uniprot_client. 
if refseq_id is not None: # Get the Uniprot IDs from the uniprot client up_ids = uniprot_client.get_ids_from_refseq(refseq_id, reviewed_only=True) # Nothing for this RefSeq ID (quite likely because the RefSeq ID # is obsolete; take the UP ID from HGNC if len(up_ids) == 0: self.no_up_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # More than one reviewed entry--no thanks, we'll take the one from # HGNC instead elif len(up_ids) > 1: self.many_ups_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # We got a unique, reviewed UP entry for the RefSeq ID else: up_id = up_ids[0] # If it's the canonical isoform, strip off the '-1' if up_id.endswith('-1'): up_id = up_id.split('-')[0] # For completeness, get the Refseq ID from the HPRD ID table else: refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN up_id = up_id_from_hgnc # Make db_refs, return Agent db_refs = { 'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid, 'REFSEQ_PROT': refseq_id } return Agent(hgnc_name, db_refs=db_refs)
def get_agent(concept, entity):
    """Return an Agent built from a concept URI and an entity URI.

    The term extracted from `concept` (via term_from_uri) is interpreted
    in the namespace extracted from `entity` (via namespace_from_uri).
    Depending on the namespace, the Agent's db_refs are filled in and its
    name is either the term itself or a mapped name:

    - 'HGNC': the term is an HGNC symbol; HGNC and UniProt IDs are added
      when they can be found.
    - 'MGI' / 'RGD': the term is stored under the namespace as is.
    - 'PFH' / 'SFAM' and 'NCH' / 'SCOMP': the term is stored under the
      namespace and, if bel_to_indra has a mapping for it, the mapped
      name becomes the Agent name and is stored under 'BE'.
    - 'CHEBI': the term is looked up in chebi_name_id.
    - 'EGID': the term is an Entrez ID; HGNC and UniProt IDs are added
      when they can be found, and the name falls back to 'E<term>'.

    Any other namespace yields an Agent with the term as its name and
    empty db_refs; a warning is logged and the inputs are printed.
    """
    name = term_from_uri(concept)
    namespace = namespace_from_uri(entity)
    db_refs = {}
    if namespace == 'HGNC':
        agent_name = name
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if hgnc_id is not None:
            db_refs['HGNC'] = str(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
        else:
            logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % name)
    elif namespace in ('MGI', 'RGD'):
        agent_name = name
        db_refs[namespace] = name
    elif namespace in ('PFH', 'SFAM'):
        indra_name = bel_to_indra.get(name)
        db_refs[namespace] = name
        if indra_name is None:
            # No mapping available: keep the BEL family name as the name
            agent_name = name
            msg = 'Could not find mapping for BEL family: %s' % name
            logger.warning(msg)
        else:
            db_refs['BE'] = indra_name
            db_refs['TEXT'] = name
            agent_name = indra_name
    elif namespace in ('NCH', 'SCOMP'):
        indra_name = bel_to_indra.get(name)
        db_refs[namespace] = name
        if indra_name is None:
            # No mapping available: keep the BEL complex name as the name
            agent_name = name
            msg = 'Could not find mapping for BEL complex: %s' % name
            logger.warning(msg)
        else:
            db_refs['BE'] = indra_name
            db_refs['TEXT'] = name
            agent_name = indra_name
    elif namespace == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
        else:
            logger.warning('CHEBI name %s not found in map.' % name)
        agent_name = name
    elif namespace == 'EGID':
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs['EGID'] = name
        if hgnc_id is not None:
            db_refs['HGNC'] = str(hgnc_id)
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                # NOTE(review): `name` is still the Entrez ID here, not the
                # HGNC symbol that the message wording suggests
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
        else:
            logger.warning('Could not map EGID%s to HGNC.' % name)
            agent_name = 'E%s' % name
    else:
        logger.warning('Unhandled entity namespace: %s' % namespace)
        print('%s, %s' % (concept, entity))
        agent_name = name
    agent = Agent(agent_name, db_refs=db_refs)
    return agent
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        An entity entry with the keys 'entityText', 'entityId' (a list of
        dicts with 'source' and 'idString' keys, and 'entityType' for
        entries whose source is 'Unk'), 'charStart' and 'charEnd'.

    Returns
    -------
    agent : Agent or None
        The Agent grounded with the identifiers found in the entry, with
        any protein substitution mutations attached. None if the entry
        has more than one Entrez or more than one UniProt identifier.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entry, or None if no Agent
        was produced.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter([entry['source']
                          for entry in entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None

    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt, while
                        # refs['HGNC'] is the one set earlier from Entrez
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs.get('EGID'))
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Best effort: a malformed mutation is logged and skipped
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict or None
        The grounding for the given entity, or None if no grounding could
        be obtained (unknown name or unhandled name space).
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # The name may be a UniProt ID already, or else a mnemonic
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        # get_mesh_id_name is unpacked as an (id, name) pair elsewhere;
        # using the pair itself as the ID would make the check below
        # always pass and put a tuple into db_refs.
        # NOTE(review): confirm against mesh_client's documented return.
        mesh_id, _mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID fro %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            # Always return a 2-tuple so that callers can unpack the result
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        An entity entry with the keys 'entityText', 'entityId' (a list of
        dicts with 'source' and 'idString' keys, and 'entityType' for
        entries whose source is 'Unk'), 'charStart' and 'charEnd'.

    Returns
    -------
    agent : Agent or None
        The Agent grounded with the identifiers found in the entry, with
        any protein substitution mutations attached. None if the entry
        has more than one Entrez or more than one UniProt identifier.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entry, or None if no Agent
        was produced.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter(
        [entry['source'] for entry in entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping' %
                        (raw_text, count, source))
            return None, None

    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt, while
                        # refs['HGNC'] is the one set earlier from Entrez
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs.get('EGID'))
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Best effort: a malformed mutation is logged and skipped
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning(
                "Unhandled id type: {source}={idString}".format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
def _urn_to_db_refs(urn): """Converts a Medscan URN to an INDRA db_refs dictionary with grounding information. Parameters ---------- urn : str A Medscan URN Returns ------- db_refs : dict A dictionary with grounding information, mapping databases to database identifiers. If the Medscan URN is not recognized, returns an empty dictionary. db_name : str The Famplex name, if available; otherwise the HGNC name if available; otherwise None """ # Convert a urn to a db_refs dictionary if urn is None: return {}, None m = URN_PATT.match(urn) if m is None: return None, None urn_type, urn_id = m.groups() db_refs = {} db_name = None # TODO: support more types of URNs if urn_type == 'agi-cas': # Identifier is CAS, convert to CHEBI chebi_id = get_chebi_id_from_cas(urn_id) if chebi_id: db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id db_name = get_chebi_name_from_id(chebi_id) elif urn_type == 'agi-llid': # This is an Entrez ID, convert to HGNC hgnc_id = get_hgnc_from_entrez(urn_id) if hgnc_id is not None: db_refs['HGNC'] = hgnc_id # Convert the HGNC ID to a Uniprot ID uniprot_id = get_uniprot_id(hgnc_id) if uniprot_id is not None: db_refs['UP'] = uniprot_id # Try to lookup HGNC name; if it's available, set it to the # agent name db_name = get_hgnc_name(hgnc_id) elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue', 'agi-ncimcelltype']: if urn_id.startswith('C') and urn_id[1:].isdigit(): # Identifier is probably UMLS db_refs['UMLS'] = urn_id else: # Identifier is MESH urn_mesh_name = unquote(urn_id) mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name) if mesh_id: db_refs['MESH'] = mesh_id db_name = mesh_name else: db_name = urn_mesh_name elif urn_type == 'agi-gocomplex': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id elif urn_type == 'agi-go': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id # If we have a GO or MESH grounding, see if there is a corresponding # Famplex grounding db_sometimes_maps_to_famplex = ['GO', 'MESH'] for db in 
db_sometimes_maps_to_famplex: if db in db_refs: key = (db, db_refs[db]) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the urn corresponds to an eccode, groudn to famplex if that eccode # is in the Famplex equivalences table if urn.startswith('urn:agi-enz'): tokens = urn.split(':') eccode = tokens[2] key = ('ECCODE', eccode) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding key = ('MEDSCAN', urn) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If there is a Famplex grounding, use Famplex for entity name if 'FPLX' in db_refs: db_name = db_refs['FPLX'] elif 'GO' in db_refs: db_name = go_client.get_go_label(db_refs['GO']) return db_refs, db_name
def _make_agent(self, hprd_id, refseq_id=None): if hprd_id is None or hprd_id is nan: return None # Get the basic info (HGNC name/symbol, Entrez ID) from the # ID mappings dataframe try: egid = self.id_df.loc[hprd_id].EGID except KeyError: logger.info('HPRD ID %s not found in mappings table.' % hprd_id) return None if not egid: logger.info('No Entrez ID for HPRD ID %s' % hprd_id) return None # Get the HGNC ID hgnc_id = hgnc_client.get_hgnc_from_entrez(egid) # If we couldn't get an HGNC ID for the Entrez ID, this means that # the Entrez ID has been discontinued or replaced. if not hgnc_id: self.no_hgnc_for_egid.append(egid) return None # Get the (possibly updated) HGNC Symbol hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) assert hgnc_name is not None # See if we can get a Uniprot ID from the HGNC symbol--if there is # a RefSeq ID we wil also try to use it to get an isoform specific # UP ID, but we will have this one to fall back on. But if we can't # get one here, then we skip the Statement up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id) if not up_id_from_hgnc: self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id)) return None # If we have provided the RefSeq ID, it's because we need to make # sure that we are getting the right isoform-specific ID (for sequence # positions of PTMs). Here we try to get the Uniprot ID from the # Refseq->UP mappings in the protmapper.uniprot_client. 
if refseq_id is not None: # Get the Uniprot IDs from the uniprot client up_ids = uniprot_client.get_ids_from_refseq(refseq_id, reviewed_only=True) # Nothing for this RefSeq ID (quite likely because the RefSeq ID # is obsolete; take the UP ID from HGNC if len(up_ids) == 0: self.no_up_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # More than one reviewed entry--no thanks, we'll take the one from # HGNC instead elif len(up_ids) > 1: self.many_ups_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # We got a unique, reviewed UP entry for the RefSeq ID else: up_id = up_ids[0] # If it's the canonical isoform, strip off the '-1' if up_id.endswith('-1'): up_id = up_id.split('-')[0] # For completeness, get the Refseq ID from the HPRD ID table else: refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN up_id = up_id_from_hgnc # Make db_refs, return Agent db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid, 'REFSEQ_PROT': refseq_id} return Agent(hgnc_name, db_refs=db_refs)
def _urn_to_db_refs(urn): """Converts a Medscan URN to an INDRA db_refs dictionary with grounding information. Parameters ---------- url : str A Medscan URN Returns ------- db_refs : dict A dictionary with grounding information, mapping databases to database identifiers. If the Medscan URN is not recognized, returns an empty dictionary. db_name : str The Famplex name, if available; otherwise the HGNC name if available; otherwise None """ # Convert a urn to a db_refs dictionary if urn is None: return {}, None p = 'urn:([^:]+):([^:]+)' m = re.match(p, urn) if m is None: return None, None urn_type = m.group(1) urn_id = m.group(2) db_refs = {} db_name = None # TODO: support more types of URNs if urn_type == 'agi-cas': # Identifier is CAS, convert to CHEBI chebi_id = get_chebi_id_from_cas(urn_id) if chebi_id: db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id elif urn_type == 'agi-llid': # This is an Entrez ID, convert to HGNC hgnc_id = get_hgnc_from_entrez(urn_id) if hgnc_id is not None: db_refs['HGNC'] = hgnc_id # Convert the HGNC ID to a Uniprot ID uniprot_id = get_uniprot_id(hgnc_id) db_refs['UP'] = uniprot_id # Try to lookup HGNC name; if it's available, set it to the # agent name db_name = get_hgnc_name(hgnc_id) elif urn_type == 'agi-ncimorgan': # Identifier is MESH db_refs['MESH'] = urn_id elif urn_type == 'agi-ncimcelltype': # Identifier is MESH db_refs['MESH'] = urn_id elif urn_type == 'agi-meshdis': # Identifier is MESH db_refs['MESHDIS'] = urn_id elif urn_type == 'agi-gocomplex': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id elif urn_type == 'agi-go': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id elif urn_type == 'agi-ncimtissue': # Identifier is MESH db_refs['MESH'] = urn_id # If we have a GO or MESH grounding, see if there is a corresponding # Famplex grounding db_sometimes_maps_to_famplex = ['GO', 'MESH'] for db in db_sometimes_maps_to_famplex: if db in db_refs: key = (db, db_refs[db]) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the 
urn corresponds to an eccode, groudn to famplex if that eccode # is in the Famplex equivalences table if urn.startswith('urn:agi-enz'): tokens = urn.split(':') eccode = tokens[2] key = ('ECCODE', eccode) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding key = ('MEDSCAN', urn) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If there is a Famplex grounding, use Famplex for entity name if 'FPLX' in db_refs: db_name = db_refs['FPLX'] return db_refs, db_name
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent for a node, or None if the node can't be handled.

    Parameters
    ----------
    node_data : dict
        Node data keyed by the `pc` constants read here (pc.FUNCTION,
        pc.NAMESPACE, and optionally pc.NAME, pc.IDENTIFIER, pc.MEMBERS
        and pc.FUSION).
    node_modifier_data : dict or None
        Modifier data for the node, passed on to the helpers that extract
        the activity condition and the translocation target.

    Returns
    -------
    Agent or None
        For a complex node, the Agent for the first member with the other
        members attached as bound conditions. For other nodes, an Agent
        with the name and db_refs derived from the node's namespace and
        name or identifier. None is returned for unhandled node types,
        fusions, complexes without members or with a member that can't be
        handled, nodes for which no grounding could be obtained, and
        nodes with unhandled modifiers.

    Raises
    ------
    ValueError
        If the node has an identifier in the MGI or RGD namespace.
    AssertionError
        If the node has neither an identifier nor a name, or if no gene
        name is found for a UniProt identifier.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s"
                    % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    # (db_refs staying None below means no grounding could be obtained)
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            print("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            print("Unhandled namespace with identifier: %s: %s (%s)" %
                  (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def _get_agent(node_data, node_modifier_data=None):
    """Build an INDRA Agent from a node's data, or return None if unhandled.

    Parameters
    ----------
    node_data : dict
        Node data, read through the keys defined in ``pc`` (function, name,
        namespace, identifier, members, fusion).
    node_modifier_data : dict or None
        Optional modifier data for the node. It is passed on to the helpers
        that extract the activity condition, the translocation target and
        that detect unhandled modifiers.

    Returns
    -------
    Agent or None
        The Agent for the node. For complexes, the first member is the
        returned Agent and the remaining members are attached to it as
        bound conditions. None is returned if the node type, namespace or
        modifiers are not handled, or if grounding could not be obtained.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if no gene name
        is obtained for a UniProt identifier.
    ValueError
        If an MGI or RGD node comes with an identifier.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = ('No node data' if not node_modifier_data
                    else node_modifier_data.get(pc.CNAME))
        logger.info("Nodes of type %s not handled: %s", node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s",
                    str(node_data))
        return None

    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # If there are no members (key missing or empty list), we assume
        # this is a named complex, which is not handled here. Checking for
        # emptiness also avoids an IndexError on members[0] below.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = _get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = []
        for member in members[1:]:
            member_agent = _get_agent(member)
            # A single member that can't be resolved invalidates the complex
            if member_agent is None:
                return None
            bound_conditions.append(BoundCondition(member_agent, True))
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        main_agent.activity = _get_activity_condition(node_modifier_data)
        return main_agent

    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    db_refs = None
    # No ID present, get identifier using the name, namespace
    if not ident:
        # NOTE: asserts are stripped when Python runs with -O
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)", name, node_data)
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)', name, node_data)
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.', name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.', name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.', name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger (not stdout) like every other
            # diagnostic in this function
            logger.info("Unhandled namespace: %s: %s (%s)",
                        ns, name, node_data)
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s', name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            logger.info("Unhandled namespace with identifier: %s: %s (%s)",
                        ns, name, node_data)
    # db_refs is still None if no branch above produced any grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def get_db_refs_by_name(ns, name, node_data):
    """Standardize a name within a name space and collect its grounding.

    Parameters
    ----------
    ns : str
        The name space in which the given name is to be interpreted.
    name : str
        The name, within the given name space, to look up grounding for.
    node_data : dict
        Node data, only used in log messages.

    Returns
    -------
    name : str
        The standardized name of the entity (the input name if no
        standardization was applied).
    db_refs : dict or None
        The grounding of the entity, or None if none could be obtained.
    """
    if ns == 'HGNC':
        # The incoming name is taken to be an HGNC symbol
        current_id = hgnc_client.get_current_hgnc_id(name)
        if not current_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(current_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, current_id[0]))
            current_id = current_id[0]
        name = hgnc_client.get_hgnc_name(current_id)
        refs = {'HGNC': current_id}
        uniprot_id = _get_up_id(current_id)
        if uniprot_id:
            refs['UP'] = uniprot_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(current_id)
        if mirbase_id:
            refs['MIRBASE'] = mirbase_id
        return name, refs

    if ns in ('UNIPROT', 'UP'):
        # If a mnemonic can be found for the name, the name is treated as a
        # UniProt ID; otherwise the name is tried as a mnemonic itself
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            uniprot_id = name
        else:
            uniprot_id = uniprot_client.get_id_from_mnemonic(name) or None
        if not uniprot_id:
            logger.info("Couldn't get UP ID from %s" % name)
            return name, None
        refs = {'UP': uniprot_id}
        hgnc_id = uniprot_client.get_hgnc_id(uniprot_id)
        if hgnc_id:
            refs['HGNC'] = hgnc_id
            return hgnc_client.get_hgnc_name(hgnc_id), refs
        return uniprot_client.get_gene_name(uniprot_id), refs

    if ns == 'FPLX':
        return name, {'FPLX': name}

    if ns in ('GO', 'GOBP', 'GOCC'):
        # This label is swapped for a different one before the lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        return go_client.get_go_label(go_id), {'GO': go_id}

    if ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        return mesh_name, {'MESH': mesh_id}

    # MGI/RGD names are mapped to UniProt IDs through lookup tables; the
    # name is left as is, and there is no grounding if the lookup fails
    if ns in ('MGI', 'RGD'):
        lookup = mouse_lookup if ns == 'MGI' else rat_lookup
        uniprot_id = lookup.get(name)
        return name, ({'UP': uniprot_id} if uniprot_id else None)

    # Selventa families are mapped to FamPlex
    if ns == 'SFAM':
        refs = {'SFAM': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
            return name, refs
        refs['FPLX'] = fplx_name
        return fplx_name, refs

    # Selventa complexes are mapped to FamPlex
    if ns == 'SCOMP':
        refs = {'SCOMP': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
            return name, refs
        refs['FPLX'] = fplx_name
        return fplx_name, refs

    # Entrez genes are mapped to HGNC, UniProt and miRBase
    if ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        entrez_id = name
        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        refs = {'EGID': entrez_id}
        if hgnc_id is None:
            logger.debug('Could not map EGID%s to HGNC.' % entrez_id)
            return 'E%s' % entrez_id, refs
        refs['HGNC'] = hgnc_id
        name = hgnc_client.get_hgnc_name(hgnc_id)
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id:
            refs['UP'] = uniprot_id
        else:
            logger.info(
                'HGNC entity %s with HGNC ID %s has no '
                'corresponding Uniprot ID.', name, hgnc_id)
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            refs['MIRBASE'] = mirbase_id
        return name, refs

    if ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        return name, refs

    if ns == 'CHEBI':
        # BEL's own ChEBI name-to-ID map is tried first, INDRA's ChEBI
        # name-to-ID mapping second
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if not chebi_id:
            logger.info('CHEBI name %s not found in map.' % name)
            return name, None
        return name, {'CHEBI': chebi_id}

    # In this name space the name slot actually holds an ID
    if ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        return (chebi_client.get_chebi_name_from_id(chebi_id),
                {'CHEBI': chebi_id})

    # For these name spaces the name itself serves as the ID
    if ns in ('SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    if ns == 'TAX':
        taxonomy_id = taxonomy_client.get_taxonomy_id(name)
        if not taxonomy_id:
            logger.info('Could not get taxonomy ID for %s' % name)
            return name, None
        return name, {'TAXONOMY': taxonomy_id}

    logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, None
"""Reads a list of kinases annotated with which ones are considered "dark" kinases, and retrieves a list of statements associated with them from the INDRA database.""" import pickle import pandas as pd from indra.db import client as dbc from indra.databases import hgnc_client dk_col = 'in_IDG_darkkinases' kinase_file = 'Table_001_all_kinases.csv' if __name__ == '__main__': kinases = pd.read_csv(kinase_file, delimiter=',', header=0) dark_kinases = kinases[kinases[dk_col]] results = {} for egid in dark_kinases.gene_id: hgnc_id = hgnc_client.get_hgnc_from_entrez(str(egid)) if hgnc_id is None: print("No HGNC id for Entrez Gene id %s" % egid) continue gene_sym = hgnc_client.get_hgnc_name(hgnc_id) if gene_sym is None: print("No symbol for gene id %s" % hgnc_id) stmts = dbc.get_statements_by_gene_role_type(agent_id=hgnc_id, agent_ns='HGNC') results[gene_sym] = stmts with open('dark_kinase_stmts.pkl', 'wb') as f: pickle.dump(results, f)