def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
    """Reconcile the UP/HGNC entries of a grounding and assign it to an agent.

    Parameters
    ----------
    agent : object
        The agent whose ``db_refs`` (and optionally ``name``) are set.
    map_db_refs : dict
        Grounding map entry. Its 'HGNC' value is read as a gene symbol and
        is overwritten in place with the HGNC ID when one is found.
    do_rename : bool
        If True, the agent is renamed to its FPLX ID, the given HGNC
        symbol, or the gene name obtained from UniProt, in that order.

    Raises
    ------
    ValueError
        If only an HGNC symbol is given and it has no HGNC ID, if no gene
        name is found for a UniProt ID given together with a symbol, or
        if that gene name differs from the given symbol.
    """
    gene_name = None
    up_id = map_db_refs.get('UP')
    hgnc_sym = map_db_refs.get('HGNC')
    has_up = bool(up_id)
    has_sym = bool(hgnc_sym)

    if has_up and not has_sym:
        # Only UniProt is given: derive the HGNC ID through the gene name
        gene_name = uniprot_client.get_gene_name(up_id, False)
        if gene_name:
            found_hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if found_hgnc_id:
                map_db_refs['HGNC'] = found_hgnc_id
    elif has_sym and not has_up:
        # Only a symbol is given: it must resolve to an HGNC ID
        found_hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
        if not found_hgnc_id:
            raise ValueError('No HGNC ID corresponding to gene '
                             'symbol %s in grounding map.' % hgnc_sym)
        # Replace the symbol with the ID, then add the UniProt ID if known
        map_db_refs['HGNC'] = found_hgnc_id
        mapped_up_id = hgnc_client.get_uniprot_id(found_hgnc_id)
        if mapped_up_id:
            map_db_refs['UP'] = mapped_up_id
    elif has_up and has_sym:
        # Both are given: the UniProt gene name must agree with the symbol
        gene_name = uniprot_client.get_gene_name(up_id)
        if not gene_name:
            raise ValueError('No gene name found for Uniprot '
                             'ID %s (expected %s)' % (up_id, hgnc_sym))
        if gene_name != hgnc_sym:
            raise ValueError('Gene name %s for Uniprot ID '
                             '%s does not match HGNC '
                             'symbol %s given in grounding '
                             'map.' % (gene_name, up_id, hgnc_sym))
        found_hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
        if found_hgnc_id:
            map_db_refs['HGNC'] = found_hgnc_id
        else:
            logger.error('No HGNC ID corresponding to gene '
                         'symbol %s in grounding map.' % hgnc_sym)

    # The (possibly updated) grounding map entry becomes the agent's db_refs
    agent.db_refs = map_db_refs
    if not do_rename:
        return

    fplx_id = agent.db_refs.get('FPLX')
    if fplx_id:
        agent.name = fplx_id
    elif hgnc_sym is not None:
        agent.name = hgnc_sym
    elif gene_name is not None:
        agent.name = gene_name
    return
def test_specific_query(self):
    """Check a query in which subject, object and type are all given."""
    query = dict(object='MAP2K1', subject='MAPK1', type='Phosphorylation')
    resp = self.__check_good_statement_query(**query)
    # Role index 0 is checked against MAPK1, index 1 against MAP2K1
    expected_agents = [
        (role, 'HGNC', hgnc_client.get_hgnc_id(symbol))
        for role, symbol in enumerate(('MAPK1', 'MAP2K1'))
    ]
    _check_stmt_agents(resp, agents=expected_agents)
def read_phosphosite(fname):
    """Build Phosphorylation statements and an antibody map from a CSV file.

    Parameters
    ----------
    fname : str
        Path of a CSV file with the columns SUB_ID, SUB_GENE, Actual_site,
        phosphosite, KIN_ID and KINASE_GENE_SYMBOL.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row whose kinase was not skipped
        as non-human.
    antibody_map : dict
        Maps each 'phosphosite' value to the list of distinct modified
        substrate agents seen for it.
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Substrate: use the UniProt ID if present, else the gene symbol.
        # Both branches bind the same variable name for the HGNC ID; the
        # earlier code bound different names, leaving one unbound or stale.
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc_id)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc_id})

        # First character of the site is the residue, the rest the position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]

        # Register the readout under its antibody unless an equivalent
        # (same name, residue and position) entry is already there
        ps = row['phosphosite']
        readouts = antibody_map.setdefault(ps, [])
        already_known = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in readouts
        )
        if not already_known:
            readouts.append(sub_readout)

        # Kinase: same resolution as for the substrate, but a kinase with
        # a non-human UniProt ID causes the row to be skipped
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc_id)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc_id})

        ev = Evidence(source_api='phosphosite')
        statements.append(
            Phosphorylation(kin, sub, residue, position, evidence=[ev]))
    return statements, antibody_map
def test_query_with_two_agents(self):
    """Test a query where the roles of the agents are not given."""
    resp = self.__check_good_statement_query(
        agent0='MAP2K1', agent1='MAPK1', type='Phosphorylation')
    # No role index is given, hence None in the first slot of each tuple
    expected_agents = [
        (None, 'HGNC', hgnc_client.get_hgnc_id(symbol))
        for symbol in ('MAPK1', 'MAP2K1')
    ]
    _check_stmt_agents(resp, agents=expected_agents)
def get_agent(self, acsn_agent: str) -> Union[Agent, None]:
    """Return an INDRA Agent corresponding to an ACSN agent.

    Parameters
    ----------
    acsn_agent :
        Agent extracted from the relations statement data frame

    Returns
    -------
    :
        An agent grounded to HGNC when the ACSN agent maps to exactly one
        entry with a known HGNC ID, or to FamPlex when its sorted entries
        are found in the FamPlex lookup. None when no grounding is found.
    """
    members = self.correspondence_dict.get(acsn_agent)
    if not members:
        return None

    if len(members) == 1:
        # Single entry: ground it as a gene if HGNC knows the symbol
        symbol = members[0]
        hgnc_id = get_hgnc_id(symbol)
        if not hgnc_id:
            return None
        return get_standard_agent(symbol, db_refs={'HGNC': hgnc_id})

    # Several entries: look the sorted tuple up as a FamPlex entity
    lookup_key = tuple(sorted(self.correspondence_dict[acsn_agent]))
    fplx_id = self.fplx_lookup.get(lookup_key)
    if not fplx_id:
        return None
    return get_standard_agent(fplx_id, db_refs={'FPLX': fplx_id})
def _get_complex_agents(self, complex_id):
    """Return a list of agents, one per constituent of a SIGNOR complex.

    Each component returned by ``_recursively_lookup_complex`` is grounded
    to UP (plus HGNC when available) if UniProt yields a gene name for it,
    otherwise to SIGNOR. A FamPlex grounding is added when the component
    is in ``famplex_map``. The agent name is the gene name, else the
    FamPlex ID, else the component ID itself.
    """
    agents = []
    components = self._recursively_lookup_complex(complex_id)
    for c in components:
        db_refs = {}
        name = uniprot_client.get_gene_name(c)
        if name is None:
            db_refs['SIGNOR'] = c
        else:
            db_refs['UP'] = c
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id

        famplex_key = ('SIGNOR', c)
        if famplex_key in famplex_map:
            db_refs['FPLX'] = famplex_map[famplex_key]
            if not name:
                # Use the FamPlex name when no gene name is available
                name = db_refs['FPLX']
        elif not name:
            # We neither have a Uniprot nor Famplex grounding
            logger.info('Have neither a Uniprot nor Famplex grounding ' +
                        'for ' + c)
            # Fall back to the component ID. Reading db_refs['SIGNOR']
            # here raised KeyError when the gene name was an empty string,
            # because that entry is only set when the name is None.
            name = c
        assert name is not None
        agents.append(Agent(name, db_refs=db_refs))
    return agents
def test_get_agent():
    """Check the agents built by SignorProcessor._get_agent per entity type."""
    # Create an empty Signor processor
    sp = SignorProcessor([])

    # Protein/gene
    rela_refs = {'HGNC': hgnc_client.get_hgnc_id('RELA'), 'UP': 'Q04206'}
    expected = Agent('RELA', db_refs=rela_refs)
    actual = sp._get_agent(test_row.ENTITYA, test_row.TYPEA, test_row.IDA,
                           test_row.DATABASEA)
    assert expected.matches(actual)

    # Chemical, then Signor phenotype: (expected name, expected db_refs,
    # arguments passed to _get_agent)
    grounded_cases = [
        ('AZD1480', {'PUBCHEM': 'CID:16659841'},
         ('AZD1480', 'chemical', 'CID:16659841', 'PUBCHEM')),
        ('Cell cycle progr.', {'SIGNOR': 'SIGNOR-PH42'},
         ('Cell cycle progr.', 'phenotype', 'SIGNOR-PH42', 'SIGNOR')),
    ]
    for exp_name, exp_refs, call_args in grounded_cases:
        expected = Agent(exp_name, db_refs=exp_refs)
        actual = sp._get_agent(*call_args)
        assert expected.matches(actual)

    # Ungrounded -- couldn't find a real example in the dataset
    expected = Agent('Foobar', db_refs={})
    for ent_type in ('pathway', 'antibody'):
        actual = sp._get_agent('Foobar', ent_type, None, None)
        assert expected.matches(actual)
def _add_node(self, agent): node_key = agent.name node_id = self._existing_nodes.get(node_key) if node_id is not None: return node_id db_refs = _get_db_refs(agent) node_id = self._get_new_id() self._existing_nodes[node_key] = node_id node_name = agent.name node_name = node_name.replace('_', ' ') expanded_families = expander.get_children(agent, ns_filter='HGNC') members = {} for member in expanded_families: hgnc_symbol = member[1] hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol) if hgnc_id: up_id = hgnc_client.get_uniprot_id(hgnc_id) member_agent = Agent(hgnc_symbol, db_refs={'HGNC': hgnc_id, 'UP': up_id}) member_db_refs = _get_db_refs(member_agent) else: member_db_refs = {} members[member[1]] = { 'mutation': None, 'expression': None, 'db_refs': member_db_refs } node = {'data': {'id': node_id, 'name': node_name, 'db_refs': db_refs, 'parent': '', 'members': members}} self._nodes.append(node) return node_id
def fix_stmts(stmts):
    """Return the statements worth keeping, with cleaned-up PMIDs and agents.

    For each statement, a leading 'PMID' is removed from evidence PMIDs.
    RegulateActivity statements without a subject and Translocation
    statements without any location are dropped. Agents with a UP
    grounding are renamed to the UniProt gene name, and human ones get
    the corresponding HGNC ID when one exists.

    Parameters
    ----------
    stmts : list
        Statements to process; kept statements are modified in place.

    Returns
    -------
    list
        The statements that were not dropped, in their original order.
    """
    new_stmts = []
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.pmid and ev.pmid.startswith('PMID'):
                # Drop the 4-character prefix. The previous slice [:-4]
                # cut the last four characters and kept the prefix.
                ev.pmid = ev.pmid[4:]
        # Skip if no subject
        if isinstance(stmt, RegulateActivity) and stmt.subj is None:
            continue
        # Skip if no locations
        if isinstance(stmt, Translocation) and \
                not (stmt.from_location or stmt.to_location):
            continue
        for agent in stmt.agent_list():
            if agent is None:
                continue
            upid = agent.db_refs.get('UP')
            if not upid:
                continue
            gene_name = uniprot_client.get_gene_name(upid)
            if not gene_name:
                continue
            agent.name = gene_name
            if uniprot_client.is_human(upid):
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
        new_stmts.append(stmt)
    return new_stmts
def get_grounding_from_name(name):
    """Return a (namespace, ID) grounding for an agent name, or None.

    Sources are tried in order: HGNC symbol lookup, the grounding map
    (first non-TEXT entry), ChEBI name lookup, then MeSH name lookup.
    """
    # See if it's a gene name
    hgnc_id = get_hgnc_id(name)
    if hgnc_id:
        return ('HGNC', hgnc_id)

    # Check if it's in the grounding map
    try:
        refs = gm[name]
        if isinstance(refs, dict):
            grounding = next(
                ((db_ns, db_id) for db_ns, db_id in refs.items()
                 if db_ns != 'TEXT'),
                None)
            if grounding is not None:
                return grounding
    except KeyError:
        # Not in the grounding map; fall through to the name lookups
        pass

    chebi_id = get_chebi_id_from_name(name)
    if chebi_id:
        return ('CHEBI', f'CHEBI:{chebi_id}')

    mesh_id, _ = get_mesh_id_name(name)
    return ('MESH', mesh_id) if mesh_id else None
def get_mappings() -> Iterable[PredictionTuple]:
    """Iterate high-confidence lexical mappings between MeSH and UniProt human proteins.

    A mapping is yielded for every MeSH name that matches
    ``MESH_PROTEIN_RE`` and whose first captured group is an HGNC symbol
    with exactly one UniProt ID (values containing a comma are skipped).
    """
    url = get_script_url(__file__)
    mapping_type = "lexical"
    match_type = "skos:exactMatch"
    confidence = 0.999
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        match = MESH_PROTEIN_RE.match(mesh_name)
        if match:
            gene_name = match.groups()[0]
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
                if uniprot_id and "," not in uniprot_id:
                    yield PredictionTuple(
                        "mesh",
                        mesh_id,
                        mesh_name,
                        match_type,
                        "uniprot",
                        uniprot_id,
                        gene_name,
                        mapping_type,
                        confidence,
                        url,
                    )
def get_relevant_nodes(self, pct_heat_threshold):
    """Return a list of the relevant nodes in the prior.

    Initial heat is placed on the prior graph nodes keyed 'HGNC:<id>' for
    genes with a non-empty entry in ``norm_mutations``, the heat is
    diffused with the graph's normalized Laplacian, and the nodes whose
    diffused heat reaches the given percentile are returned, hottest
    first.
    """
    logger.info('Setting heat for relevant nodes in prior network')
    heats = np.zeros(len(self.prior_graph))
    # Initial heat per node key, only for genes that have mutation values
    initial_heat = {
        'HGNC:%s' % get_hgnc_id(gene_name): muts
        for gene_name, muts in self.norm_mutations.items()
        if muts
    }
    for position, node in enumerate(self.prior_graph.nodes()):
        if node in initial_heat:
            heats[position] = initial_heat[node]

    gamma = -0.1
    logger.info('Calculating Laplacian matrix')
    lp_mx = nx.normalized_laplacian_matrix(self.prior_graph,
                                           weight='weight')
    logger.info('Diffusing heat')
    diffused = expm_multiply(gamma * lp_mx, heats)
    heat_thresh = np.percentile(diffused, pct_heat_threshold)
    logger.info('Filtering to relevant nodes with heat threshold %.2f '
                '(%s percentile)' % (heat_thresh, pct_heat_threshold))

    # Pair the nodes with their heats and sort by decreasing heat
    ranked = sorted(zip(self.prior_graph.nodes(), diffused),
                    key=lambda pair: pair[1], reverse=True)
    return [node for node, heat in ranked if heat >= heat_thresh]
def get_grounding_from_name(name):
    """Return a (namespace, ID) grounding for a name.

    Tries the HGNC symbol lookup, then the grounding map (first non-TEXT
    entry), then a TRIPS lookup of the name. If none of these yields an
    HGNC or FPLX grounding, ('TEXT', name) is returned.
    """
    text_grounding = ('TEXT', name)

    # See if it's a gene name
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        return ('HGNC', hgnc_id)

    # Check if it's in the grounding map
    try:
        refs = gm[name]
        if isinstance(refs, dict):
            for db_ns, db_id in refs.items():
                if db_ns != 'TEXT':
                    return (db_ns, db_id)
    except KeyError:
        pass

    # If none of these, we try TRIPS
    try:
        print('Looking up %s with TRIPS' % name)
        tp = trips.process_text(name, service_endpoint='drum-dev')
        terms = tp.tree.findall('TERM')
        if not terms:
            return text_grounding
        first_term_id = terms[0].attrib['id']
        agent = tp._get_agent_by_id(first_term_id, None)
        for db_ns in ('HGNC', 'FPLX'):
            if db_ns in agent.db_refs:
                return (db_ns, agent.db_refs[db_ns])
    except Exception as e:
        # Any failure of the TRIPS lookup is printed and treated as
        # "no grounding found"
        print(e)
        return text_grounding
    return text_grounding
def get_all_gene_names(data):
    """Return the sorted valid gene names listed for the antibodies.

    Gene names are taken from the 'Gene Name' column and from the gene
    names that UniProt gives for the 'UniProt ID' column of
    ``data['antibody']``. Names without an HGNC ID are reported and
    excluded, and inconsistencies between the two columns are printed.

    Parameters
    ----------
    data : dict
        Must provide ``data['antibody']['Gene Name']`` and
        ``data['antibody']['UniProt ID']`` as parallel iterables of
        comma-separated strings.

    Returns
    -------
    list of str
        Sorted gene names.
    """
    gene_names = data['antibody']['Gene Name']
    uniprot_ids = data['antibody']['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A failed UniProt lookup gives None, which str.join rejects
            print('- Genes from uniprot IDs: %s' %
                  ','.join(str(n) for n in names_from_ids))
        # Add both the gene names and the gene names derived from UniProt
        # IDs; None (failed lookup) is not a gene name and would also make
        # the final sort fail
        all_genes.update(names)
        all_genes.update(n for n in names_from_ids if n is not None)
    # Finally remove the invalid gene names
    return sorted(all_genes - invalid_genes)
def test_get_agent():
    """Check SignorProcessor._get_agent for each kind of entity."""
    # Create an empty Signor processor
    sp = SignorProcessor([])

    def agent_for(*call_args):
        # Thin wrapper so every case reads as "expected matches actual"
        return sp._get_agent(*call_args)

    # Protein/gene
    rela = Agent('RELA', db_refs={'HGNC': hgnc_client.get_hgnc_id('RELA'),
                                  'UP': 'Q04206'})
    assert rela.matches(agent_for(test_row.ENTITYA, test_row.TYPEA,
                                  test_row.IDA, test_row.DATABASEA))

    # Chemical: the expected PUBCHEM ID has no 'CID: ' prefix
    chemical = Agent('AZD1480', db_refs={'PUBCHEM': '16659841'})
    assert chemical.matches(
        agent_for('AZD1480', 'chemical', 'CID: 16659841', 'PUBCHEM'))

    # Signor phenotype
    phenotype = Agent('Cell cycle progr.',
                      db_refs={'SIGNOR': 'SIGNOR-PH42'})
    assert phenotype.matches(
        agent_for('Cell cycle progr.', 'phenotype', 'SIGNOR-PH42',
                  'SIGNOR'))

    # Ungrounded -- couldn't find a real example in the dataset
    ungrounded = Agent('Foobar', db_refs={})
    assert ungrounded.matches(agent_for('Foobar', 'pathway', None, None))
    assert ungrounded.matches(agent_for('Foobar', 'antibody', None, None))
def agent_from_gene_name(gene_name):
    """Return an Agent based on a gene name.

    The agent is named after ``gene_name``. Its db_refs contain 'HGNC'
    only if the name has an HGNC ID, and 'UP' only if that HGNC ID has a
    UniProt ID, so no None-valued groundings are stored and the UniProt
    lookup is never made with a missing HGNC ID.
    """
    db_refs = {}
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(gene_name, db_refs=db_refs)
def rename_agents(self, stmts):
    """Return a deep copy of the statements with updated agent names.

    An agent with a BE (Bioentities) grounding is named after that ID.
    Otherwise, if it has a UP grounding and UniProt gives a gene name for
    it (without web fallback), the agent is named after the gene and the
    matching HGNC ID, if any, is added to its db_refs. All other agents
    keep their name.

    Parameters
    ----------
    stmts : list
        Statements whose agents should be renamed; not modified.

    Returns
    -------
    list
        The copied statements with renamed agents.
    """
    # Work on a copy so the caller's statements are left untouched
    mapped_stmts = deepcopy(stmts)
    for stmt in mapped_stmts:
        for agent in stmt.agent_list():
            if agent is None:
                continue
            # If there's a Bioentities ID, prefer that for the name
            be_id = agent.db_refs.get('BE')
            if be_id:
                agent.name = be_id
                continue
            # Take a HGNC name from Uniprot next
            up_id = agent.db_refs.get('UP')
            if not up_id:
                continue
            gene_name = uniprot_client.get_gene_name(up_id,
                                                     web_fallback=False)
            if gene_name:
                agent.name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
    return mapped_stmts
def agent_from_gene_name(name):
    """Return a grounded Agent based on a gene name.

    db_refs gets an 'HGNC' entry only when the name has an HGNC ID and a
    'UP' entry only when that ID has a UniProt ID; missing groundings are
    left out rather than stored as None, and the UniProt lookup is not
    attempted without an HGNC ID.
    """
    agent = Agent(name)
    db_refs = {}
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id:
            db_refs['UP'] = uniprot_id
    agent.db_refs = db_refs
    return agent
def _get_complex_agents(self, complex_id):
    """Return a list of agents for the constituents of a SIGNOR complex.

    Components come from ``_recursively_lookup_complex``. A component for
    which UniProt gives a gene name is grounded to UP (and HGNC if an ID
    exists); one with no gene name is grounded to SIGNOR. Components
    present in ``famplex_map`` also get an FPLX grounding. Naming
    preference: gene name, then FamPlex ID, then the component ID.
    """
    agents = []
    for component in self._recursively_lookup_complex(complex_id):
        db_refs = {}
        name = uniprot_client.get_gene_name(component)
        if name is None:
            db_refs['SIGNOR'] = component
        else:
            db_refs['UP'] = component
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id

        famplex_key = ('SIGNOR', component)
        if famplex_key in famplex_map:
            db_refs['FPLX'] = famplex_map[famplex_key]
            # Set agent name to Famplex name if the Uniprot name is not
            # available
            if not name:
                name = db_refs['FPLX']
        elif not name:
            # We neither have a Uniprot nor Famplex grounding
            logger.info('Have neither a Uniprot nor Famplex grounding ' +
                        'for ' + component)
            # Name the agent after the component ID directly. The
            # 'SIGNOR' db_refs entry only exists when the gene name is
            # None, so indexing it failed for an empty-string name.
            name = component
        assert (name is not None)
        agents.append(Agent(name, db_refs=db_refs))
    return agents
def _get_db_refs(bpe):
    """Return the db_refs dict for a BioPAX entity.

    Proteins get HGNC and UP entries, each filled in from the other when
    only one is available (HGNC from UP only for human UniProt IDs).
    Small molecules get a CHEBI entry. Any other entity gets whichever of
    CHEBI, HGNC and UP can be found. Entries that are None are left out.
    """
    db_refs = {}
    if _is_protein(bpe):
        hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
        uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
        # Handle missing HGNC/UP ids: at most one of these branches runs
        if hgnc_id and not uniprot_id:
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        elif uniprot_id and not hgnc_id and \
                uniprot_client.is_human(uniprot_id):
            hgnc_name = uniprot_client.get_gene_name(uniprot_id, False)
            if hgnc_name:
                hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
        found = (('HGNC', hgnc_id), ('UP', uniprot_id))
    elif _is_small_molecule(bpe):
        found = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),)
    else:
        found = (
            ('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
            ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
            ('UP', BiopaxProcessor._get_uniprot_id(bpe)),
        )
    for db_ns, db_id in found:
        if db_id is not None:
            db_refs[db_ns] = db_id
    return db_refs
def update_kinases():
    """Download the kinase table from UniProt and add extra kinases to it.

    The table is saved as 'kinases.tsv' under ``path``, then one row per
    symbol in a fixed list of additional kinases is appended (UniProt ID,
    gene symbol, organism ID '9606' and UniProt mnemonic) and the file is
    rewritten.
    """
    logger.info('--Updating kinase list------')
    url = ('http://www.uniprot.org/uniprot/?'
           'sort=entry_name&desc=no&compress=no&query=database:(type:'
           'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:'
           '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no'
           '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name')
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = [
        'PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2', 'PDK3',
        'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE'
    ]
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({
            'Entry': up_id,
            'Gene names (primary )': kinase,
            'Organism ID': '9606',
            'Entry name': up_mnemonic,
        })
    # DataFrame.append was removed in pandas 2.0; concatenate all extra
    # rows in one step instead of appending inside the loop
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def _extract_protein(self, line): # Extract key information from the lines. prot_name = line['Protein Name'] prot_id = line['Protein HMS LINCS ID'] # Get available db-refs. db_refs = {} if prot_id: db_refs.update(self._lc.get_protein_refs(prot_id)) # Since the resource only gives us an UP ID (not HGNC), we # try to get that and standardize the name to the gene name up_id = db_refs.get('UP') if up_id: gene_name = uniprot_client.get_gene_name(up_id) if gene_name: prot_name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id # In some cases lines are missing protein information in which # case we return None else: return None # Create the agent. return Agent(prot_name, db_refs=db_refs)
def test_query_with_other(self):
    """Check that an ActiveForm statement can be queried by agent."""
    resp = self.__check_good_statement_query(agent='MAPK1',
                                             type='ActiveForm')
    mapk1_id = hgnc_client.get_hgnc_id('MAPK1')
    _check_stmt_agents(resp, agents=[(0, 'HGNC', mapk1_id)])
def test_object_only_query(self):
    """Check that a statement can be queried with only its object given."""
    resp = self.__check_good_statement_query(object='GLUL',
                                             type='IncreaseAmount')
    # The object is expected at agent index 1
    glul_id = hgnc_client.get_hgnc_id('GLUL')
    _check_stmt_agents(resp, agents=[(1, 'HGNC', glul_id)])
def test_active_form():
    """Check that each ActiveForm statement assembles into a 2-node graph."""
    ras = Agent('KRAS', mutations=[MutCondition('12', 'G', 'V')],
                db_refs={'HGNC':'6407'})
    # Singly and doubly phosphorylated forms of the same agent
    phos_once = [ModCondition('phosphorylation', 'T', '185')]
    map2k1_p = Agent('MAP2K1', mods=phos_once,
                     db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})
    phos_twice = [ModCondition('phosphorylation', 'T', '185'),
                  ModCondition('phosphorylation', 'Y', '187')]
    map2k1_pp = Agent('MAP2K1', mods=phos_twice,
                      db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})

    stmts = [
        ActiveForm(ras, 'gtpbound', True),
        ActiveForm(map2k1_p, 'kinase', True),
        ActiveForm(map2k1_pp, 'kinase', True),
    ]
    for stmt in stmts:
        belgraph = pa.PybelAssembler([stmt]).make_model()
        assert len(belgraph) == 2
def test_bad_camel(self):
    """Check that a statement type with irregular capitalization resolves."""
    query = {'agent': 'MAPK1', 'type': 'acTivefOrm'}
    resp = self.__check_good_statement_query(**query)
    expected_agents = [(0, 'HGNC', hgnc_client.get_hgnc_id('MAPK1'))]
    _check_stmt_agents(resp, agents=expected_agents)
def test_query_with_hgnc_symbol_ns(self):
    """Check a query whose subject is given in the HGNC-SYMBOL namespace."""
    resp = self.__check_good_statement_query(
        subject='MAPK1@HGNC-SYMBOL', type='Phosphorylation')
    # The subject is checked against the HGNC ID of the symbol
    subject_id = hgnc_client.get_hgnc_id('MAPK1')
    _check_stmt_agents(resp, agents=[(0, 'HGNC', subject_id)])
def test_active_form():
    """Assemble three ActiveForm statements one by one; expect 2 nodes each."""
    def assembled_size(stmt):
        # Number of nodes in the BEL graph built from a single statement
        return len(pa.PybelAssembler([stmt]).make_model())

    ras = Agent('KRAS', mutations=[MutCondition('12', 'G', 'V')],
                db_refs={'HGNC': '6407'})
    map2k1_p = Agent(
        'MAP2K1',
        mods=[ModCondition('phosphorylation', 'T', '185')],
        db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')},
    )
    map2k1_pp = Agent(
        'MAP2K1',
        mods=[ModCondition('phosphorylation', 'T', '185'),
              ModCondition('phosphorylation', 'Y', '187')],
        db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')},
    )
    ras_active = ActiveForm(ras, 'gtpbound', True)
    single_phos_active = ActiveForm(map2k1_p, 'kinase', True)
    double_phos_active = ActiveForm(map2k1_pp, 'kinase', True)
    for stmt in (ras_active, single_phos_active, double_phos_active):
        assert assembled_size(stmt) == 2
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : dict
        Node data; supplies the default name and is used for logging.

    Returns
    -------
    name : str
        The standardized name for the given entity. For HGNC and UP this
        is the gene name; otherwise the name found in the node data.
    db_refs : dict
        The grounding for the given entity. None for an unhandled
        namespace. Both values are None when an HGNC or UP identifier has
        no name.
    """
    name = node_data.get(pc.NAME)

    # Namespaces whose identifier is used as-is under the same key
    if ns in ('MIRBASE', 'MGI', 'RGD', 'CHEBI', 'HMDB', 'MESH'):
        return name, {ns: ident}
    if ns == 'PUBCHEM.COMPOUND':
        return name, {'PUBCHEM': ident}

    if ns == 'HGNC':
        name = hgnc_client.get_hgnc_name(ident)
        if not name:
            return None, None
        db_refs = {'HGNC': ident}
        up_id = _get_up_id(ident)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(ident)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
        return name, db_refs

    if ns == 'UP':
        db_refs = {'UP': ident}
        name = uniprot_client.get_gene_name(ident)
        if not name:
            return None, None
        # Only human proteins are mapped to HGNC
        if uniprot_client.is_human(ident):
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
            else:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
        return name, db_refs

    logger.info("Unhandled namespace %s with name %s and "
                "identifier %s (%s)." % (ns, name,
                                         node_data.identifier, node_data))
    return name, None
def get_target_agent(target):
    """Return an Agent for a target gene symbol with HGNC/UP groundings.

    'HGNC' is included in db_refs only if the symbol has an HGNC ID, and
    'UP' only if that ID has a UniProt ID, so unknown symbols no longer
    produce None-valued groundings or a UniProt lookup on a missing ID.
    """
    db_refs = {}
    target_hgnc_id = hgnc_client.get_hgnc_id(target)
    if target_hgnc_id:
        db_refs['HGNC'] = target_hgnc_id
        target_up_id = hgnc_client.get_uniprot_id(target_hgnc_id)
        if target_up_id:
            db_refs['UP'] = target_up_id
    return Agent(target, db_refs=db_refs)
def get_gene_agent(name, gene_entrez_id):
    """Return an Agent for a gene given its name and Entrez gene ID.

    The grounding starts from the Entrez ID (EGID) plus the HGNC ID of
    the name, if any, and is passed through ``standardize_name_db_refs``.
    The standardized name is used when one is returned.
    """
    initial_refs = {'EGID': gene_entrez_id}
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        initial_refs['HGNC'] = hgnc_id
    standard_name, db_refs = standardize_name_db_refs(initial_refs)
    return Agent(standard_name or name, db_refs=db_refs)
def validate(db_ns, db_id):
    """Validate identifier, accepting HGNC name or ID.

    An HGNC identifier that is not all digits is treated as a gene name
    and is valid if it has an HGNC ID. Everything else is passed to
    ``validate_id``.
    """
    if db_ns == 'HGNC' and not db_id.isdigit():
        return get_hgnc_id(db_id) is not None
    return validate_id(db_ns, db_id)
def get_ras220_hgnc_ids():
    """Return the HGNC IDs of the symbols in the RAS pathway proteins file.

    The first tab-separated field of every row is looked up as a symbol.
    The result has one entry per row, in file order, including whatever
    the lookup returns for symbols it does not know.
    """
    # RAS 220 genes
    with open('../../indra/data/ras_pathway_proteins.csv', 'r') as fh:
        return [hgnc_client.get_hgnc_id(row[0])
                for row in csv.reader(fh, delimiter='\t')]
def _get_agent_from_gene_name(gene_name):
    """Return an Agent named after a gene, grounded to HGNC and UP if known."""
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    up_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
    # Only groundings that were actually found are recorded
    candidates = (('HGNC', hgnc_id), ('UP', up_id))
    db_refs = {db_ns: db_id for db_ns, db_id in candidates if db_id}
    return Agent(gene_name, db_refs=db_refs)
def get_dark_kinase_hgnc_ids():
    """Return the HGNC IDs of the symbols in the dark kinome table.

    The second column of every CSV row is looked up as a symbol. The
    result has one entry per row, in file order.
    """
    # All dark kinases
    table_path = '../../indra_analysis/Table_005_IDG_dark_kinome.csv'
    with open(table_path, 'r') as fh:
        symbols = [row[1] for row in csv.reader(fh)]
    return [hgnc_client.get_hgnc_id(symbol) for symbol in symbols]
def get_agent(concept, entity):
    """Return an Agent named after a concept URI, grounded via its entity URI.

    The name comes from ``gene_name_from_uri(concept)``. An HGNC grounding
    is added only if the namespace of ``entity`` is 'HGNC' and the name
    has an HGNC ID.
    """
    name = gene_name_from_uri(concept)
    namespace = namespace_from_uri(entity)
    hgnc_id = hgnc_client.get_hgnc_id(name) if namespace == 'HGNC' else None
    db_refs = {} if hgnc_id is None else {'HGNC': hgnc_id}
    return Agent(name, db_refs=db_refs)
def _get_agent_ref(agent): """Get the preferred ref for an agent for db web api.""" if agent is None: return None ag_hgnc_id = hgnc_client.get_hgnc_id(agent.name) if ag_hgnc_id is not None: return ag_hgnc_id + "@HGNC" db_refs = agent.db_refs for namespace in ['HGNC', 'FPLX', 'CHEBI', 'TEXT']: if namespace in db_refs.keys(): return '%s@%s' % (db_refs[namespace], namespace) return '%s@%s' % (agent.name, 'TEXT')
def rename_agents(self, stmts):
    """Return a list of mapped statements with updated agent names.

    The input list is deep-copied and left unmodified. In the copy, an
    agent with a FamPlex grounding is named after its FamPlex ID.
    Otherwise, if the agent has a Uniprot ID for which a gene name is
    found (without web fallback), that gene name becomes the agent name
    and the associated HGNC ID, if any, is added to its db_refs. Agents
    matching neither case keep their original name.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        List of statements whose Agents need their names updated.

    Returns
    -------
    mapped_stmts : list of :py:class:`indra.statements.Statement`
        A new list of Statements with updated Agent names
    """
    mapped_stmts = deepcopy(stmts)
    for mapped_stmt in mapped_stmts:
        present_agents = (ag for ag in mapped_stmt.agent_list()
                          if ag is not None)
        for ag in present_agents:
            refs = ag.db_refs
            if refs.get('FPLX'):
                # A FamPlex ID takes precedence as the name
                ag.name = refs.get('FPLX')
            elif refs.get('UP'):
                # Otherwise use the gene name UniProt gives, if any
                symbol = uniprot_client.get_gene_name(refs.get('UP'),
                                                      web_fallback=False)
                if symbol:
                    ag.name = symbol
                    symbol_hgnc_id = hgnc_client.get_hgnc_id(symbol)
                    if symbol_hgnc_id:
                        refs['HGNC'] = symbol_hgnc_id
    return mapped_stmts
def _initialize_node_agents(self):
    """Initialize internal dicts containing node information.

    Fills ``self._node_names`` and ``self._node_agents`` for every node.
    Nodes with a UP alias are named after the UniProt gene name. Other
    nodes use their 'n' value, which is grounded to HGNC/UP when it is a
    known symbol; unknown symbols only get an agent when
    ``self.require_grounding`` is false and are reported in the log.
    """
    invalid_genes = []
    for node in _get_dict_from_list('nodes', self.cx):
        node_id = node['@id']
        up_id = self.get_aliases(node).get('UP')

        if up_id:
            # Grounded through UniProt: name the node after the gene
            gene_name = uniprot_client.get_gene_name(up_id)
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            agent = Agent(gene_name, db_refs={'UP': up_id,
                                              'HGNC': hgnc_id,
                                              'TEXT': gene_name})
            self._node_names[node_id] = gene_name
            self._node_agents[node_id] = agent
            continue

        # No UniProt alias: interpret the node name as a gene symbol
        node_name = node['n']
        self._node_names[node_id] = node_name
        hgnc_id = hgnc_client.get_hgnc_id(node_name)
        db_refs = {'TEXT': node_name}
        if hgnc_id:
            db_refs.update({'HGNC': hgnc_id})
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            if up_id:
                db_refs.update({'UP': up_id})
            self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
        else:
            if not self.require_grounding:
                self._node_agents[node_id] = \
                    Agent(node_name, db_refs=db_refs)
            invalid_genes.append(node_name)

    if invalid_genes:
        verb = 'Skipped' if self.require_grounding else 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(invalid_genes)))
def get_grounded_agent(gene_name):
    """Return a grounded Agent based on an HGNC symbol.

    The given name is kept as the TEXT grounding. If it is a key of
    ``hgnc_map``, the mapped symbol is used for the agent name and for
    the HGNC/UP lookups.
    """
    db_refs = {'TEXT': gene_name}
    symbol = hgnc_map[gene_name] if gene_name in hgnc_map else gene_name
    hgnc_id = hgnc_client.get_hgnc_id(symbol)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(symbol, db_refs=db_refs)
def _get_agent(self, ent_name, ent_type, id, database):
    """Return a single Agent for a SIGNOR entity.

    Parameters
    ----------
    ent_name : str
        Entity name, used as the agent name unless a gene name or FamPlex
        ID replaces it.
    ent_type : str
        Entity type; together with ``database`` it selects the grounding
        type from ``_type_db_map``.
    id : str or None
        Identifier of the entity in ``database``.
    database : str or None
        Name of the database the identifier belongs to.

    Returns
    -------
    Agent
        For a SIGNOR complex, the first constituent agent with the
        remaining constituents attached as bound conditions. Otherwise an
        agent grounded according to the grounding type, or an ungrounded
        agent when the grounding type is None.

    Raises
    ------
    ValueError
        If a grounding type exists but the database is not one of
        PUBCHEM, SIGNOR, ChEBI or miRBase (and the entity is neither a
        UniProt protein nor a protein family).
    """
    if database == 'SIGNOR' and id in self.complex_map:
        agents = self._get_complex_agents(id)
        # Return the first agent with the remaining agents as a bound
        # condition
        agent = agents[0]
        agent.bound_conditions = \
            [BoundCondition(a, True) for a in agents[1:]]
        return agent

    gnd_type = _type_db_map[(ent_type, database)]
    if gnd_type == 'UP':
        up_id = id
        db_refs = {'UP': up_id}
        name = uniprot_client.get_gene_name(up_id)
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # Map SIGNOR protein families to FamPlex families
    elif ent_type == 'proteinfamily':
        db_refs = {database: id}  # Keep the SIGNOR family ID in db_refs
        key = (database, id)
        # Use SIGNOR name unless we have a mapping in FamPlex
        name = ent_name
        famplex_id = famplex_map.get(key)
        if famplex_id is None:
            logger.info('Could not find %s in FamPlex map' % str(key))
        else:
            db_refs['FPLX'] = famplex_id
            name = famplex_id
    # Other possible groundings are PUBCHEM, SIGNOR, etc.
    elif gnd_type is not None:
        if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase'):
            raise ValueError('Unexpected database %s' % database)
        if database == 'PUBCHEM' and id.startswith('CID:'):
            # We take off the CID: prefix plus fix an issue with
            # SIGNOR's format in which it leaves extra spaces around
            # the ID, as in 'CID: 923'
            id = id[4:].strip()
        db_refs = {gnd_type: id}
        name = ent_name
    # If no grounding, include as an untyped/ungrounded node
    else:
        name = ent_name
        db_refs = {}
    return Agent(name, db_refs=db_refs)
def get_gene_agents(gene_names):
    """Return Agents for the gene names that are valid HGNC symbols.

    Names without an HGNC ID are logged as a warning and skipped. Each
    agent is grounded to HGNC, and to UP when a UniProt ID is available.
    """
    agents = []
    for symbol in gene_names:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id:
            db_refs = {'HGNC': hgnc_id}
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            agents.append(Agent(symbol, db_refs=db_refs))
        else:
            logger.warning('Invalid HGNC gene symbol: %s' % symbol)
    return agents
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent named after an identifier in a given namespace.

    The identifier is always stored as the TEXT grounding. In the HGNC
    namespace it is looked up as a gene symbol to add HGNC and UP
    groundings. In any other namespace a non-None identifier is stored
    under that namespace as-is.
    """
    db_refs = {'TEXT': ag_id}
    if ag_ns != 'HGNC':
        if ag_id is not None:
            db_refs[ag_ns] = ag_id
        return Agent(ag_id, db_refs=db_refs)

    hgnc_id = hgnc_client.get_hgnc_id(ag_id)
    if hgnc_id is not None:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
    return Agent(ag_id, db_refs=db_refs)
def _get_agent_ref(agent): """Get the preferred ref for an agent for db web api.""" if agent is None: return None # TODO: This will no longer be needed when the database is refreshed. ag_hgnc_id = hgnc_client.get_hgnc_id(agent.name) if ag_hgnc_id is not None: return ag_hgnc_id + "@HGNC" db_refs = agent.db_refs for namespace in ['HGNC', 'FPLX', 'CHEBI', 'TEXT']: if namespace in db_refs.keys(): return '%s@%s' % (db_refs[namespace], namespace) return '%s@%s' % (agent.name, 'TEXT')
def get_kinase_activities():
    """Return a HasActivity 'kinase' statement for every listed kinase.

    Kinases are read from the tab-separated 'kinases.tsv' resource file,
    skipping its first line. Each remaining line must have four fields,
    of which the first is used as the UP ID and the second as the gene
    name.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    kinase_file = os.path.join(here, '../../resources/kinases.tsv')
    with open(kinase_file, 'rt') as fh:
        stripped_lines = [line.strip() for line in fh.readlines()]

    kinases = []
    for entry in stripped_lines[1:]:
        up_id, hgnc_name, _, _ = entry.split('\t')
        hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
        kinases.append(
            Agent(hgnc_name, db_refs={'UP': up_id, 'HGNC': hgnc_id}))

    from indra.statements import HasActivity
    return [HasActivity(kinase, 'kinase', True) for kinase in kinases]
def get_all_gene_names(data, out_file='prior_genes.txt'):
    """Return all gene names corresponding to all ABs.

    Gene names are gathered from the annotated antibody rows (both the
    given names and the names derived from the given UniProt IDs), minus
    symbols without an HGNC ID, plus the unannotated antibody genes, the
    drug target genes and the RAS pathway genes. The sorted, de-duplicated
    list is written to `out_file` (one name per line, UTF-8) and returned.
    Invalid symbols and name/ID inconsistencies are reported with print.
    """
    antibodies = data['antibody']
    annotated = antibodies[pandas.notnull(antibodies['Protein Data ID'])]

    collected = set()
    invalid = set()
    for gene_field, up_field in zip(annotated['Gene Name'],
                                    annotated['UniProt ID']):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [part.strip() for part in gene_field.split(',')]
        up_ids = [part.strip() for part in up_field.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(u) for u in up_ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            print('- Genes from uniprot IDs: %s' %
                  ','.join(names_from_ids))
        # Keep both the given names and the names derived from UniProt IDs
        collected.update(names, names_from_ids)

    # Remove the invalid gene names
    all_genes = list(collected - invalid)
    # Add the unannotated genes
    all_genes.extend(get_unannotated_antibody_genes(data))
    # Add drug target genes
    for targets in get_drug_targets().values():
        all_genes.extend(targets)
    # Add other important genes, for now, the RAS pathway
    all_genes.extend(get_ras227_genes())

    all_genes = sorted(set(all_genes))
    print('%d genes in total' % len(all_genes))
    with open(out_file, 'wb') as fh:
        fh.writelines(('%s\n' % gene).encode('utf-8')
                      for gene in all_genes)
    return all_genes
def get_ids_for_gene(hgnc_name, **kwargs):
    """Get the curated set of articles for a gene in the Entrez database.

    Search parameters for the Gene database query can be passed in as
    keyword arguments.

    Parameters
    ----------
    hgnc_name : string
        The HGNC name of the gene. This is used to obtain the HGNC ID
        (using the hgnc_client module) and in turn used to obtain the Entrez
        ID associated with the gene. Entrez is then queried for that ID.

    Returns
    -------
    list
        The de-duplicated text contents of all PubMedId elements in the
        response, or an empty list if the request returned nothing or
        the response contains an ERROR element.

    Raises
    ------
    ValueError
        If no HGNC ID is found for the name, or no Entrez ID is found
        for the HGNC ID.
    """
    # Resolve HGNC name -> HGNC ID -> Entrez ID
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
    if hgnc_id is None:
        raise ValueError('Invalid HGNC name.')
    entrez_id = hgnc_client.get_entrez_id(hgnc_id)
    if entrez_id is None:
        raise ValueError('Entrez ID not found in HGNC table.')

    # Query the Entrez Gene database; caller keywords override defaults
    params = dict(db='gene', retmode='xml', id=entrez_id)
    params.update(kwargs)
    tree = send_request(pubmed_fetch, params)
    if tree is None:
        return []
    error_tag = tree.find('ERROR')
    if error_tag is not None:
        logger.error(error_tag.text)
        return []

    # Get all PMIDs from the XML tree
    id_terms = tree.findall('.//PubMedId')
    if id_terms is None:
        return []
    # Use a set to remove duplicate IDs
    return list({id_term.text for id_term in id_terms})
def update_kinases():
    """Download the UniProt kinase table and add extra curated kinases.

    The table is fetched from the UniProt query URL below (reviewed Homo
    sapiens entries annotated with InterPro IPR011009, tab-separated) and
    saved as kinases.tsv under `path`. A fixed list of additional kinase
    gene symbols is then resolved to UniProt IDs and mnemonics and added
    as extra rows before the file is written back in place.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    # Collect the extra rows first and concatenate once: DataFrame.append
    # no longer exists in pandas >= 2.0 and copied the frame on each call.
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({'Entry': up_id,
                           'Gene names  (primary )': kinase,
                           'Organism ID': '9606',
                           'Entry name': up_mnemonic})
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def _add_node(self, agent, uuid=None): node_key = agent.name node_id = self._existing_nodes.get(node_key) # if the node already exists we do not want to add it again # we must however add its uuid if node_id is not None: # fetch the appropriate node n = [x for x in self._nodes if x['data']['id'] == node_id][0] uuid_list = n['data']['uuid_list'] if uuid not in uuid_list: uuid_list.append(uuid) return node_id db_refs = _get_db_refs(agent) node_id = self._get_new_id() self._existing_nodes[node_key] = node_id node_name = agent.name node_name = node_name.replace('_', ' ') expanded_families = expander.get_children(agent, ns_filter='HGNC') members = {} for member in expanded_families: hgnc_symbol = member[1] hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol) if hgnc_id: up_id = hgnc_client.get_uniprot_id(hgnc_id) member_agent = Agent(hgnc_symbol, db_refs={'HGNC': hgnc_id, 'UP': up_id}) member_db_refs = _get_db_refs(member_agent) else: member_db_refs = {} members[member[1]] = {'db_refs': member_db_refs} node = {'data': {'id': node_id, 'name': node_name, 'db_refs': db_refs, 'parent': '', 'members': members, 'uuid_list': [uuid]}} self._nodes.append(node) return node_id
def get_agent(concept, entity):
    """Return an Agent for a BEL term given its concept and entity URIs.

    The agent name is derived from `concept` with term_from_uri and the
    name space from `entity` with namespace_from_uri. Depending on the
    name space, db_refs are filled in with HGNC/UP, MGI/RGD, BEL
    family/complex (mapped to 'BE' when a mapping exists), CHEBI or EGID
    entries. Unhandled name spaces yield an Agent without db_refs.
    """
    name = term_from_uri(concept)
    namespace = namespace_from_uri(entity)
    db_refs = {}
    # Warning templates for BEL families and named complexes
    unmapped_msgs = {
        'PFH': 'Could not find mapping for BEL family: %s',
        'SFAM': 'Could not find mapping for BEL family: %s',
        'NCH': 'Could not find mapping for BEL complex: %s',
        'SCOMP': 'Could not find mapping for BEL complex: %s',
    }

    if namespace == 'HGNC':
        agent_name = name
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if hgnc_id is None:
            logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % name)
        else:
            db_refs['HGNC'] = str(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
    elif namespace in ('MGI', 'RGD'):
        agent_name = name
        db_refs[namespace] = name
    elif namespace in unmapped_msgs:
        indra_name = bel_to_indra.get(name)
        db_refs[namespace] = name
        if indra_name is None:
            agent_name = name
            msg = unmapped_msgs[namespace] % name
            logger.warning(msg)
        else:
            db_refs['BE'] = indra_name
            db_refs['TEXT'] = name
            agent_name = indra_name
    elif namespace == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
        else:
            logger.warning('CHEBI name %s not found in map.' % name)
        agent_name = name
    elif namespace == 'EGID':
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs['EGID'] = name
        if hgnc_id is None:
            logger.warning('Could not map EGID%s to HGNC.' % name)
            agent_name = 'E%s' % name
        else:
            db_refs['HGNC'] = str(hgnc_id)
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
    else:
        logger.warning('Unhandled entity namespace: %s' % namespace)
        print('%s, %s' % (concept, entity))
        agent_name = name

    return Agent(agent_name, db_refs=db_refs)
def id(gene_name):
    """Return what hgnc_client.get_hgnc_id gives for `gene_name`."""
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    return hgnc_id
def _get_agent_from_ref(self, ref): # TODO: handle collections if ref.attrib.get('category') == 'collection': #logger.warning('Skipping collection Agent.') return None # Find the name, uid and raw-text tags first and get their text # content if available uid_tag = ref.find("var/[@name='uid']") name_tag = ref.find("var/[@name='name']") text_tag = ref.find("var/[@name='raw-text']") if name_tag is not None and name_tag.text: name = name_tag.text else: name = None if uid_tag is not None and uid_tag.text: uid = uid_tag.text else: uid = None if text_tag is not None and text_tag.text: raw_text = text_tag.text else: raw_text = None # TODO: factor this out and reuse fix_agents db_refs = {} # Save raw text if available if raw_text: db_refs['TEXT'] = raw_text agent_name = raw_text # If we have a proper UID then we try to reconstruct an Agent from that if uid is not None and len(uid.split(':')) == 2: db_ns, db_id = uid.split(':') be_id = famplex_map.get((db_ns, db_id)) if be_id: db_refs[db_ns] = db_id db_refs['FPLX'] = be_id agent_name = be_id elif db_ns in ['UP', 'Uniprot']: db_refs['UP'] = db_id gene_name = uniprot_client.get_gene_name(db_id) if gene_name: agent_name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif db_ns == 'NCIT': db_refs['NCIT'] = db_id target = ncit_map.get(db_id) if target: db_refs[target[0]] = target[1] if target[0] == 'HGNC': up_id = hgnc_client.get_uniprot_id(target[1]) agent_name = hgnc_client.get_hgnc_name(target[1]) if up_id: db_refs['UP'] = up_id elif target[0] == 'UP': agent_name = uniprot_client.get_gene_name(target[1]) if agent_name: hgnc_id = hgnc_client.get_hgnc_id(agent_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif db_ns == 'FA': db_refs['NXP'] = 'FA:' + db_id elif db_ns == 'XFAM': db_refs['PF'] = db_id.split('.')[0] elif db_ns == 'CHEBI': db_refs['CHEBI'] = 'CHEBI:' + db_id elif db_ns in ['GO', 'MESH', 'FPLX']: db_refs[db_ns] = db_id # Handle old BE mappings and add them as FPLX elif 
db_ns == 'BE': db_refs['FPLX'] = db_id elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']: db_refs[db_ns] = db_id else: logger.warning('Unknown database name space %s' % db_ns) if not agent_name: if raw_text is not None: agent_name = raw_text else: return None assert(agent_name) agent = Agent(agent_name, db_refs=db_refs) return agent
def _fix_agent(agent): if agent is None: return # First we fix some name spaces db_refs_tmp = copy(agent.db_refs) for db_ns, db_id in agent.db_refs.items(): # Change FA name space if db_ns == 'FA': db_refs_tmp.pop('FA', None) db_refs_tmp['NXPFA'] = db_id # Change IPR name space elif db_ns == 'IPR': db_refs_tmp.pop('IPR', None) db_refs_tmp['IP'] = db_id # Change XFAM name space elif db_ns == 'XFAM': db_refs_tmp.pop('XFAM', None) db_refs_tmp['PF'] = db_id.split('.')[0] elif db_ns == 'GO': if db_id.startswith('GO:'): db_refs_tmp['GO'] = db_id else: db_refs_tmp['GO'] = 'GO:' + db_id # Change PCID name space elif db_ns == 'PCID': db_refs_tmp.pop('PCID', None) db_refs_tmp['PUBCHEM'] = db_id agent.db_refs = db_refs_tmp # Check if we have a FPLX entry and handle old BE mappings if 'BE' in agent.db_refs: agent.db_refs['FPLX'] = agent.db_refs.pop('BE') be_id = agent.db_refs.get('FPLX') # Try to map to FPLX from NXP, IPR, PF, NCIT if not be_id: for db_ns, db_id in agent.db_refs.items(): be_id = famplex_map.get((db_ns, db_id)) if be_id: break # Try mapping NCIT to specific genes if possible if not be_id and 'NCIT' in agent.db_refs: target = ncit_map.get(agent.db_refs['NCIT']) if target: agent.db_refs[target[0]] = target[1] # Check what entries we have up_id = agent.db_refs.get('UP') hgnc_id = agent.db_refs.get('HGNC') # FPLX takes precedence if we have it if be_id: agent.db_refs['FPLX'] = be_id agent.name = be_id elif hgnc_id: gene_name = hgnc_client.get_hgnc_name(hgnc_id) if gene_name: agent.name = gene_name if not up_id: up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: agent.db_refs['UP'] = up_id elif up_id: gene_name = uniprot_client.get_gene_name(up_id) if gene_name: agent.name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: agent.db_refs['HGNC'] = hgnc_id # If it doesn't have a gene name, it's better to just # use the raw string name otherwise Sparser sets # has Uniprot IDs or mnemonics as the name else: name = agent.db_refs.get('TEXT', 
agent.name) agent.name = name
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent for a node data dict, or None if it is not handled.

    Parameters
    ----------
    node_data : dict
        Node description keyed by the `pc` constants (function, name,
        name space, identifier, members, ...).
    node_modifier_data : dict or None
        Optional modifier data for the node, from which the activity
        condition and translocation target are derived.

    Returns
    -------
    Agent or None
        None for unhandled node functions, fusions, complexes without
        (resolvable) members, nodes without usable identifier information
        and nodes with unhandled modifiers.

    Raises
    ------
    ValueError
        If the node has an identifier in the MGI or RGD name space.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    handled_funcs = (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                     pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA)
    if node_func not in handled_funcs:
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s"
                    % str(node_data))
        return None

    # COMPLEXES ------------
    # Complexes consist recursively of other agents
    if node_func == pc.COMPLEX:
        # If there are no members, we assume this is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # The first member is the "main" agent, to which the other
        # members are attached as bound conditions
        main_agent = get_agent(members[0])
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(member), True)
                            for member in members[1:]]
        # Give up if any of the other members could not be resolved
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        main_agent.activity = _get_activity_condition(node_modifier_data)
        return main_agent

    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    db_refs = None

    if ident:
        # We've already got an identifier, look up other identifiers if
        # necessary, overwriting the existing name where needed
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
                else:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            print("Unhandled namespace with identifier: %s: %s (%s)" %
                  (ns, name, node_data))
    else:
        # No ID present, get identifier using the name, namespace
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)"
                            % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is None:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
            else:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            print("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                        node_data))

    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None

    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    activity = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    return Agent(name, db_refs=db_refs, mods=mods, mutations=muts,
                 activity=activity, location=to_loc)
def map_agents(self, stmts, do_rename=True):
    """Return copies of the statements with agents re-grounded via the map.

    Each agent's TEXT db_ref is looked up in the grounding map `self.gm`.
    Agents without TEXT or without a map entry are left untouched. If the
    entry is None, the whole statement is filtered out. Otherwise the
    agent's db_refs are replaced by a copy of the entry in which an HGNC
    symbol is replaced by the HGNC ID and missing HGNC/UP entries are
    filled in from each other where possible.

    Parameters
    ----------
    stmts : list
        Statements to map; they are deep-copied, not modified.
    do_rename : bool
        If True, mapped agents are renamed to the BE entry if there is
        one, else the HGNC symbol from the map, else the gene name
        obtained from UniProt.

    Returns
    -------
    list
        The mapped copies of the statements that were not filtered out.

    Raises
    ------
    ValueError
        If an HGNC symbol in the map has no HGNC ID, if a UniProt ID given
        together with a symbol has no gene name, or if that gene name
        does not match the symbol.
    """
    mapped_stmts = []
    num_skipped = 0
    for stmt in stmts:
        mapped_stmt = deepcopy(stmt)
        skip_stmt = False
        for agent in mapped_stmt.agent_list():
            if agent is None or agent.db_refs.get('TEXT') is None:
                continue
            agent_text = agent.db_refs.get('TEXT')
            # Look this string up in the grounding map
            # If not in the map, leave agent alone and continue
            try:
                gm_entry = self.gm[agent_text]
            except KeyError:
                continue
            # If it's in the map but it maps to None, then filter out
            # this statement by skipping it
            if gm_entry is None:
                # Increase counter if this statement has not already
                # been skipped via another agent
                if not skip_stmt:
                    num_skipped += 1
                logger.debug("Skipping %s" % agent_text)
                skip_stmt = True
                continue

            # Work on a copy so that the grounding map is not modified
            gene_name = None
            map_db_refs = deepcopy(gm_entry)
            up_id = map_db_refs.get('UP')
            hgnc_sym = map_db_refs.get('HGNC')
            if up_id and not hgnc_sym:
                gene_name = uniprot_client.get_gene_name(up_id, False)
                if gene_name:
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        map_db_refs['HGNC'] = hgnc_id
            elif hgnc_sym and not up_id:
                # Override the HGNC symbol entry from the grounding
                # map with an HGNC ID
                hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                if not hgnc_id:
                    raise ValueError('No HGNC ID corresponding to gene '
                                     'symbol %s in grounding map.' %
                                     hgnc_sym)
                map_db_refs['HGNC'] = hgnc_id
                # Now get the Uniprot ID for the gene
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    map_db_refs['UP'] = up_id
            # If we have both, check the gene symbol ID against the
            # mapping from Uniprot
            elif up_id and hgnc_sym:
                gene_name = uniprot_client.get_gene_name(up_id)
                if not gene_name:
                    raise ValueError('No gene name found for Uniprot '
                                     'ID %s (expected %s)' %
                                     (up_id, hgnc_sym))
                if gene_name != hgnc_sym:
                    raise ValueError('Gene name %s for Uniprot ID '
                                     '%s does not match HGNC '
                                     'symbol %s given in grounding '
                                     'map.' %
                                     (gene_name, up_id, hgnc_sym))
                hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                if not hgnc_id:
                    raise ValueError('No HGNC ID '
                                     'corresponding to gene '
                                     'symbol %s in grounding '
                                     'map.' % hgnc_sym)
                # As in the symbol-only case, the symbol from the map
                # has to be replaced by the HGNC ID; otherwise the
                # agent would end up with a symbol in db_refs['HGNC']
                map_db_refs['HGNC'] = hgnc_id

            # Assign the DB refs from the grounding map to the agent
            agent.db_refs = map_db_refs
            # Are we renaming right now?
            if do_rename:
                # If there's a Bioentities ID, prefer that for the name
                if agent.db_refs.get('BE'):
                    agent.name = agent.db_refs.get('BE')
                # Get the HGNC symbol or gene name (retrieved above)
                elif hgnc_sym is not None:
                    agent.name = hgnc_sym
                elif gene_name is not None:
                    agent.name = gene_name
        # Check if we should skip the statement
        if not skip_stmt:
            mapped_stmts.append(mapped_stmt)
    logger.info('%s statements filtered out' % num_skipped)
    return mapped_stmts
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity description with the keys 'entityText', 'entityId' (a list
        of dicts with 'source', 'idString' and, for mutations,
        'entityType'), 'charStart' and 'charEnd'.

    Returns
    -------
    tuple
        (Agent, (charStart, charEnd)), or (None, None) if the entity has
        more than one Entrez or more than one UniProt entry.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    # Ambiguous gene/protein groundings are not resolved, just skipped
    ref_counts = Counter([entry['source'] for entry in
                          entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None

    muts = []
    for id_dict in entity_info['entityId']:
        source = id_dict['source']
        id_string = id_dict['idString']
        if source == 'Entrez':
            refs['EGID'] = id_string
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_string)
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif source == 'UniProt':
            refs['UP'] = id_string
            gene_name = uniprot_client.get_gene_name(id_string)
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt here,
                        # refs['HGNC'] the one set earlier from Entrez
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_string
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_string
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_string
        # Handle mutations
        elif source == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_string.startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_string.split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Deliberately best-effort: a mutation that cannot be
                # processed is logged and left out
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_string)
            else:
                logger.info('Unhandled mutation: %s' % id_string)
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords