def get_ligands():
    """Assemble the set of gene symbols that are treated as ligands.

    The candidate pool is the union of (a) the spreadsheet rows flagged
    'yes' in ``SURFACE_PROTEINS_WB``, (b) genes annotated with the ligand
    GO terms listed below or their child terms, (c) an (empty) manual set
    and (d) the result of ``get_cpdb_ligands()``. Candidates are mapped to
    their current HGNC names, and anything returned by
    ``get_cpdb_receptors()`` or ``get_ion_channels()`` is removed.

    Returns
    -------
    set
        The resulting set of gene symbols.
    """
    def _to_current_symbols(symbols):
        # Map every symbol to its current HGNC name; lookups that come
        # back as None are discarded.
        return {
            hgnc_client.get_hgnc_name(hgnc_client.get_current_hgnc_id(sym))
            for sym in symbols
        } - {None}

    # Read and extract cell surface proteins from CSPA DB
    workbook = openpyxl.load_workbook(SURFACE_PROTEINS_WB)
    surface_protein_set = {
        row[4].value
        for row in workbook['Sheet 1']
        if row[6].value == 'yes'
    }
    updated_surface_protein_set = _to_current_symbols(surface_protein_set)
    # Note that the count reported is that of the raw (un-normalized) set
    logger.info('Got %d surface proteins from spreadsheet' %
                len(surface_protein_set))

    ligand_terms = [
        'cytokine activity',
        'hormone activity',
        'growth factor activity',
        'extracellular matrix structural constituent',
    ]

    # Resolve the GO term names to GO IDs, then pull in all child terms
    go_ids = expand_with_child_go_terms([
        bio_ontology.get_id_from_name('GO', term)[1]
        for term in ligand_terms
    ])

    # Convert the GO IDs to gene symbols
    go_genes = get_genes_for_go_ids(go_ids)

    manual_ligands = set()
    candidates = (updated_surface_protein_set | go_genes
                  | manual_ligands | get_cpdb_ligands())

    # Normalize the merged pool once more, then drop receptors/ion channels
    current_candidates = _to_current_symbols(candidates)
    excluded = get_cpdb_receptors() | get_ion_channels()
    return current_candidates - excluded
def match_reactome(z_sc, reactome_dict):
    """Collect the Reactome pathways shared by each correlated gene pair.

    Parameters
    ----------
    z_sc :
        Correlation matrix; it is handed to ``corr_matrix_to_generator``,
        which is iterated as ``(a, b, corr)`` triples.
    reactome_dict : dict
        Mapping queried with UniProt IDs; each value is turned into a set
        (presumably pathway identifiers - confirm against the caller).

    Returns
    -------
    dict
        Column-oriented results with the keys ``agA_hgnc``, ``agA_up``,
        ``agB_hgnc``, ``agB_up``, ``z_sc``, ``has_pathways`` and
        ``common_pathways``. Pairs for which either gene has no UniProt ID
        are skipped.
    """
    logger.info('Generating generator')
    corr_iterator = corr_matrix_to_generator(z_sc)
    res = {
        'agA_hgnc': [],
        'agA_up': [],
        'agB_hgnc': [],
        'agB_up': [],
        'z_sc': [],
        'has_pathways': [],
        'common_pathways': [],
    }
    logger.info('Looping correlations')
    for a, b, corr in corr_iterator:
        a_up = _first_uniprot_id(get_current_hgnc_id(a))
        if a_up is None:
            continue

        b_up = _first_uniprot_id(get_current_hgnc_id(b))
        if b_up is None:
            continue

        common_reactome = (set(reactome_dict.get(a_up, []))
                           & set(reactome_dict.get(b_up, [])))
        res['agA_hgnc'].append(a)
        res['agA_up'].append(a_up)
        res['agB_hgnc'].append(b)
        res['agB_up'].append(b_up)
        res['z_sc'].append(corr)
        res['common_pathways'].append(common_reactome)
        res['has_pathways'].append(bool(common_reactome))
    logger.info('Returning results')
    return res


def _first_uniprot_id(hgnc_id):
    """Return the first non-None UniProt ID for one or several HGNC IDs.

    ``hgnc_id`` is either a single ID or a list of candidate IDs (the
    latter is what ``get_current_hgnc_id`` yields for an ambiguous outdated
    symbol). The previous inline ``while True`` loop never terminated once
    a candidate *did* resolve; iterating the candidates directly stops at
    the first hit and returns None when none resolves.
    """
    candidates = hgnc_id if isinstance(hgnc_id, list) else [hgnc_id]
    for candidate in candidates:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None
def test_get_current_id():
    """Check get_current_hgnc_id on current, outdated and ambiguous symbols."""
    resolve = hgnc_client.get_current_hgnc_id

    # A symbol that is still current resolves directly to its ID
    assert resolve('BRAF') == '1097'

    # An outdated symbol that maps onto exactly one current ID
    assert resolve('SEPT7') == '1717'

    # An outdated symbol that maps onto several current IDs yields a
    # collection of candidates
    candidates = resolve('HOX1')
    assert len(candidates) == 10
    assert '5101' in candidates
def _replace_outdated_hgnc_symbols(self, pc_old, pc_current):
    """Rewrite a three-column TSV with outdated HGNC symbols updated.

    Parameters
    ----------
    pc_old : str
        Path of the header-less, tab-separated input file. Its first three
        columns are treated as source, relation type and target. The file
        is deleted once the output has been written.
    pc_current : str
        Path the updated table is written to (tab-separated, no header,
        no index).
    """
    logger.info('Replacing outdated HGNC symbols in %s and save as %s' %
                (pc_old, pc_current))
    pc = pandas.read_csv(pc_old, sep='\t', dtype=str, header=None)
    pc = pc.rename(mapper={0: 'source', 1: 'rel_type', 2: 'target'},
                   axis='columns')

    # Build the old-symbol -> current-symbol replacement table
    symbol_map = {}
    for sym in set(pc['source']).union(pc['target']):
        # Entries carrying a CHEBI: prefix are not gene symbols
        if sym.startswith('CHEBI:'):
            continue
        hgnc_id = hgnc_client.get_current_hgnc_id(sym)
        # Skip symbols that cannot be resolved, and outdated symbols that
        # are ambiguous (i.e. map to multiple genes)
        if not hgnc_id or isinstance(hgnc_id, list):
            continue
        latest_symbol = hgnc_client.get_hgnc_name(hgnc_id)
        if latest_symbol != sym:
            symbol_map[sym] = latest_symbol

    if symbol_map:
        pc.replace(symbol_map, inplace=True)
    pc.to_csv(pc_current, sep='\t', header=False, index=False)
    os.remove(pc_old)
def _get_genes( record: Mapping[str, Any], prefix: str, key: str, ) -> List[Tuple[str, str, str]]: rv = [] #: A list of 2-tuples with the gene symbol then the expression value expressions = record[key] for symbol, _ in expressions: if prefix == "HGNC": current_id = hgnc_client.get_current_hgnc_id(symbol) # We may get no current IDs or more than one current IDs # in which case we skip this gene if not current_id or isinstance(current_id, list): identifier = None else: identifier = current_id _prefix = "HGNC" elif prefix == "MGI": _prefix, identifier = "UP", get_id_from_mgi_name(symbol) elif prefix == "RGD": _prefix, identifier = "UP", get_id_from_rgd_name(symbol) else: raise ValueError(f"Invalid prefix: {prefix} ! {symbol}") if identifier is None: if (prefix, symbol) not in MISSING_NAMES: logger.debug( f"Could not look up {symbol} by name in {prefix}", ) MISSING_NAMES.add((prefix, symbol)) continue rv.append((_prefix, identifier, symbol)) return rv
def get_statements_for_kinase_db_api(kinase):
    """Return Statements about a kinase, sorted by evidence count.

    Parameters
    ----------
    kinase : str
        Gene symbol of the kinase (possibly outdated).

    Returns
    -------
    list or None
        Statements returned by ``get_statements`` for the kinase, passed
        through ``filter_out_medscan`` and sorted by decreasing number of
        evidences. None if the symbol cannot be resolved to exactly one
        current HGNC ID.
    """
    logger.info('Getting statements for %s' % kinase)
    hgnc_id = hgnc_client.get_current_hgnc_id(kinase)
    if not hgnc_id:
        logger.warning('Could not get HGNC ID for %s' % kinase)
        return None
    # An outdated symbol can resolve to several current IDs, in which case
    # a list is returned. Formatting that list into the agent string would
    # produce a meaningless query, so treat it as unresolved instead.
    if isinstance(hgnc_id, list):
        logger.warning('More than one current HGNC ID for %s' % kinase)
        return None
    ip = get_statements(agents=['%s@HGNC' % hgnc_id], ev_limit=10000)
    stmts = filter_out_medscan(ip.statements)
    stmts = sorted(stmts, key=lambda x: len(x.evidence), reverse=True)
    return stmts
def get_hgnc_ids(gene_names):
    """Resolve gene names to their current HGNC IDs.

    Names containing a '.' are rejected up front; names for which no
    current HGNC ID is found are reported on stdout and skipped.

    Parameters
    ----------
    gene_names : iterable of str
        The gene names to resolve.

    Returns
    -------
    list
        The lookup results for the names that could be resolved, in input
        order.
    """
    resolved = []
    for name in gene_names:
        if '.' in name:
            print('%s is not an HGNC ID' % name)
        else:
            current_id = hgnc_client.get_current_hgnc_id(name)
            if current_id:
                resolved.append(current_id)
            else:
                print('Invalid gene symbol: %s' % name)
    return resolved
def align_identifiers_urls(indra_groundings, dm_urls):
    """Match identifiers.org URLs against a collection of INDRA groundings.

    Parameters
    ----------
    indra_groundings : collection of tuple
        ``(namespace, identifier)`` pairs to match against.
    dm_urls : iterable of str
        URLs of the form ``https://identifiers.org/<namespace>:<id>``.

    Returns
    -------
    list of tuple
        One ``(dm_url, matched_url)`` pair per entity URL, where
        ``matched_url`` is ``get_identifiers_url(*grounding)`` for the
        matching grounding or None if nothing matched. URLs in the ``doi``
        and ``pubmed`` namespaces are skipped entirely.
    """
    matches = []
    identifiers_prefix = 'https://identifiers.org/'
    for dm_url in dm_urls:
        # We do it this way instead of splitting because of DOIs which have
        # extra slashes
        entity = dm_url[len(identifiers_prefix):]
        db_ns, db_id = entity.split(':', maxsplit=1)
        # In every branch db_refs is a list of candidate grounding dicts
        if db_ns == 'CHEBI':
            db_refs = [
                standardize_db_refs({'CHEBI': '%s:%s' % (db_ns, db_id)})
            ]
        elif db_ns == 'hgnc':
            db_refs = [standardize_db_refs({'HGNC': db_id})]
        elif db_ns == 'hgnc.symbol':
            hgnc_id = hgnc_client.get_current_hgnc_id(db_id)
            if not hgnc_id:
                # Unknown symbol: nothing to standardize or match
                db_refs = [{}]
            else:
                # An outdated symbol may resolve to several current IDs;
                # consider each of them as a candidate grounding
                hgnc_ids = hgnc_id if isinstance(hgnc_id, list) \
                    else [hgnc_id]
                db_refs = [
                    standardize_db_refs({'HGNC': candidate})
                    for candidate in hgnc_ids
                ]
        elif db_ns == 'pubchem.compound':
            db_refs = [standardize_db_refs({'PUBCHEM': db_id})]
        elif db_ns == 'uniprot':
            db_refs = [standardize_db_refs({'UP': db_id})]
        elif db_ns == 'bigg.metabolite':
            chebi_ids = bigg_to_chebi.get(db_id)
            if chebi_ids:
                db_refs = [
                    standardize_db_refs({'CHEBI': chebi_id})
                    for chebi_id in chebi_ids
                ]
            else:
                db_refs = [{}]
        elif db_ns == 'ncbigene':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(db_id)
            if hgnc_id:
                db_refs = [standardize_db_refs({'HGNC': hgnc_id})]
            else:
                db_refs = [{}]
        # Skip literature references that aren't entities
        elif db_ns in {'doi', 'pubmed'}:
            continue
        else:
            print('Unhandled namespace %s' % db_ns)
            # Keep the same list-of-dicts shape as all other branches
            db_refs = [{}]

        matched = None
        # Within one candidate the first matching entry wins; a later
        # candidate that also matches replaces an earlier one.
        for db_ref in db_refs:
            for k, v in db_ref.items():
                if (k, v) in indra_groundings:
                    matched = (k, v)
                    break
        matches.append(
            (dm_url, get_identifiers_url(*matched) if matched else None))
    return matches
def add_famplex_hierarchy(self):
    """Add the relations from the FamPlex relations table as edges.

    Every row of ``famplex/relations.csv`` is read as
    ``(ns1, id1, rel, ns2, id2)``. When the first namespace is ``HGNC``
    the first identifier is passed through
    ``hgnc_client.get_current_hgnc_id`` before labelling. Each row becomes
    an edge from the first to the second entity with the relation stored
    under the ``type`` attribute.
    """
    from indra.databases import hgnc_client

    relations_path = get_resource_path(
        os.path.join('famplex', 'relations.csv'))
    famplex_edges = []
    for ns1, id1, rel, ns2, id2 in read_unicode_csv(relations_path,
                                                    delimiter=','):
        if ns1 == 'HGNC':
            id1 = hgnc_client.get_current_hgnc_id(id1)
        source_label = self.label(ns1, id1)
        target_label = self.label(ns2, id2)
        famplex_edges.append((source_label, target_label, {'type': rel}))
    self.add_edges_from(famplex_edges)
def _hgncsym2up(hgnc_symb: str) -> str:
    """Return a UniProt ID for an HGNC symbol.

    If the symbol resolves to a list of candidate HGNC IDs, the candidates
    are tried in order and the first non-None UniProt ID is returned; None
    is returned when no candidate yields one. Otherwise the single lookup
    result is passed straight to ``get_uniprot_id``.
    """
    current = get_current_hgnc_id(hgnc_symb)
    if not isinstance(current, list):
        return get_uniprot_id(current)

    # Several candidate IDs: take the first one with a UniProt mapping
    for candidate in current:
        found = get_uniprot_id(candidate)
        if found is not None:
            return found
    return None
def _get_upid_from_hgnc_symbol(hgnc_gene: str) -> Union[str, None]:
    """Return a UniProt ID for an HGNC gene symbol, or None.

    Parameters
    ----------
    hgnc_gene :
        The (possibly outdated) HGNC symbol to look up.

    Returns
    -------
    :
        The UniProt ID of the gene. If the symbol resolves to a list of
        candidate HGNC IDs, the first candidate that has a UniProt ID is
        used. None if no UniProt ID can be found.
    """
    hgnc_id = get_current_hgnc_id(hgnc_gene)
    if hgnc_id is None:
        return None
    if isinstance(hgnc_id, list):
        # The previous `while True` loop only advanced on a miss and never
        # left the loop on a hit, so it spun forever as soon as a
        # candidate resolved. Iterate the candidates and stop at the
        # first hit instead.
        for candidate in hgnc_id:
            up_id = get_uniprot_id(candidate)
            if up_id is not None:
                return up_id
        return None
    return get_uniprot_id(hgnc_id)
def get_grounded_agent(gene_name):
    """Return a grounded Agent based on an HGNC symbol.

    Parameters
    ----------
    gene_name : str
        The gene symbol as it appears in the input. It is always kept as
        the ``TEXT`` entry of the Agent's db_refs. If it is a key of
        ``hgnc_map`` the mapped symbol is used for the lookup and as the
        Agent name.

    Returns
    -------
    Agent
        An Agent with ``HGNC`` (and, when available, ``UP``) groundings if
        the symbol resolves to exactly one HGNC ID; otherwise an Agent
        that only carries the ``TEXT`` entry.
    """
    db_refs = {'TEXT': gene_name}
    if gene_name in hgnc_map:
        gene_name = hgnc_map[gene_name]

    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    if not hgnc_id:
        # Fall back to treating the name as a possibly outdated symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(gene_name)
        # An outdated symbol may map to several current IDs (returned as
        # a list). A list is not a valid grounding, so leave the Agent
        # ungrounded rather than storing it or guessing one entry.
        if isinstance(hgnc_id, list):
            hgnc_id = None

    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(gene_name, db_refs=db_refs)
def get_all_kinase_hgnc_ids():
    """Return current HGNC IDs for the kinases in the IDG target table.

    Reads ``../../idg_ion_channels/data/IDG_target_final.csv`` (relative to
    the working directory), skips its first row, and keeps the first
    column of every row whose second column equals ``'Kinase'``. Names
    without a current HGNC ID are reported on stdout and left out.

    Returns
    -------
    list
        The lookup results for the kinase names that could be resolved.
    """
    with open('../../idg_ion_channels/data/IDG_target_final.csv', 'r') as fh:
        rows = csv.reader(fh, delimiter=',')
        # Discard the header row
        next(rows)
        kinase_names = [row[0] for row in rows if row[1] == 'Kinase']

    kinase_ids = []
    for name in kinase_names:
        current_id = hgnc_client.get_current_hgnc_id(name)
        if current_id:
            kinase_ids.append(current_id)
        else:
            print('Could not get HGNC ID for %s' % name)
    return kinase_ids
def sanitize_hgnc_ids(raw_hgnc_ids):
    """Normalize a mix of HGNC IDs and gene symbols to a list of HGNC IDs.

    Parameters
    ----------
    raw_hgnc_ids : iterable of str
        Each entry is a bare numeric HGNC ID (``'1097'``), an ID with an
        ``hgnc:`` prefix in any letter case (``'HGNC:1097'``), or a gene
        symbol. Surrounding whitespace is ignored.

    Returns
    -------
    list of str
        The de-duplicated HGNC IDs, in no particular order. Symbols are
        resolved to their current ID(s); an outdated symbol that maps to
        several IDs contributes all of them, and unresolvable symbols are
        dropped.
    """
    hgnc_ids = set()
    for raw_hgnc_id in raw_hgnc_ids:
        candidate = raw_hgnc_id.strip()
        # Check if it's an ID first. The whole string has to be an ID:
        # matching only at the start would turn any entry that merely
        # begins with digits (e.g. '7SK') into the bogus ID '7'.
        id_match = re.fullmatch(r'(?:hgnc:)?([0-9]+)', candidate,
                                flags=re.IGNORECASE)
        if id_match:
            hgnc_ids.add(id_match.group(1))
            continue
        # If not, we assume it's a symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(candidate)
        if isinstance(hgnc_id, list):
            hgnc_ids |= set(hgnc_id)
        elif hgnc_id:
            hgnc_ids.add(hgnc_id)
    return list(hgnc_ids)
def map_hgnc_symbols(hgnc_symbols):
    """Return references based on a list of HGNC symbols.

    Parameters
    ----------
    hgnc_symbols : iterable of str
        The HGNC symbols to map.

    Returns
    -------
    list of dict
        One dict with the keys ``HGNC_SYMBOL``, ``HGNC`` and ``UP`` per
        symbol that resolves to exactly one current HGNC ID and has a
        UniProt ID. Every other symbol is logged with a warning and left
        out of the result.
    """
    refs = []
    for symbol in hgnc_symbols:
        hgnc_id = hgnc_client.get_current_hgnc_id(symbol)
        if not hgnc_id:
            logger.warning('Could not get HGNC ID for symbol %s' % symbol)
            continue
        if isinstance(hgnc_id, list):
            logger.warning('More than one current HGNC ID for outdated '
                           'symbol %s' % symbol)
            continue

        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not uniprot_id:
            logger.warning('Could not get UniProt ID for symbol %s' % symbol)
            continue

        refs.append({
            'HGNC_SYMBOL': symbol,
            'HGNC': hgnc_id,
            'UP': uniprot_id,
        })
    return refs
def get_stmts_for_gene(gene: str, max_stmts: int = 100000) -> List[Statement]:
    """Return all existing Statements for a given gene from the DB.

    Parameters
    ----------
    gene :
        The HGNC symbol of a gene to query.
    max_stmts:
        The maximum number of statements to return

    Returns
    -------
    :
        A list of INDRA Statements in which the given gene is involved.
        The list is empty if the symbol cannot be resolved to exactly one
        current HGNC ID.
    """
    hgnc_id = hgnc_client.get_current_hgnc_id(gene)
    # The lookup yields None for an unknown symbol and a list of IDs for
    # an outdated symbol that maps to several genes. Neither identifies a
    # single agent to query for, so no Statements are returned.
    if not hgnc_id or isinstance(hgnc_id, list):
        return []
    agents = [
        (None, hgnc_id, "HGNC"),
    ]
    res = get_raw_stmt_jsons_from_agents(agents=agents, max_stmts=max_stmts)
    return stmts_from_json(res.values())
def up_for_hgnc(gene):
    """Return HGNC symbol and UniProt ID for a potentially outdated gene name.

    Parameters
    ----------
    gene : str
        The gene name to look up.

    Returns
    -------
    tuple
        ``(hgnc_name, up_id)``. If the name has no current HGNC ID, or more
        than one, the input name is returned unchanged together with None.
        If the UniProt entry is a comma-separated list, its first element
        is used.
    """
    hgnc_id = hgnc_client.get_current_hgnc_id(gene)

    # No current HGNC ID, or more than one HGNC ID, for this gene
    if hgnc_id is None or isinstance(hgnc_id, list):
        return gene, None

    hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
    up_id_str = hgnc_client.get_uniprot_id(hgnc_id)
    if up_id_str is None:
        # No UniProt ID for this HGNC ID
        return hgnc_name, None
    if ',' in up_id_str:
        # Several comma-separated IDs: keep only the first one
        first_up_id = [part.strip() for part in up_id_str.split(',')][0]
        return hgnc_name, first_up_id
    return hgnc_name, up_id_str
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If no standard name
        can be determined, the input name is returned (in the Entrez
        branch, an unmapped ID is returned with an 'E' prefix).
    db_refs : dict or None
        The grounding for the given entity, or None if the name space is
        not handled or the lookup for the name failed.
    """
    # Stays None unless one of the branches below finds a grounding
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            # Ambiguous outdated symbol: the first candidate ID is used
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        # Replace the (possibly outdated) symbol with the name looked up
        # for the chosen ID
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        # Prefer the HGNC name when the UniProt ID maps to an HGNC ID,
        # otherwise fall back to the UniProt gene name
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This one label is rewritten before the GO lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        # If the lookup misses, db_refs stays None and name is unchanged
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        # If the lookup misses, db_refs stays None and name is unchanged
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        # The Entrez ID is recorded whether or not it maps to HGNC
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.',
                    name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            # Without an HGNC mapping, the ID prefixed with 'E' is the name
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        # Unknown name space: db_refs is still None at this point
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs