def match_reactome(z_sc, reactome_dict):
    """Collect the Reactome pathways shared by each correlated gene pair.

    Parameters
    ----------
    z_sc
        Correlation data passed to ``corr_matrix_to_generator``; the
        resulting iterator is consumed as ``(a, b, corr)`` triples.
    reactome_dict : dict
        Mapping looked up by UniProt ID; each value is an iterable of
        pathway identifiers. Missing IDs are treated as having no pathways.

    Returns
    -------
    dict
        Column-oriented results with the keys ``agA_hgnc``, ``agA_up``,
        ``agB_hgnc``, ``agB_up``, ``z_sc``, ``has_pathways`` and
        ``common_pathways``. Each list has one entry per retained pair;
        pairs where either gene has no UniProt ID are dropped.
    """

    def _first_uniprot_id(gene):
        # get_current_hgnc_id can yield either a single ID or a list of
        # candidate IDs. For a list, take the first candidate that maps to
        # a UniProt ID. The previous `while True` loop only broke on
        # IndexError and therefore never terminated once an ID was found.
        hgnc_id = get_current_hgnc_id(gene)
        if not isinstance(hgnc_id, list):
            return get_uniprot_id(hgnc_id)
        for candidate in hgnc_id:
            up_id = get_uniprot_id(candidate)
            if up_id is not None:
                return up_id
        return None

    logger.info('Generating generator')
    corr_iterator = corr_matrix_to_generator(z_sc)
    res = {
        'agA_hgnc': [],
        'agA_up': [],
        'agB_hgnc': [],
        'agB_up': [],
        'z_sc': [],
        'has_pathways': [],
        'common_pathways': [],
    }
    logger.info('Looping correlations')
    for a, b, corr in corr_iterator:
        a_up = _first_uniprot_id(a)
        if a_up is None:
            continue
        b_up = _first_uniprot_id(b)
        if b_up is None:
            continue

        common_reactome = set(reactome_dict.get(a_up, [])) & \
            set(reactome_dict.get(b_up, []))
        res['agA_hgnc'].append(a)
        res['agA_up'].append(a_up)
        res['agB_hgnc'].append(b)
        res['agB_up'].append(b_up)
        res['z_sc'].append(corr)
        res['common_pathways'].append(common_reactome)
        res['has_pathways'].append(bool(common_reactome))
    logger.info('Returning results')
    return res
def read_phosphosite(fname):
    """Build Phosphorylation statements and an antibody map from a CSV file.

    Parameters
    ----------
    fname : str
        Path of a CSV file read with ``pandas.read_csv``. The columns
        ``SUB_ID``, ``SUB_GENE``, ``Actual_site``, ``phosphosite``,
        ``KIN_ID`` and ``KINASE_GENE_SYMBOL`` are accessed.

    Returns
    -------
    statements : list
        One ``Phosphorylation`` statement per row whose kinase is kept.
    antibody_map : dict
        Maps each ``phosphosite`` value to a list of substrate Agents
        carrying the phosphorylation ModCondition, without duplicates
        (same name, residue and position).
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}

    for _, row in df.iterrows():
        # Ground the substrate, preferring the UniProt ID when it is given.
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            # Fixed: this branch used to assign `sub_hgnc_id`, leaving
            # `sub_hgnc` undefined or stale from the previous row.
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # First character is the residue, the remainder is the position.
        residue = row['Actual_site'][0]
        if len(row['Actual_site']) > 1:
            position = row['Actual_site'][1:]
        else:
            position = None

        sub_readout = deepcopy(sub)
        mc = ModCondition('phosphorylation', residue, position)
        sub_readout.mods = [mc]

        ps = row['phosphosite']
        if ps in antibody_map:
            found = any(
                p.name == sub.name and
                p.mods[0].residue == residue and
                p.mods[0].position == position
                for p in antibody_map[ps]
            )
            if not found:
                antibody_map[ps].append(sub_readout)
        else:
            antibody_map[ps] = [sub_readout]

        # Ground the kinase; rows with a non-human kinase contribute to the
        # antibody map above but produce no statement.
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            # Fixed: same variable mix-up as for the substrate
            # (`kin_hgnc_id` was assigned, `kin_hgnc` was read).
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc})

        ev = Evidence(source_api='phosphosite')
        st = Phosphorylation(kin, sub, residue, position, evidence=[ev])
        statements.append(st)
    return statements, antibody_map
def _hgncsym2up(hgnc_symb: str) -> str:
    """Return a UniProt ID for an HGNC symbol.

    The symbol is resolved with ``get_current_hgnc_id``. When that yields a
    list, the candidates are tried in order and the first one with a
    non-None UniProt ID wins; None is returned if no candidate maps.
    """
    current = get_current_hgnc_id(hgnc_symb)
    if not isinstance(current, list):
        return get_uniprot_id(current)
    for candidate in current:
        mapped = get_uniprot_id(candidate)
        if mapped is not None:
            return mapped
    return None
def _get_upid_from_hgnc_symbol(hgnc_gene: str) -> Union[str, None]:
    """Return a UniProt ID for an HGNC symbol, or None if none is found.

    Parameters
    ----------
    hgnc_gene : str
        Gene symbol passed to ``get_current_hgnc_id``.

    Returns
    -------
    str or None
        The UniProt ID of the resolved HGNC ID. When the symbol resolves to
        a list of candidate HGNC IDs, the first candidate with a non-None
        UniProt ID is used; None is returned if no candidate maps.
    """
    hgnc_id = get_current_hgnc_id(hgnc_gene)
    if not isinstance(hgnc_id, list):
        return get_uniprot_id(hgnc_id)
    # The previous `while True` loop only broke on IndexError, so it never
    # terminated once a candidate did map to a UniProt ID.
    for candidate in hgnc_id:
        up_id = get_uniprot_id(candidate)
        if up_id is not None:
            return up_id
    return None
def _get_db_refs(bpe):
    """Return a db_refs dict for a BioPAX entity.

    Proteins get HGNC and UniProt entries, with a missing one derived from
    the other where possible. Small molecules get a ChEBI entry. Any other
    entity gets whichever of ChEBI, HGNC and UniProt is available. Only
    non-None IDs are stored.
    """
    if _is_protein(bpe):
        hgnc = BiopaxProcessor._get_hgnc_id(bpe)
        up = BiopaxProcessor._get_uniprot_id(bpe)
        # Fill in whichever of the two identifiers is missing
        if hgnc and not up:
            up = hgnc_client.get_uniprot_id(hgnc)
        if up and not hgnc and uniprot_client.is_human(up):
            symbol = uniprot_client.get_gene_name(up, False)
            if symbol:
                hgnc = hgnc_client.get_hgnc_id(symbol)
        candidates = (('HGNC', hgnc), ('UP', up))
    elif _is_small_molecule(bpe):
        candidates = (('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),)
    else:
        candidates = (
            ('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
            ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
            ('UP', BiopaxProcessor._get_uniprot_id(bpe)),
        )

    refs = {}
    for namespace, identifier in candidates:
        if identifier is not None:
            refs[namespace] = identifier
    return refs
def agent_from_gene_name(name):
    """Return a grounded Agent based on a gene name."""
    agent = Agent(name)
    hgnc_id = hgnc_client.get_hgnc_id(name)
    # Both keys are always set, even when a lookup returned nothing.
    grounding = {
        'HGNC': hgnc_id,
        'UP': hgnc_client.get_uniprot_id(hgnc_id),
    }
    agent.db_refs = grounding
    return agent
def get_mappings() -> Iterable[PredictionTuple]:
    """Iterate high-confidence lexical mappings between MeSH and UniProt human proteins."""
    url = get_script_url(__file__)
    confidence = 0.999
    match_type = "skos:exactMatch"
    mapping_type = "lexical"
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        matched = MESH_PROTEIN_RE.match(mesh_name)
        if matched:
            gene_name = matched.group(1)
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
                # Skip genes without a UniProt ID and those whose entry
                # contains a comma.
                if uniprot_id and "," not in uniprot_id:
                    yield PredictionTuple(
                        "mesh",
                        mesh_id,
                        mesh_name,
                        match_type,
                        "uniprot",
                        uniprot_id,
                        gene_name,
                        mapping_type,
                        confidence,
                        url,
                    )
def agent_from_gene_name(gene_name):
    """Return an Agent based on a gene name."""
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    # Both references are stored as returned by the lookups.
    db_refs = {
        'HGNC': hgnc_id,
        'UP': hgnc_client.get_uniprot_id(hgnc_id),
    }
    return Agent(gene_name, db_refs=db_refs)
def update_kinases():
    """Download the UniProt kinase table and append extra kinases.

    The table is saved to ``kinases.tsv`` under the module-level ``path``,
    extended with one row per gene in a fixed list of additional kinases,
    and written back to the same file as tab-separated text.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = [
        'PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2', 'PDK3',
        'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE'
    ]
    df = pandas.read_csv(fname, sep='\t')

    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({
            'Entry': up_id,
            'Gene names (primary )': kinase,
            'Organism ID': '9606',
            'Entry name': up_mnemonic,
        })
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; a single concat gives the same rows on old and new
    # pandas versions.
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
    """Reconcile HGNC/UniProt grounding-map entries and assign them to an Agent.

    Parameters
    ----------
    agent
        Object whose ``db_refs`` (and optionally ``name``) is overwritten.
    map_db_refs : dict
        Grounding-map entry; updated in place. Its ``'HGNC'`` value is
        treated as a gene symbol on input and replaced with an HGNC ID.
    do_rename : bool
        If True, set ``agent.name`` to the FPLX ID, the HGNC symbol or the
        UniProt gene name, in that order of preference.

    Raises
    ------
    ValueError
        If only an HGNC symbol is given and it has no HGNC ID, or if both
        references are given and the UniProt gene name is missing or
        differs from the HGNC symbol.
    """
    uniprot_ref = map_db_refs.get('UP')
    symbol_ref = map_db_refs.get('HGNC')
    gene_name = None

    if uniprot_ref and symbol_ref:
        # Both given: the UniProt gene name has to agree with the symbol
        gene_name = uniprot_client.get_gene_name(uniprot_ref)
        if not gene_name:
            raise ValueError('No gene name found for Uniprot '
                             'ID %s (expected %s)' %
                             (uniprot_ref, symbol_ref))
        if gene_name != symbol_ref:
            raise ValueError('Gene name %s for Uniprot ID '
                             '%s does not match HGNC '
                             'symbol %s given in grounding '
                             'map.' %
                             (gene_name, uniprot_ref, symbol_ref))
        resolved_id = hgnc_client.get_hgnc_id(symbol_ref)
        if resolved_id:
            map_db_refs['HGNC'] = resolved_id
        else:
            logger.error('No HGNC ID corresponding to gene '
                         'symbol %s in grounding map.' % symbol_ref)
    elif uniprot_ref:
        # Only UniProt given: try to derive the HGNC ID from the gene name
        gene_name = uniprot_client.get_gene_name(uniprot_ref, False)
        if gene_name:
            resolved_id = hgnc_client.get_hgnc_id(gene_name)
            if resolved_id:
                map_db_refs['HGNC'] = resolved_id
    elif symbol_ref:
        # Only the symbol given: swap it for the HGNC ID, then add UniProt
        resolved_id = hgnc_client.get_hgnc_id(symbol_ref)
        if not resolved_id:
            raise ValueError('No HGNC ID corresponding to gene '
                             'symbol %s in grounding map.' % symbol_ref)
        map_db_refs['HGNC'] = resolved_id
        derived_up = hgnc_client.get_uniprot_id(resolved_id)
        if derived_up:
            map_db_refs['UP'] = derived_up

    # The Agent takes over the (possibly updated) grounding map entry
    agent.db_refs = map_db_refs

    if not do_rename:
        return
    if agent.db_refs.get('FPLX'):
        agent.name = agent.db_refs.get('FPLX')
    elif symbol_ref is not None:
        agent.name = symbol_ref
    elif gene_name is not None:
        agent.name = gene_name
    return
def _make_db_refs(self, entrez_id, text_id):
    """Build the grounding dictionary for an Entrez gene.

    Parameters
    ----------
    entrez_id : str
        Entrez gene ID.
    text_id : str or None
        A plain text systematic name, or None if not listed in the
        Biogrid data. The placeholder ``'-'`` is treated like None.

    Returns
    -------
    hgnc_name : str
        HGNC name looked up for the gene's HGNC ID.
    db_refs : dict
        db_refs grounding dictionary, used when constructing the Agent
        object. Holds TEXT, HGNC and UP entries where available.
    """
    db_refs = {}
    if text_id is not None and text_id != '-':
        db_refs['TEXT'] = text_id

    hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
    hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
    if hgnc_id is None:
        return (hgnc_name, db_refs)

    db_refs['HGNC'] = hgnc_id
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if up_id is not None:
        db_refs['UP'] = up_id
    return (hgnc_name, db_refs)
def _add_node(self, agent):
    """Return the node ID for an Agent, creating the node if needed.

    Nodes are keyed by the Agent's name. A new node records the Agent's
    db_refs and one member entry per HGNC child returned by the expander.
    """
    key = agent.name
    known_id = self._existing_nodes.get(key)
    if known_id is not None:
        return known_id

    db_refs = _get_db_refs(agent)
    new_id = self._get_new_id()
    self._existing_nodes[key] = new_id
    display_name = agent.name.replace('_', ' ')

    members = {}
    for child in expander.get_children(agent, ns_filter='HGNC'):
        symbol = child[1]
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        child_refs = {}
        if hgnc_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            child_agent = Agent(symbol,
                                db_refs={'HGNC': hgnc_id, 'UP': up_id})
            child_refs = _get_db_refs(child_agent)
        members[symbol] = {
            'mutation': None,
            'expression': None,
            'db_refs': child_refs,
        }

    self._nodes.append({
        'data': {
            'id': new_id,
            'name': display_name,
            'db_refs': db_refs,
            'parent': '',
            'members': members,
        }
    })
    return new_id
def _get_up_id(hgnc_id):
    """Return the single UniProt ID for an HGNC ID, or None.

    The HGNC ID is converted to a string before the lookup. None is
    returned when there is no UniProt ID (this is logged) or when the
    result contains a comma.
    """
    hgnc_key = str(hgnc_id)
    up_id = hgnc_client.get_uniprot_id(hgnc_key)
    if up_id:
        return None if ',' in up_id else up_id
    logger.info("No Uniprot ID for HGNC ID %s" % hgnc_key)
    return None
def get_target_agent(target):
    """Return an Agent named `target` with HGNC and UP references."""
    hgnc_id = hgnc_client.get_hgnc_id(target)
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    # Both keys are always present, whatever the lookups returned.
    return Agent(target, db_refs={'HGNC': hgnc_id, 'UP': up_id})
def _get_agent_from_gene_name(gene_name):
    """Return an Agent for a gene name with whatever grounding is found.

    An HGNC entry is added when the name resolves to an HGNC ID, and a UP
    entry when that ID in turn maps to a UniProt ID.
    """
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    if not hgnc_id:
        return Agent(gene_name, db_refs={})

    grounding = {'HGNC': hgnc_id}
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if up_id:
        grounding['UP'] = up_id
    return Agent(gene_name, db_refs=grounding)
def get_agent(raw_name, entrez_id):
    """Return an Agent grounded through an Entrez gene ID.

    Parameters
    ----------
    raw_name : str
        Name as given in the source; always stored as the TEXT reference
        and used as the Agent name when no HGNC ID is found.
    entrez_id : str
        Entrez gene ID used to look up the HGNC ID.

    Returns
    -------
    Agent
        Named after the HGNC name when an HGNC ID is found. Its db_refs
        hold TEXT, plus HGNC and (when available) UP.
    """
    db_refs = {'TEXT': raw_name}
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    hgnc_id = hgc.get_hgnc_from_entrez(entrez_id)
    if hgnc_id is None:
        return Agent(raw_name, db_refs=db_refs)

    # Keep the HGNC ID that was just resolved; it was previously dropped
    # even though the Agent's name and UniProt ID are derived from it.
    db_refs['HGNC'] = hgnc_id
    # Only record a UniProt reference that exists, rather than storing a
    # None value in db_refs.
    up_id = hgc.get_uniprot_id(hgnc_id)
    if up_id is not None:
        db_refs['UP'] = up_id
    name = hgc.get_hgnc_name(hgnc_id)
    return Agent(name, db_refs=db_refs)
def _fix_agent(agent):
    """Normalize the name spaces, grounding and name of an Agent in place.

    Does nothing for None. Otherwise the FA, IPR and XFAM name spaces are
    renamed to NXPFA, IP and PF (the XFAM ID is cut at the first '.'), a BE
    entry is looked up from the other references, and the Agent is renamed
    from BE, HGNC or UniProt, in that order of precedence.
    """
    if agent is None:
        return

    # Rename legacy name spaces; iterate the original dict while editing
    # a copy.
    ns_renames = {'FA': 'NXPFA', 'IPR': 'IP', 'XFAM': 'PF'}
    fixed_refs = copy(agent.db_refs)
    for ns, ref_id in agent.db_refs.items():
        new_ns = ns_renames.get(ns)
        if new_ns is None:
            continue
        fixed_refs.pop(ns, None)
        fixed_refs[new_ns] = ref_id.split('.')[0] if ns == 'XFAM' else ref_id
    agent.db_refs = fixed_refs

    # Use an existing BE entry, else the first reference that maps to one
    be_id = agent.db_refs.get('BE')
    if not be_id:
        for ns_and_id in agent.db_refs.items():
            be_id = bioentities_map.get(ns_and_id)
            if be_id:
                break

    # Without a BE entry, NCIT may still map to a more specific reference
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]

    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')

    if be_id:
        agent.db_refs['BE'] = be_id
        agent.name = be_id
        return

    if hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
        return

    if up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
def _extract_protein(self, name, gene_id):
    """Return an Agent for a protein given its name and Entrez gene ID.

    The Entrez ID is always stored under EGID. When it maps to an HGNC ID,
    the HGNC and (if available) UP references are added and the Agent is
    named after the HGNC name instead of `name`.
    """
    db_refs = {'EGID': gene_id}
    hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
    if hgnc_id is None:
        return Agent(name, db_refs=db_refs)

    db_refs['HGNC'] = hgnc_id
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if up_id:
        db_refs['UP'] = up_id
    # With an HGNC ID at hand the standardized gene name replaces `name`
    standard_name = hgnc_client.get_hgnc_name(hgnc_id)
    return Agent(standard_name, db_refs=db_refs)
def _get_db_refs(bpe):
    """Return a db_refs dict for a BioPAX entity.

    Proteins and RNAs get HGNC and UniProt entries, with a missing one
    derived from the other where possible. If both are given and the
    UniProt ID derived from HGNC differs, the mismatch is only logged and
    the given UniProt ID is kept. Small molecules get a ChEBI entry. Any
    other entity gets whichever of ChEBI, HGNC and UniProt is available.
    """
    refs = {}

    def _store(namespace, identifier):
        # Only references that actually exist are recorded
        if identifier is not None:
            refs[namespace] = identifier

    if _is_protein(bpe) or _is_rna(bpe):
        hgnc = BiopaxProcessor._get_hgnc_id(bpe)
        up = BiopaxProcessor._get_uniprot_id(bpe)
        if hgnc and up:
            expected_up = hgnc_client.get_uniprot_id(hgnc)
            if expected_up != up:
                logger.info('Uniprot ID %s does not match %s obtained '
                            'from HGNC ID %s' % (up, expected_up, hgnc))
        elif hgnc:
            up = hgnc_client.get_uniprot_id(hgnc)
        elif up:
            if uniprot_client.is_human(up):
                symbol = uniprot_client.get_gene_name(up, False)
                if symbol:
                    hgnc = hgnc_client.get_hgnc_id(symbol)
        _store('HGNC', hgnc)
        _store('UP', up)
    elif _is_small_molecule(bpe):
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
    else:
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
        _store('HGNC', BiopaxProcessor._get_hgnc_id(bpe))
        _store('UP', BiopaxProcessor._get_uniprot_id(bpe))
    return refs
def run_msa(gene_dict, rs_data, problems):
    """Compare RefSeq sequences to UniProt and align those that differ.

    Parameters
    ----------
    gene_dict : dict
        Maps each gene symbol to an iterable of RefSeq IDs.
    rs_data : dict
        Maps a RefSeq ID to a ``(fasta_header, sequence)`` pair.
    problems : set
        Receives ``(rs_id, 'no sequence in Refseq')`` for every RefSeq ID
        without an entry in `rs_data`.

    Returns
    -------
    dict
        Maps each RefSeq ID with a sequence to
        ``(gene_sym, True, None)`` when it equals the gene's UniProt
        sequence, else ``(gene_sym, False, out_file)`` where `out_file` is
        the alignment output path for that gene.
    """
    aln_data = {}
    total_genes = len(gene_dict)
    for counter, (gene_sym, rs_ids) in enumerate(gene_dict.items(), start=1):
        print("%s: %d of %d genes" % (gene_sym, counter, total_genes))

        # Get the main Uniprot sequence from the gene symbol
        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        up_sequence = uniprot_client.get_sequence(up_id_main)
        fasta_lines = ['>%s\n' % gene_sym, '%s\n' % up_sequence]

        # The filenames to use if we do an alignment
        in_file = 'aln/in/%s.fasta' % gene_sym
        out_file = 'aln/out/%s.fasta' % gene_sym

        seq_ids = []
        # Whether the most recently collected sequence equals the UniProt
        # one; tracked explicitly rather than reading the loop variable
        # after the loop has finished.
        last_matches = False
        for rs_id in rs_ids:
            seq_info = rs_data.get(rs_id)
            if not seq_info:
                problems.add((rs_id, 'no sequence in Refseq'))
                continue
            seq_ids.append(rs_id)
            _, sequence = seq_info
            fasta_lines.append('>%s\n%s\n' % (rs_id, sequence))
            last_matches = sequence == up_sequence
            if last_matches:
                aln_data[rs_id] = (gene_sym, True, None)
            else:
                aln_data[rs_id] = (gene_sym, False, out_file)

        if not seq_ids:
            continue
        if len(seq_ids) == 1 and last_matches:
            print("\tAll sequences match, no alignment needed.")
            continue

        # Write the fasta file
        with open(in_file, 'wt') as f:
            f.writelines(fasta_lines)
        # Run the sequence alignment
        print("\tRunning sequence alignment.")
        subprocess.call(['./clustal-omega-1.2.3-macosx', '-i', in_file,
                         '-o', out_file, '--force'])
    return aln_data
def get_grounded_agent(gene_name):
    """Return a grounded Agent based on an HGNC symbol."""
    # TEXT keeps the name as given, before any remapping
    grounding = {'TEXT': gene_name}
    symbol = hgnc_map[gene_name] if gene_name in hgnc_map else gene_name
    hgnc_id = hgnc_client.get_hgnc_id(symbol)
    if hgnc_id:
        grounding['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            grounding['UP'] = up_id
    return Agent(symbol, db_refs=grounding)
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent named `ag_id` and grounded in name space `ag_ns`.

    For the HGNC name space, `ag_id` is looked up with
    ``hgnc_client.get_hgnc_id`` and the resulting HGNC ID and UniProt ID
    are stored when found. For any other name space a non-None `ag_id` is
    stored as is. TEXT always holds `ag_id`.
    """
    grounding = {'TEXT': ag_id}
    if ag_ns != 'HGNC':
        if ag_id is not None:
            grounding[ag_ns] = ag_id
        return Agent(ag_id, db_refs=grounding)

    hgnc_id = hgnc_client.get_hgnc_id(ag_id)
    if hgnc_id is not None:
        grounding['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            grounding['UP'] = up_id
    return Agent(ag_id, db_refs=grounding)
def get_mappings():
    """Yield lexical MeSH-to-UniProt mappings for human proteins.

    A MeSH name of the form "<gene> protein, human" is mapped when the
    gene part resolves to an HGNC ID that has a UniProt ID. Each result is
    a 9-tuple ending in the mapping type and the script URL.
    """
    url = get_script_url()
    match_type = 'skos:exactMatch'
    mapping_type = 'lexical'
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        found = re.match(r'^(.+) protein, human$', mesh_name)
        if not found:
            continue
        gene_name = found.group(1)
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if not hgnc_id:
            continue
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not uniprot_id:
            continue
        yield ('mesh', mesh_id, mesh_name, match_type, 'uniprot',
               uniprot_id, gene_name, mapping_type, url)
def get_gene_agents(gene_names):
    """Return a list of Agents for the gene names that have an HGNC ID.

    Names without an HGNC ID are skipped with a warning. A UP reference is
    added when the HGNC ID maps to a UniProt ID.
    """
    agents = []
    for symbol in gene_names:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id:
            grounding = {'HGNC': hgnc_id}
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                grounding['UP'] = up_id
            agents.append(Agent(symbol, db_refs=grounding))
        else:
            logger.warning('Invalid HGNC gene symbol: %s' % symbol)
    return agents
def _refs_from_hgnc_id(hgnc_id):
    """Return HGNC_SYMBOL, HGNC and UP references for an HGNC ID.

    Returns None, after logging a warning, when either the HGNC name or
    the UniProt ID cannot be found.
    """
    symbol = hgnc_client.get_hgnc_name(hgnc_id)
    if not symbol:
        logger.warning('Could not get HGNC name for ID %s' % hgnc_id)
        return None
    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if not up_id:
        logger.warning('Could not get UniProt ID for HGNC ID %s' % hgnc_id)
        return None
    return {'HGNC_SYMBOL': symbol, 'HGNC': hgnc_id, 'UP': up_id}
def normalize_mutation_count(gene_name, num_muts):
    """Return the mutation count of a gene divided by its protein length.

    The length is looked up through the gene's UniProt ID. A default
    length of 500 is used, with a warning, when the UniProt ID or the
    length is unavailable.
    """
    default_length = 500  # a guess at a default
    hgnc_id = get_hgnc_id(gene_name)
    up_id = get_uniprot_id(hgnc_id)
    if up_id:
        length = uniprot_client.get_length(up_id)
        if not length:
            logger.warning("Could not get length for Uniprot "
                           "ID %s" % up_id)
            length = default_length
    else:
        logger.warning("Could not get Uniprot ID for HGNC symbol %s "
                       "with HGNC ID %s" % (gene_name, hgnc_id))
        length = default_length
    return num_muts / float(length)
def get_db_refs(egid):
    """Return the HGNC name and HGNC/UP references for an Entrez gene ID.

    Returns ``(None, {})``, after logging, as soon as the HGNC ID, the
    HGNC name or the UniProt ID cannot be found.
    """
    not_found = (None, {})

    hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
    if not hgnc_id:
        logger.info("No HGNC ID for Entrez ID: %s" % egid)
        return not_found

    symbol = hgnc_client.get_hgnc_name(hgnc_id)
    if not symbol:
        logger.info("No HGNC name for HGNC ID: %s" % hgnc_id)
        return not_found

    up_id = hgnc_client.get_uniprot_id(hgnc_id)
    if not up_id:
        logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                    "%s / %s / %s" % (egid, hgnc_id, symbol))
        return not_found

    return (symbol, {'HGNC': hgnc_id, 'UP': up_id})
def get_mutated_genes(self):
    """Return raw and length-normalized gene mutation counts.

    Counts are loaded from the JSON file at ``self.mutation_cache`` when
    that is set; otherwise they are tallied from cBio profile data for the
    configured TCGA studies, in gene batches of 200. Both results are also
    stored on the instance.

    Returns
    -------
    tuple
        ``(self.mutations, self.norm_mutations)``: the count per gene and
        the count divided by protein length (500 when unknown).
    """
    if self.mutation_cache:
        logger.info('Loading mutations from %s' % self.mutation_cache)
        with open(self.mutation_cache, 'r') as fh:
            self.mutations = json.load(fh)
    else:
        logger.info('Getting mutations from cBio web service')
        tally = {}
        for study in tcga_studies[self.tcga_study_prefix]:
            batches = batch_iter(hgnc_ids.keys(), 200)
            for idx, gene_batch in enumerate(batches):
                logger.info('Fetching mutations for %s and gene batch %s' %
                            (study, idx))
                per_patient = cbio_client.get_profile_data(
                    study, gene_batch, 'mutation')
                # e.g. 'ICGC_0002_TD': {'BRAF': None, 'KRAS': 'G12D'}
                for gene_status in per_patient.values():
                    # A None value means the gene is not mutated
                    for gene, mutated in gene_status.items():
                        if mutated is None:
                            continue
                        tally[gene] = tally.get(gene, 0) + 1
        self.mutations = tally

    # Normalize mutations by length
    default_length = 500  # a guess at a default
    self.norm_mutations = {}
    for gene_name, num_muts in self.mutations.items():
        hgnc_id = get_hgnc_id(gene_name)
        up_id = get_uniprot_id(hgnc_id)
        if up_id:
            length = uniprot_client.get_length(up_id)
            if not length:
                logger.warning("Could not get length for Uniprot "
                               "ID %s" % up_id)
                length = default_length
        else:
            logger.warning("Could not get Uniprot ID for HGNC symbol %s "
                           "with HGNC ID %s" % (gene_name, hgnc_id))
            length = default_length
        self.norm_mutations[gene_name] = num_muts / float(length)

    return self.mutations, self.norm_mutations
def _add_node(self, agent, uuid=None):
    """Return the node ID for an Agent, creating the node if needed.

    Nodes are keyed by the Agent's name. For an existing node only `uuid`
    is added to its ``uuid_list`` (if not already there). A new node
    records the Agent's db_refs, one member entry per HGNC child returned
    by the expander, and a ``uuid_list`` holding `uuid`.
    """
    key = agent.name
    known_id = self._existing_nodes.get(key)
    if known_id is not None:
        # The node exists already: just make sure it carries this uuid
        matching = [nd for nd in self._nodes if nd['data']['id'] == known_id]
        seen_uuids = matching[0]['data']['uuid_list']
        if uuid not in seen_uuids:
            seen_uuids.append(uuid)
        return known_id

    db_refs = _get_db_refs(agent)
    new_id = self._get_new_id()
    self._existing_nodes[key] = new_id
    display_name = agent.name.replace('_', ' ')

    members = {}
    for child in expander.get_children(agent, ns_filter='HGNC'):
        symbol = child[1]
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        child_refs = {}
        if hgnc_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            child_agent = Agent(symbol,
                                db_refs={'HGNC': hgnc_id, 'UP': up_id})
            child_refs = _get_db_refs(child_agent)
        members[symbol] = {'db_refs': child_refs}

    data = {
        'id': new_id,
        'name': display_name,
        'db_refs': db_refs,
        'parent': '',
        'members': members,
        'uuid_list': [uuid],
    }
    self._nodes.append({'data': data})
    return new_id
def map_hgnc_symbols(hgnc_symbols):
    """Return references based on a list of HGNC symbols.

    Each result is a dict with HGNC_SYMBOL, HGNC and UP keys. Symbols
    lacking an HGNC ID or a UniProt ID are left out with a warning.
    """
    refs = []
    for symbol in hgnc_symbols:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if not hgnc_id:
            logger.warning('Could not get HGNC ID for symbol %s' % symbol)
            continue
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if not up_id:
            logger.warning('Could not get UniProt ID for symbol %s' % symbol)
            continue
        refs.append({'HGNC_SYMBOL': symbol, 'HGNC': hgnc_id, 'UP': up_id})
    return refs
def _initialize_node_agents(self):
    """Initialize internal dicts containing node information.

    For every CX node, ``self._node_names`` and ``self._node_agents`` are
    filled in, keyed by the node's ``@id``. Nodes with a UniProt alias are
    grounded through UniProt; the others through their name as an HGNC
    symbol. Names without an HGNC ID only get an Agent when
    ``self.require_grounding`` is false.
    """
    ungrounded = []
    for node in _get_dict_from_list('nodes', self.cx):
        node_id = node['@id']
        aliases = self.get_aliases(node)
        node_name = node['n']
        up_id = aliases.get('UP')

        if up_id:
            # Prefer the HGNC name, fall back to the UniProt gene name
            db_refs = {'UP': up_id, 'TEXT': node_name}
            hgnc_id = uniprot_client.get_hgnc_id(up_id)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
                gene_name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(up_id)
            agent = Agent(gene_name, db_refs=db_refs)
            self._node_names[node_id] = gene_name
            self._node_agents[node_id] = agent
            continue

        self._node_names[node_id] = node_name
        hgnc_id = hgnc_client.get_hgnc_id(node_name)
        db_refs = {'TEXT': node_name}
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
        else:
            if not self.require_grounding:
                self._node_agents[node_id] = \
                    Agent(node_name, db_refs=db_refs)
            ungrounded.append(node_name)

    if ungrounded:
        verb = 'Skipped' if self.require_grounding else 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(ungrounded)))
def get_phospho_antibody_map(fname=antibody_map_file):
    """Return a map from phosphosite labels to phosphorylated Agents.

    Parameters
    ----------
    fname : str
        Path of a comma-separated, UTF-8 file read with
        ``pandas.read_csv``. The columns ``phosphosite``, ``SUB_ID``,
        ``SUB_GENE`` and ``Actual_site`` are accessed.

    Returns
    -------
    dict
        Maps each ``phosphosite`` value to a list of substrate Agents
        carrying a phosphorylation ModCondition, without duplicates (same
        name, residue and position).
    """
    # First gather the annotations for the phosphosites
    df = pandas.read_csv(fname, index_col=None, sep=',', encoding='utf8')
    antibody_map = {}

    for _, row in df.iterrows():
        ps = row['phosphosite']
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            # Keep only the part of the ID before the first '-'
            if sub_upid.find('-') != -1:
                sub_upid = sub_upid.split('-')[0]
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            # Fixed: this branch used to assign `sub_hgnc_id`, leaving
            # `sub_hgnc` undefined or stale from the previous row.
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
            if sub_upid is None:
                continue
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # First character is the residue, the remainder is the position.
        residue = row['Actual_site'][0]
        if len(row['Actual_site']) > 1:
            position = row['Actual_site'][1:]
        else:
            position = None
        mc = ModCondition('phosphorylation', residue, position)
        sub.mods = [mc]

        if ps in antibody_map:
            found = any(
                p.name == sub.name and
                p.mods[0].residue == residue and
                p.mods[0].position == position
                for p in antibody_map[ps]
            )
            if not found:
                antibody_map[ps].append(sub)
        else:
            antibody_map[ps] = [sub]
    return antibody_map
def get_genes_to_refseq_ids(problems):
    """Return a dict mapping gene symbols to sets of RefSeq IDs.

    Site IDs of the form ``<gene>.<refseq>:<site>`` are read from the
    first column of the tab-separated ``peptide_file``. A gene is only
    admitted when it has an HGNC ID and a UniProt ID free of ``', '``;
    otherwise ``(refseq_id, reason)`` is added to `problems`.
    """
    # First, collect refseq IDs for each gene
    gene_dict = {}
    rows = read_unicode_csv(peptide_file, delimiter='\t', skiprows=1)
    for row in rows:
        gene_sym, remainder = row[0].split('.', maxsplit=1)
        refseq_id, _site_info = remainder.split(':')

        if gene_sym in gene_dict:
            gene_dict[gene_sym].add(refseq_id)
            continue

        # First time this gene is seen: check that it can be grounded
        hgnc_id = hgnc_client.get_hgnc_id(gene_sym)
        if not hgnc_id:
            problems.add((refseq_id, 'invalid gene symbol'))
            continue
        up_id_main = hgnc_client.get_uniprot_id(hgnc_id)
        if not up_id_main or ', ' in up_id_main:
            problems.add((refseq_id, 'could not get Uniprot ID from HGNC'))
            continue
        gene_dict[gene_sym] = {refseq_id}
    return gene_dict
def _get_uniprot_id(agent):
    """Get the Uniprot ID for an agent, looking up in HGNC if necessary.

    Returns None when the agent has neither a UP nor an HGNC reference, or
    when the HGNC lookup finds nothing. If the Uniprot ID is not a string
    but its first element is, that first element is returned.
    """
    refs = agent.db_refs
    up_id = refs.get('UP')
    hgnc_id = refs.get('HGNC')

    if up_id is None:
        # Without either reference there is nothing to look up
        if hgnc_id is None:
            return None
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is None:
            return None

    if isinstance(up_id, basestring):
        return up_id
    # Not a plain string: pick the first entry if that one is a string
    if isinstance(up_id[0], basestring):
        return up_id[0]
    return up_id
def update_kinases():
    """Download the UniProt kinase table and append extra kinases.

    The table is saved to ``kinases.tsv`` under the module-level ``path``,
    extended with one row per gene in a fixed list of additional kinases,
    and written back to the same file as tab-separated text.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')

    def _kinase_row(kinase):
        # Build the table row for one manually added kinase
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        return {'Entry': up_id, 'Gene names (primary )': kinase,
                'Organism ID': '9606', 'Entry name': up_mnemonic}

    # DataFrame.append no longer exists in pandas >= 2.0 (and re-copied the
    # frame per row); all extra rows are added with a single concat.
    extra = pandas.DataFrame([_kinase_row(k) for k in add_kinases])
    df = pandas.concat([df, extra], ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def _initialize_node_agents(self):
    """Initialize internal dicts containing node information.

    For every CX node, ``self._node_names`` and ``self._node_agents`` are
    filled in, keyed by the node's ``@id``. Nodes with a UniProt alias are
    named after the UniProt gene name; the others are grounded through
    their name as an HGNC symbol. Names without an HGNC ID only get an
    Agent when ``self.require_grounding`` is false.
    """
    ungrounded = []
    for node in _get_dict_from_list('nodes', self.cx):
        node_id = node['@id']
        up_id = self.get_aliases(node).get('UP')

        if up_id:
            gene_name = uniprot_client.get_gene_name(up_id)
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            agent = Agent(gene_name, db_refs={'UP': up_id,
                                              'HGNC': hgnc_id,
                                              'TEXT': gene_name})
            self._node_names[node_id] = gene_name
            self._node_agents[node_id] = agent
            continue

        node_name = node['n']
        self._node_names[node_id] = node_name
        hgnc_id = hgnc_client.get_hgnc_id(node_name)
        db_refs = {'TEXT': node_name}
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
        else:
            if not self.require_grounding:
                self._node_agents[node_id] = \
                    Agent(node_name, db_refs=db_refs)
            ungrounded.append(node_name)

    if ungrounded:
        verb = 'Skipped' if self.require_grounding else 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(ungrounded)))
def _add_node(self, agent, uuid=None):
    """Add a node for an Agent unless one exists, and return its ID.

    Nodes are looked up by the Agent's name. If the node is already
    present, `uuid` is appended to its ``uuid_list`` when missing and the
    existing ID is returned. Otherwise a node is created with the Agent's
    db_refs, the db_refs of its HGNC children as members, and a
    ``uuid_list`` containing `uuid`.
    """
    name_key = agent.name
    existing_id = self._existing_nodes.get(name_key)

    if existing_id is not None:
        # Do not add the node again, but remember the additional uuid
        hits = [entry for entry in self._nodes
                if entry['data']['id'] == existing_id]
        uuids = hits[0]['data']['uuid_list']
        if uuid not in uuids:
            uuids.append(uuid)
        return existing_id

    agent_refs = _get_db_refs(agent)
    fresh_id = self._get_new_id()
    self._existing_nodes[name_key] = fresh_id
    label = agent.name
    label = label.replace('_', ' ')

    member_refs = {}
    children = expander.get_children(agent, ns_filter='HGNC')
    for child in children:
        child_symbol = child[1]
        child_hgnc = hgnc_client.get_hgnc_id(child_symbol)
        if not child_hgnc:
            member_refs[child_symbol] = {'db_refs': {}}
            continue
        child_up = hgnc_client.get_uniprot_id(child_hgnc)
        child_agent = Agent(child_symbol,
                            db_refs={'HGNC': child_hgnc, 'UP': child_up})
        member_refs[child_symbol] = {'db_refs': _get_db_refs(child_agent)}

    node = {'data': {'id': fresh_id, 'name': label,
                     'db_refs': agent_refs, 'parent': '',
                     'members': member_refs, 'uuid_list': [uuid]}}
    self._nodes.append(node)
    return fresh_id
def test_get_uniprot_id():
    """HGNC ID 6840 is expected to map to UniProt ID Q02750."""
    expected = 'Q02750'
    assert hgnc_client.get_uniprot_id('6840') == expected
def _get_agent_from_entity(self, entity_id):
    """Return an Agent built from the entity frame with the given ID.

    Parameters
    ----------
    entity_id : str
        Frame ID of the entity, used to query ``self.tree``.

    Returns
    -------
    Agent or None
        None when the query yields no result or no matching frame.
        Otherwise an Agent whose name comes from the entity text (or from
        a UniProt/HGNC lookup), grounded with the entity's xrefs, and
        carrying its modifications and mutations.
    """
    qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % entity_id
    res = self.tree.execute(qstr)
    if res is None:
        return None
    try:
        entity_term = next(res)
    except StopIteration:
        logger.debug(' %s is not an entity' % entity_id)
        return None

    # This is the default name, which can be overwritten
    # below for specific database entries
    agent_name = self._get_valid_name(entity_term['text'])
    db_refs = {}
    for xr in entity_term['xrefs']:
        ns = xr['namespace']
        if ns == 'uniprot':
            up_id = xr['id']
            db_refs['UP'] = up_id
            # Look up official names in UniProt
            gene_name = up_client.get_gene_name(up_id)
            if gene_name is not None:
                agent_name = self._get_valid_name(gene_name)
                # If the gene name corresponds to an HGNC ID, add it to
                # the db_refs
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
        elif ns == 'hgnc':
            hgnc_id = xr['id']
            db_refs['HGNC'] = hgnc_id
            # Look up the standard gene symbol and set as name
            hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
            if hgnc_name:
                agent_name = hgnc_name
            # Look up the corresponding uniprot id
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'pfam':
            be_id = bioentities_map.get(('PF', xr['id']))
            if be_id:
                db_refs['BE'] = be_id
            db_refs['PF'] = xr['id']
        elif ns == 'interpro':
            be_id = bioentities_map.get(('IP', xr['id']))
            if be_id:
                db_refs['BE'] = be_id
            # Fixed: the InterPro ID was stored under 'PF' (copied from
            # the pfam branch), which mislabeled it and could overwrite a
            # real Pfam reference. It is looked up as 'IP' above, so it is
            # stored as 'IP' as well.
            db_refs['IP'] = xr['id']
        elif ns == 'chebi':
            db_refs['CHEBI'] = xr['id']
        elif ns == 'pubchem':
            db_refs['PUBCHEM'] = 'PUBCHEM:%s' % xr['id']
        elif ns == 'go':
            db_refs['GO'] = xr['id']
        elif ns == 'mesh':
            db_refs['MESH'] = xr['id']
        elif ns == 'hmdb':
            db_refs['HMDB'] = xr['id']
        elif ns == 'simple_chemical':
            if xr['id'].startswith('HMDB'):
                db_refs['HMDB'] = xr['id']
        elif ns == 'be':
            db_refs['BE'] = xr['id']
        # These name spaces are ignored
        elif ns in ['uaz']:
            pass
        else:
            logger.warning('Unhandled xref namespace: %s' % ns)

    db_refs['TEXT'] = entity_term['text']

    mod_terms = entity_term.get('modifications')
    mods = []
    muts = []
    if mod_terms is not None:
        for m in mod_terms:
            if m['type'].lower() == 'mutation':
                # Evidence is usually something like "V600E"
                # We could parse this to get the amino acid
                # change that happened.
                mutation_str = m.get('evidence')
                # TODO: sometimes mutation_str is "mutant", "Mutant",
                # "mutants" - this indicates that there is a mutation
                # but not the specific type. We should encode this
                # somehow as a "blank" mutation condition
                mut = self._parse_mutation(mutation_str)
                if mut is not None:
                    muts.append(mut)
            else:
                mc = self._get_mod_condition(m)
                if mc is not None:
                    mods.append(mc)

    agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
    return agent
def test_get_uniprot_id():
    """HGNC ID '6840' must resolve to UniProt ID 'Q02750' as unicode."""
    result = hgnc_client.get_uniprot_id('6840')
    assert result == 'Q02750'
    assert unicode_strs(result)
def test_get_uniprot_id_none():
    """An HGNC entry without a UniProt ID must yield None."""
    # This HGNC entry doesn't have a UniProt ID
    assert hgnc_client.get_uniprot_id('12027') is None
def _urn_to_db_refs(urn):
    """Convert a Medscan URN into INDRA grounding and a standard name.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        A dictionary mapping database names to identifiers. An empty
        dictionary is returned if `urn` is None; None is returned if the
        URN does not match the expected URN pattern.
    db_name : str or None
        The Famplex ID if a Famplex grounding was found; otherwise the GO
        label if there is a GO grounding; otherwise the name obtained
        while grounding (ChEBI, HGNC or MESH name), or None.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # CAS identifier: ground to CHEBI when a mapping exists
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # Entrez identifier: ground to HGNC, then UniProt via HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id
            # The HGNC name, when available, serves as the entity name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        looks_like_umls = urn_id.startswith('C') and urn_id[1:].isdigit()
        if looks_like_umls:
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is a (URL-quoted) MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ['agi-gocomplex', 'agi-go']:
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # A GO or MESH grounding may have a corresponding Famplex grounding
    for source_db in ['GO', 'MESH']:
        if source_db in db_refs:
            lookup = (source_db, db_refs[source_db])
            if lookup in famplex_map:
                db_refs['FPLX'] = famplex_map[lookup]

    # If the urn corresponds to an EC code, ground to Famplex if that code
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        lookup = ('ECCODE', urn.split(':')[2])
        if lookup in famplex_map:
            db_refs['FPLX'] = famplex_map[lookup]

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    lookup = ('MEDSCAN', urn)
    if lookup in famplex_map:
        db_refs['FPLX'] = famplex_map[lookup]

    # A Famplex grounding determines the name; failing that, a GO label
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
def _fix_agent(agent):
    """Standardize the name spaces, groundings and name of an Agent in place.

    Does nothing when `agent` is None. Otherwise it renames legacy name
    spaces in the Agent's db_refs, tries to find a FamPlex (FPLX) grounding,
    and sets the Agent's name from FPLX, HGNC or UniProt (in that order of
    precedence), adding the complementary HGNC/UP grounding when available.
    """
    if agent is None:
        return

    # First we fix some name spaces. Entries are popped and re-inserted one
    # at a time while iterating over the untouched original dict.
    plain_renames = {'FA': 'NXPFA', 'IPR': 'IP', 'PCID': 'PUBCHEM'}
    fixed_refs = copy(agent.db_refs)
    for ns, ident in agent.db_refs.items():
        if ns in plain_renames:
            fixed_refs.pop(ns, None)
            fixed_refs[plain_renames[ns]] = ident
        elif ns == 'XFAM':
            # Only the part before the first '.' is kept as the PF ID
            fixed_refs.pop('XFAM', None)
            fixed_refs['PF'] = ident.split('.')[0]
        elif ns == 'GO':
            # Make sure the GO ID carries the 'GO:' prefix
            fixed_refs['GO'] = ident if ident.startswith('GO:') \
                else 'GO:' + ident
    agent.db_refs = fixed_refs

    # Old BE mappings are stored as FPLX
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    fplx_id = agent.db_refs.get('FPLX')

    # Without a FPLX entry, take the first db_refs entry that maps to one
    if not fplx_id:
        for ref_pair in agent.db_refs.items():
            fplx_id = famplex_map.get(ref_pair)
            if fplx_id:
                break

    # Try mapping NCIT to specific genes if possible
    if not fplx_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]

    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')

    # FPLX takes precedence if we have it
    if fplx_id:
        agent.db_refs['FPLX'] = fplx_id
        agent.name = fplx_id
        return

    if hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
        return

    if up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if not gene_name:
            # If it doesn't have a gene name, it's better to just use the
            # raw string name; otherwise the name would be whatever Sparser
            # set, such as a Uniprot ID or mnemonic
            agent.name = agent.db_refs.get('TEXT', agent.name)
        else:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
def get_agent(node_data, node_modifier_data=None):
    """Build an Agent from the data of a BEL graph node.

    Parameters
    ----------
    node_data : dict
        Node data keyed by the constants in `pc` (function, name space,
        name, identifier, members, fusion).
    node_modifier_data : Optional[dict]
        Modifier data for the node; it is passed to the helpers that
        extract the activity condition, the translocation target and the
        unhandled-modifier check.

    Returns
    -------
    Agent or None
        The Agent for the node, or None if the node function is not
        handled, the node is a fusion, a complex has no members or a
        member that cannot be converted, no grounding could be determined,
        or the node has unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if no gene
        name is found for a UniProt identifier.
    ValueError
        If the node has an identifier in the MGI or RGD name space.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s", node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s"
                    % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other diagnostic in
            # this function, rather than printed to stdout
            logger.info("Unhandled namespace: %s: %s (%s)",
                        ns, name, node_data)
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            # Reported through the logger instead of stdout (see above)
            logger.info("Unhandled namespace with identifier: %s: %s (%s)",
                        ns, name, node_data)
    # db_refs is still None if no branch above produced a grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def _get_agent_from_ref(self, ref):
    """Return an Agent reconstructed from a ref element, or None.

    The element's 'uid' and 'raw-text' variables supply the grounding and
    the name. None is returned for refs of category 'collection' and when
    no name can be determined for the Agent.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        return None

    def _text_or_none(tag):
        # Text content of a tag, or None if the tag or its text is missing
        if tag is not None and tag.text:
            return tag.text
        return None

    # Find the name, uid and raw-text tags first and get their text
    # content if available
    uid_tag = ref.find("var/[@name='uid']")
    name_tag = ref.find("var/[@name='name']")
    text_tag = ref.find("var/[@name='raw-text']")
    name = _text_or_none(name_tag)
    uid = _text_or_none(uid_tag)
    raw_text = _text_or_none(text_tag)

    # TODO: factor this out and reuse fix_agents
    db_refs = {}
    # Save raw text if available
    if raw_text:
        db_refs['TEXT'] = raw_text
    agent_name = raw_text

    # If we have a proper UID (exactly one ':' separating name space and
    # ID) then we try to reconstruct an Agent from that
    uid_parts = uid.split(':') if uid is not None else []
    if len(uid_parts) == 2:
        db_ns, db_id = uid_parts
        be_id = famplex_map.get((db_ns, db_id))
        if be_id:
            db_refs[db_ns] = db_id
            db_refs['FPLX'] = be_id
            agent_name = be_id
        elif db_ns in ['UP', 'Uniprot']:
            db_refs['UP'] = db_id
            gene_name = uniprot_client.get_gene_name(db_id)
            if gene_name:
                agent_name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
        elif db_ns == 'NCIT':
            db_refs['NCIT'] = db_id
            target = ncit_map.get(db_id)
            if target:
                target_ns, target_id = target[0], target[1]
                db_refs[target_ns] = target_id
                if target_ns == 'HGNC':
                    up_id = hgnc_client.get_uniprot_id(target_id)
                    agent_name = hgnc_client.get_hgnc_name(target_id)
                    if up_id:
                        db_refs['UP'] = up_id
                elif target_ns == 'UP':
                    agent_name = uniprot_client.get_gene_name(target_id)
                    if agent_name:
                        hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                        if hgnc_id:
                            db_refs['HGNC'] = hgnc_id
        elif db_ns == 'FA':
            db_refs['NXP'] = 'FA:' + db_id
        elif db_ns == 'XFAM':
            db_refs['PF'] = db_id.split('.')[0]
        elif db_ns == 'CHEBI':
            db_refs['CHEBI'] = 'CHEBI:' + db_id
        elif db_ns in ['GO', 'MESH', 'FPLX']:
            db_refs[db_ns] = db_id
        # Handle old BE mappings and add them as FPLX
        elif db_ns == 'BE':
            db_refs['FPLX'] = db_id
        elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
            db_refs[db_ns] = db_id
        else:
            logger.warning('Unknown database name space %s' % db_ns)

    # Fall back on the raw text for the name; give up if there is none
    if not agent_name:
        if raw_text is None:
            return None
        agent_name = raw_text

    assert(agent_name)
    return Agent(agent_name, db_refs=db_refs)
def _make_agent(self, hprd_id, refseq_id=None):
    """Return an Agent for an HPRD ID, or None if it cannot be grounded.

    Parameters
    ----------
    hprd_id : str or None
        The HPRD ID to look up in ``self.id_df``. None and NaN yield None.
    refseq_id : Optional[str]
        A RefSeq protein ID used to try to select an isoform-specific
        UniProt ID. If not given, the RefSeq ID is read from the
        ``REFSEQ_PROTEIN`` column of ``self.id_df``.

    Returns
    -------
    Agent or None
        An Agent named by HGNC symbol with HGNC, UP, EGID and REFSEQ_PROT
        db_refs, or None if the HPRD ID is missing or unknown, has no
        Entrez ID, or cannot be mapped to HGNC or UniProt. Failures to map
        are recorded in the ``no_hgnc_for_egid`` / ``no_up_for_hgnc`` lists
        on ``self``.
    """
    # NaN is the only float that is not equal to itself. An identity check
    # against a `nan` object would miss NaN values created elsewhere
    # (e.g. float('nan')), so compare by value instead.
    if hprd_id is None or \
            (isinstance(hprd_id, float) and hprd_id != hprd_id):
        return None
    # Get the basic info (HGNC name/symbol, Entrez ID) from the
    # ID mappings dataframe
    try:
        egid = self.id_df.loc[hprd_id].EGID
    except KeyError:
        logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
        return None
    if not egid:
        logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
        return None
    # Get the HGNC ID
    hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
    # If we couldn't get an HGNC ID for the Entrez ID, this means that
    # the Entrez ID has been discontinued or replaced.
    if not hgnc_id:
        self.no_hgnc_for_egid.append(egid)
        return None
    # Get the (possibly updated) HGNC Symbol
    hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
    assert hgnc_name is not None
    # See if we can get a Uniprot ID from the HGNC symbol--if there is
    # a RefSeq ID we will also try to use it to get an isoform specific
    # UP ID, but we will have this one to fall back on. But if we can't
    # get one here, then we skip the Statement
    up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
    if not up_id_from_hgnc:
        self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
        return None
    # If we have provided the RefSeq ID, it's because we need to make
    # sure that we are getting the right isoform-specific ID (for sequence
    # positions of PTMs). Here we try to get the Uniprot ID from the
    # Refseq->UP mappings in the protmapper.uniprot_client.
    if refseq_id is not None:
        # Get the Uniprot IDs from the uniprot client
        up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                    reviewed_only=True)
        # Nothing for this RefSeq ID (quite likely because the RefSeq ID
        # is obsolete); take the UP ID from HGNC
        if len(up_ids) == 0:
            self.no_up_for_refseq.append(refseq_id)
            up_id = up_id_from_hgnc
        # More than one reviewed entry--no thanks, we'll take the one from
        # HGNC instead
        elif len(up_ids) > 1:
            self.many_ups_for_refseq.append(refseq_id)
            up_id = up_id_from_hgnc
        # We got a unique, reviewed UP entry for the RefSeq ID
        else:
            up_id = up_ids[0]
            # If it's the canonical isoform, strip off the '-1'
            if up_id.endswith('-1'):
                up_id = up_id.split('-')[0]
    # For completeness, get the Refseq ID from the HPRD ID table
    else:
        refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
        up_id = up_id_from_hgnc
    # Make db_refs, return Agent
    db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid,
               'REFSEQ_PROT': refseq_id}
    return Agent(hgnc_name, db_refs=db_refs)
def get_participant(agent):
    """Return a participant dictionary describing the given Agent.

    A missing Agent (None) is represented as a generic protein. Otherwise
    the dictionary holds the entity text, an identifier and entity type
    chosen from the Agent's HGNC/UP, CHEBI or PFAM-DEF groundings (an empty
    identifier of type 'protein' if none apply), and optional 'features' /
    'not_features' lists built from the Agent's bound conditions,
    modifications and mutations.
    """
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')

    refs = agent.db_refs
    text_name = refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {'entity_text': [text_name]}

    hgnc_id = refs.get('HGNC')
    uniprot_id = refs.get('UP')
    chebi_id = refs.get('CHEBI')
    pfam_def_ids = refs.get('PFAM-DEF')

    # If HGNC grounding is available, that is the first choice: the
    # UniProt ID looked up from it replaces the one in db_refs
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)

    if uniprot_id:
        mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        # Each '|'-separated entry is a 'DBNAME:DBID' pair
        pfam_def_list = [
            {dbname: dbid}
            for dbname, dbid in (entry.split(':')
                                 for entry in pfam_def_ids.split('|'))
        ]
        for pfam_def in pfam_def_list:
            # TODO: handle non-uniprot protein IDs here
            member_up_id = pfam_def.get('UP')
            if not member_up_id:
                continue
            mnemonic = str(uniprot_client.get_mnemonic(member_up_id))
            gene_name = uniprot_client.get_gene_name(member_up_id)
            if gene_name is None:
                gene_name = ""
            participant['entities'].append({
                'entity_text': [gene_name],
                'identifier': 'UNIPROT:%s' % mnemonic,
                'entity_type': 'protein',
            })
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features = []
    not_features = []

    # Binding features
    for bound in agent.bound_conditions:
        binding = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bound.agent.name],
                'identifier': '',
            },
        }
        (features if bound.is_bound else not_features).append(binding)

    # Modification features
    for mod in agent.mods:
        modification = {
            'feature_type': 'modification_feature',
            'modification_type': mod.mod_type.lower(),
        }
        if mod.position is not None:
            modification['location'] = int(mod.position)
        if mod.residue is not None:
            modification['aa_code'] = mod.residue
        (features if mod.is_modified else not_features).append(modification)

    # Mutation features
    for mutation_cond in agent.mutations:
        mutation = {'feature_type': 'mutation_feature'}
        if mutation_cond.residue_from is not None:
            mutation['from_aa'] = mutation_cond.residue_from
        if mutation_cond.residue_to is not None:
            mutation['to_aa'] = mutation_cond.residue_to
        if mutation_cond.position is not None:
            mutation['location'] = int(mutation_cond.position)
        features.append(mutation)

    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
import os
from urllib import request
from pybel import BELGraph
from pybel.dsl import *
from pybel.language import Entity
from pybel.io import from_json_file
from pybel.examples import egf_graph
from indra.statements import *
from indra.sources import bel
from indra.sources.bel import processor as pb
from indra.sources.bel.api import process_cbn_jgif_file, process_pybel_graph
from indra.databases import hgnc_client

# HGNC and UniProt IDs for the MAP2K1 gene symbol, looked up once at import
mek_hgnc_id = hgnc_client.get_hgnc_id('MAP2K1')
mek_up_id = hgnc_client.get_uniprot_id(mek_hgnc_id)


def test_process_pybel():
    """Processing the PyBEL example EGF graph must yield some Statements."""
    pbp = bel.process_pybel_graph(egf_graph)
    assert pbp.statements


def test_process_jgif():
    """Download a CBN JGIF file and check that it can be processed.

    The file is downloaded into the current working directory and is
    removed afterwards even if processing raises.
    """
    test_file_url = 'https://s3.amazonaws.com/bigmech/travis/Hox-2.0-Hs.jgf'
    test_file = 'Hox-2.0-Hs.jgf'
    request.urlretrieve(url=test_file_url, filename=test_file)
    try:
        process_cbn_jgif_file(test_file)
    finally:
        # Clean up, so that a failure in processing does not leave the
        # downloaded file behind
        os.remove(test_file)
def _get_up_id(hgnc_id):
    """Return the UniProt ID for an HGNC ID, logging if there is none.

    The (falsy) lookup result is returned unchanged when no UniProt ID is
    found.
    """
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        return uniprot_id
    logger.info("No Uniprot ID for HGNC ID %s" % hgnc_id)
    return uniprot_id
def get_agent(concept, entity):
    """Return an Agent for a BEL concept and entity URI pair.

    The Agent's name is the term taken from the concept URI unless the
    name space taken from the entity URI provides a mapped name (a family
    or complex mapping, or the HGNC name for an Entrez ID). Groundings
    found along the way are stored in the Agent's db_refs.
    """
    name = term_from_uri(concept)
    namespace = namespace_from_uri(entity)
    db_refs = {}
    # The term itself is the name unless a branch below overrides it
    agent_name = name

    if namespace == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if hgnc_id is None:
            logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % name)
        else:
            db_refs['HGNC'] = str(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
    elif namespace in ('MGI', 'RGD'):
        db_refs[namespace] = name
    elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
        # Families (PFH, SFAM) and complexes (NCH, SCOMP) are mapped the
        # same way and differ only in the warning message
        indra_name = bel_to_indra.get(name)
        db_refs[namespace] = name
        if indra_name is None:
            if namespace in ('PFH', 'SFAM'):
                msg = 'Could not find mapping for BEL family: %s' % name
            else:
                msg = 'Could not find mapping for BEL complex: %s' % name
            logger.warning(msg)
        else:
            db_refs['BE'] = indra_name
            db_refs['TEXT'] = name
            agent_name = indra_name
    elif namespace == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
        else:
            logger.warning('CHEBI name %s not found in map.' % name)
    elif namespace == 'EGID':
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs['EGID'] = name
        if hgnc_id is None:
            logger.warning('Could not map EGID%s to HGNC.' % name)
            agent_name = 'E%s' % name
        else:
            db_refs['HGNC'] = str(hgnc_id)
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' %
                               (name, hgnc_id))
    else:
        logger.warning('Unhandled entity namespace: %s' % namespace)
        print('%s, %s' % (concept, entity))

    return Agent(agent_name, db_refs=db_refs)
def map_agents(self, stmts, do_rename=True):
    """Return copies of the Statements with Agents grounded via the map.

    Each Agent's 'TEXT' db_refs entry is looked up in ``self.gm``. Agents
    without a 'TEXT' entry or whose text is not in the map are left alone.
    If the text maps to None, the whole Statement is filtered out.
    Otherwise the Agent's db_refs are replaced with a copy of the map
    entry, complemented with an HGNC ID or UniProt ID where possible.

    Parameters
    ----------
    stmts : list
        The Statements to map; they are deep-copied, not modified.
    do_rename : Optional[bool]
        If True, the mapped Agent is renamed to its 'BE' entry if it has
        one, else to the HGNC symbol from the map, else to the gene name
        obtained from UniProt. Default: True

    Returns
    -------
    list
        The mapped copies of the Statements that were not filtered out.

    Raises
    ------
    ValueError
        If a gene symbol in the map has no HGNC ID, if a UniProt ID in the
        map has no gene name, or if that gene name does not match the HGNC
        symbol given alongside it.
    """
    # Make a copy of the stmts
    mapped_stmts = []
    num_skipped = 0
    # Iterate over the statements
    for stmt in stmts:
        mapped_stmt = deepcopy(stmt)
        # Iterate over the agents
        skip_stmt = False
        for agent in mapped_stmt.agent_list():
            if agent is None or agent.db_refs.get('TEXT') is None:
                continue
            agent_text = agent.db_refs.get('TEXT')
            # Look this string up in the grounding map
            # If not in the map, leave agent alone and continue
            try:
                map_db_refs = self.gm[agent_text]
            except KeyError:
                continue
            # If it's in the map but it maps to None, then filter out
            # this statement by skipping it
            if map_db_refs is None:
                # Increase counter if this statement has not already
                # been skipped via another agent
                if not skip_stmt:
                    num_skipped += 1
                logger.debug("Skipping %s" % agent_text)
                skip_stmt = True
            # If it has a value that's not None, map it and add it
            else:
                # Otherwise, update the agent's db_refs field
                gene_name = None
                # Work on a copy so the grounding map itself is not changed
                map_db_refs = deepcopy(self.gm.get(agent_text))
                up_id = map_db_refs.get('UP')
                # The map's 'HGNC' entry is treated as a gene symbol here
                hgnc_sym = map_db_refs.get('HGNC')
                # Only a Uniprot ID: try to add the HGNC ID via gene name
                if up_id and not hgnc_sym:
                    gene_name = uniprot_client.get_gene_name(up_id, False)
                    if gene_name:
                        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                        if hgnc_id:
                            map_db_refs['HGNC'] = hgnc_id
                elif hgnc_sym and not up_id:
                    # Override the HGNC symbol entry from the grounding
                    # map with an HGNC ID
                    hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                    if hgnc_id:
                        map_db_refs['HGNC'] = hgnc_id
                        # Now get the Uniprot ID for the gene
                        up_id = hgnc_client.get_uniprot_id(hgnc_id)
                        if up_id:
                            map_db_refs['UP'] = up_id
                    # If there's no HGNC ID for this symbol, raise an
                    # Exception
                    else:
                        raise ValueError('No HGNC ID corresponding to gene '
                                         'symbol %s in grounding map.' %
                                         hgnc_sym)
                # If we have both, check the gene symbol ID against the
                # mapping from Uniprot
                elif up_id and hgnc_sym:
                    # Get HGNC Symbol from Uniprot
                    gene_name = uniprot_client.get_gene_name(up_id)
                    if not gene_name:
                        raise ValueError('No gene name found for Uniprot '
                                         'ID %s (expected %s)' %
                                         (up_id, hgnc_sym))
                    # We got gene name, compare it to the HGNC name
                    else:
                        if gene_name != hgnc_sym:
                            raise ValueError('Gene name %s for Uniprot ID '
                                             '%s does not match HGNC '
                                             'symbol %s given in grounding '
                                             'map.' %
                                             (gene_name, up_id, hgnc_sym))
                        else:
                            # NOTE(review): hgnc_id is only validated in
                            # this branch; map_db_refs['HGNC'] keeps the
                            # symbol from the grounding map rather than
                            # the ID -- confirm this is intended
                            hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                            if not hgnc_id:
                                raise ValueError('No HGNC ID '
                                                 'corresponding to gene '
                                                 'symbol %s in grounding '
                                                 'map.' % hgnc_sym)
                # Assign the DB refs from the grounding map to the agent
                agent.db_refs = map_db_refs
                # Are we renaming right now?
                if do_rename:
                    # If there's a Bioentities ID, prefer that for the name
                    if agent.db_refs.get('BE'):
                        agent.name = agent.db_refs.get('BE')
                    # Get the HGNC symbol or gene name (retrieved above)
                    elif hgnc_sym is not None:
                        agent.name = hgnc_sym
                    elif gene_name is not None:
                        agent.name = gene_name
        # Check if we should skip the statement
        if not skip_stmt:
            mapped_stmts.append(mapped_stmt)
    logger.info('%s statements filtered out' % num_skipped)
    return mapped_stmts