def generate_uniprot_terms(download=False):
    """Generate synonym Terms for reviewed human UniProt entries.

    The UniProt table is cached as ``up_synonyms.tsv`` under
    ``resource_dir``. It is downloaded when the file does not exist or
    when ``download`` is True.

    Parameters
    ----------
    download : bool
        If True, the resource file is downloaded again even if a cached
        copy exists.

    Returns
    -------
    list
        One Term (status 'synonym', source 'uniprot') per protein name of
        every row that has a primary gene name, skipping EC codes. Terms
        are grounded to HGNC when the UniProt ID maps to an HGNC ID, and
        to UP otherwise.

    Raises
    ------
    requests.HTTPError
        If a download is attempted and the server answers with an error
        status.
    """
    path = os.path.join(resource_dir, 'up_synonyms.tsv')
    if not os.path.exists(path) or download:
        url = ('https://www.uniprot.org/uniprot/?format=tab&columns=id,'
               'genes(PREFERRED),protein%20names&sort=score&'
               'fil=organism:"Homo%20sapiens%20(Human)%20[9606]"'
               '%20AND%20reviewed:yes')
        logger.info('Downloading UniProt resource file')
        res = requests.get(url)
        # Fail before writing so that an error page is never cached as
        # the resource file
        res.raise_for_status()
        with open(path, 'w') as fh:
            fh.write(res.text)
    terms = []
    for row in read_csv(path, delimiter='\t', header=True):
        names = parse_uniprot_synonyms(row['Protein names'])
        up_id = row['Entry']
        # NOTE(review): confirm this key matches the header of the
        # downloaded file exactly, including the number of spaces
        standard_name = row['Gene names (primary )']
        # We skip a small number of not critical entries that don't have
        # standard names
        if not standard_name:
            continue
        # Ground to UniProt by default and switch to HGNC when a mapping
        # is available
        ns = 'UP'
        term_id = up_id
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            ns = 'HGNC'
            term_id = hgnc_id
            standard_name = hgnc_client.get_hgnc_name(hgnc_id)
        for name in names:
            # Skip names that are EC codes
            if name.startswith('EC '):
                continue
            terms.append(Term(normalize(name), name, ns, term_id,
                              standard_name, 'synonym', 'uniprot'))
    return terms
def _extract_protein(self, line):
    """Return an Agent for the protein described by a data line.

    Returns None when the line has no protein HMS LINCS ID.
    """
    agent_name = line['Protein Name']
    lincs_id = line['Protein HMS LINCS ID']

    # Some lines are missing protein information, these yield no Agent
    if not lincs_id:
        return None

    # Start from the references available for this LINCS protein ID
    refs = dict(self._lc.get_protein_refs(lincs_id))

    # The resource only provides a UP ID (not HGNC), so we try to derive
    # the HGNC ID from it and standardize the name to the gene name
    uniprot_id = refs.get('UP')
    if uniprot_id:
        hgnc_id = uniprot_client.get_hgnc_id(uniprot_id)
        if hgnc_id:
            refs['HGNC'] = hgnc_id
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            # Keep the name from the line if UniProt has no gene name
            symbol = uniprot_client.get_gene_name(uniprot_id)
            agent_name = symbol or agent_name

    return Agent(agent_name, db_refs=refs)
def _get_agent(self, ent_name, ent_type, id, database):
    """Return an Agent for an entity given its name, type, ID and database.

    If the ID is a SIGNOR complex (a key of ``self.complex_map``), the
    first constituent is returned with the remaining constituents
    attached to it as bound conditions.

    Parameters
    ----------
    ent_name : str
        Entity name, used as the Agent name unless a standard name is
        found.
    ent_type : str
        Entity type; together with ``database`` it forms the key looked
        up in ``_type_db_map``.
    id : str
        Identifier of the entity in ``database``.
    database : str
        Name of the database that the ID comes from.

    Returns
    -------
    Agent
        The Agent representing the entity.

    Raises
    ------
    ValueError
        If the entity has a grounding type other than UP, is not a
        protein family, and comes from an unexpected database.
    """
    if database == 'SIGNOR' and id in self.complex_map:
        agents = self._get_complex_agents(id)
        # Return the first agent with the remaining agents as bound
        # conditions
        agent = agents[0]
        agent.bound_conditions = \
            [BoundCondition(a, True) for a in agents[1:]]
        return agent

    # NOTE(review): this lookup presumably raises KeyError for an unknown
    # (ent_type, database) pair - confirm against _type_db_map
    gnd_type = _type_db_map[(ent_type, database)]
    if gnd_type == 'UP':
        up_id = id
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            # Fall back to the given entity name so that the Agent is
            # never named None when UniProt has no gene name
            name = uniprot_client.get_gene_name(up_id) or ent_name
    # Map SIGNOR protein families to FamPlex families
    elif ent_type == 'proteinfamily':
        # Keep the SIGNOR family ID in db_refs
        db_refs = {database: id}
        key = (database, id)
        # Use SIGNOR name unless we have a mapping in FamPlex
        name = ent_name
        famplex_id = famplex_map.get(key)
        if famplex_id is None:
            logger.info('Could not find %s in FamPlex map' % str(key))
        else:
            db_refs['FPLX'] = famplex_id
            name = famplex_id
    # Other possible groundings are PUBCHEM, SIGNOR, etc.
    elif gnd_type is not None:
        if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                            'DRUGBANK'):
            raise ValueError('Unexpected database %s' % database)
        if database == 'PUBCHEM' and id.startswith('CID:'):
            # We take off the CID: prefix plus fix an issue with
            # SIGNOR's format in which it leaves extra spaces around
            # the ID, as in 'CID: 923'
            id = id[4:].strip()
        db_refs = {gnd_type: id}
        name = ent_name
    # If no grounding, include as an untyped/ungrounded node
    else:
        name = ent_name
        db_refs = {}
    return Agent(name, db_refs=db_refs)
def generate_famplex_terms(ignore_mappings=False):
    """Generate Terms from the FamPlex grounding map.

    Parameters
    ----------
    ignore_mappings : bool
        If True, MESH groundings are kept as MESH even when an entry for
        them exists in ``mesh_mappings``.

    Returns
    -------
    list
        One Term (status 'assertion', source 'famplex') per row whose
        groundings include FPLX, HGNC, UP, CHEBI, GO or MESH, checked in
        that order of priority. Other rows are skipped.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        txt = row[0]
        norm_txt = normalize(txt)
        # After the text, the row alternates name space and ID columns;
        # pairs with an empty member are dropped
        groundings = dict(pair for pair in zip(row[1::2], row[2::2])
                          if all(pair))
        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            term = Term(norm_txt, txt, 'FPLX', fplx_id, fplx_id,
                        'assertion', 'famplex')
        elif 'HGNC' in groundings:
            # The grounding map gives the HGNC symbol, the Term needs
            # the HGNC ID
            symbol = groundings['HGNC']
            term = Term(norm_txt, txt, 'HGNC',
                        hgnc_client.get_hgnc_id(symbol), symbol,
                        'assertion', 'famplex', '9606')
        elif 'UP' in groundings:
            term_ns = 'UP'
            term_id = groundings['UP']
            std_name = term_id
            organism = None
            if uniprot_client.is_human(term_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(term_id)
                if hgnc_id:
                    # Human proteins are re-grounded to HGNC if possible
                    std_name = hgnc_client.get_hgnc_name(hgnc_id)
                    term_ns = 'HGNC'
                    term_id = hgnc_id
                else:
                    logger.warning('No gene name for %s' % term_id)
            # TODO: should we add organism info here?
            term = Term(norm_txt, txt, term_ns, term_id, std_name,
                        'assertion', 'famplex', organism)
        elif 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The name lookup is given the ID without its first six
            # characters
            chebi_name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            term = Term(norm_txt, txt, 'CHEBI', chebi_id, chebi_name,
                        'assertion', 'famplex')
        elif 'GO' in groundings:
            go_id = groundings['GO']
            term = Term(norm_txt, txt, 'GO', go_id,
                        go_client.get_go_label(go_id),
                        'assertion', 'famplex')
        elif 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mesh_mapping = mesh_mappings.get(mesh_id)
            if mesh_mapping and not ignore_mappings:
                # Use the mapped name space, ID and name instead of MESH
                term_ns, term_id, std_name = mesh_mapping
            else:
                term_ns = 'MESH'
                term_id = mesh_id
                std_name = mesh_client.get_mesh_name(mesh_id)
            term = Term(norm_txt, txt, term_ns, term_id, std_name,
                        'assertion', 'famplex')
        else:
            # TODO: handle HMDB, PUBCHEM, CHEMBL
            continue
        terms.append(term)
    return terms
def _agent_from_id(db_id):
    """Return an Agent for an Ensembl protein ID or a UniProt ID.

    Returns None if the ID is treated as a UniProt ID but no gene name
    can be found for it.
    """
    # There are some Ensembl protein IDs which we currently can't
    # normalize to anything else (unlike ENSG), these are used as is
    if db_id.startswith('ENSP'):
        return Agent(db_id, db_refs={'ENSEMBL': db_id})

    # All other entries are UniProt IDs
    gene_name = uniprot_client.get_gene_name(db_id)
    if not gene_name:
        return None
    refs = {'UP': db_id}
    hgnc_id = uniprot_client.get_hgnc_id(db_id)
    if hgnc_id:
        refs['HGNC'] = hgnc_id
    return Agent(gene_name, db_refs=refs)
def _initialize_node_agents(self):
    """Fill the node name and node Agent lookups from the CX nodes.

    Nodes with a UP alias are named after the corresponding gene. Other
    nodes keep their own name, which is looked up as an HGNC symbol.
    Nodes whose name is not a valid symbol get an Agent only if
    ``self.require_grounding`` is False.
    """
    ungrounded = []
    for node in _get_dict_from_list('nodes', self.cx):
        node_id = node['@id']
        aliases = self.get_aliases(node)
        label = node['n']
        uniprot_id = aliases.get('UP')

        if uniprot_id:
            refs = {'UP': uniprot_id, 'TEXT': label}
            hgnc_id = uniprot_client.get_hgnc_id(uniprot_id)
            if hgnc_id:
                refs['HGNC'] = hgnc_id
                symbol = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                symbol = uniprot_client.get_gene_name(uniprot_id)
            node_agent = Agent(symbol, db_refs=refs)
            self._node_names[node_id] = symbol
            self._node_agents[node_id] = node_agent
            continue

        # Without a UP alias, we try the node name as an HGNC symbol
        self._node_names[node_id] = label
        hgnc_id = hgnc_client.get_hgnc_id(label)
        refs = {'TEXT': label}
        if hgnc_id:
            refs['HGNC'] = hgnc_id
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                refs['UP'] = uniprot_id
            self._node_agents[node_id] = Agent(label, db_refs=refs)
        else:
            if not self.require_grounding:
                self._node_agents[node_id] = Agent(label, db_refs=refs)
            ungrounded.append(label)

    if ungrounded:
        if self.require_grounding:
            verb = 'Skipped'
        else:
            verb = 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(ungrounded)))
def _get_complex_agents(self, complex_id):
    """Return a list of Agents, one per constituent of a SIGNOR complex.

    Parameters
    ----------
    complex_id : str
        The ID of the complex, passed to
        ``self._recursively_lookup_complex`` to get its constituents.

    Returns
    -------
    list
        One Agent per constituent. Constituents starting with 'CHEBI'
        are grounded to CHEBI, those with a UniProt gene name to UP (and
        HGNC if available), all others to SIGNOR. A FPLX grounding is
        added whenever the constituent is in the FamPlex map. The Agent
        name is the first available of the standard name, the FamPlex
        ID and the constituent ID itself.
    """
    agents = []
    components = self._recursively_lookup_complex(complex_id)

    for c in components:
        db_refs = {}
        if c.startswith('CHEBI'):
            db_refs['CHEBI'] = c
            name = chebi_client.get_chebi_name_from_id(c)
        else:
            name = uniprot_client.get_gene_name(c)
            if name is None:
                db_refs['SIGNOR'] = c
            else:
                db_refs['UP'] = c
                hgnc_id = uniprot_client.get_hgnc_id(c)
                if hgnc_id:
                    name = hgnc_client.get_hgnc_name(hgnc_id)
                    db_refs['HGNC'] = hgnc_id

        famplex_key = ('SIGNOR', c)
        if famplex_key in famplex_map:
            db_refs['FPLX'] = famplex_map[famplex_key]
            if not name:
                # Set agent name to Famplex name if
                # the Uniprot name is not available
                name = db_refs['FPLX']
        elif not name:
            # We neither have a Uniprot nor Famplex grounding
            logger.info('Have neither a Uniprot nor Famplex grounding '
                        'for "%s" in complex %s', c, complex_id)
        if not name:
            # Use the constituent ID as the name of last resort. This is
            # the same value as the SIGNOR grounding when there is one,
            # and it also covers CHEBI or UP constituents for which no
            # name could be found and which have no SIGNOR grounding.
            name = c
        agents.append(Agent(name, db_refs=db_refs))
    return agents
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict
        The grounding for the given entity, or None if no grounding
        could be determined.
    """
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
        return name, db_refs

    if ns in ('UNIPROT', 'UP'):
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID and
        # check if it's a mnemonic instead
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        else:
            up_id = uniprot_client.get_id_from_mnemonic(name)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            return hgnc_client.get_hgnc_name(hgnc_id), db_refs
        return uniprot_client.get_gene_name(up_id), db_refs

    if ns == 'FPLX':
        return name, {'FPLX': name}

    if ns in ('GO', 'GOBP', 'GOCC'):
        # This label is replaced before the GO ID lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        return go_client.get_go_label(go_id), {'GO': go_id}

    if ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        return mesh_name, {'MESH': mesh_id}

    # For now, MGI/RGD names are only mapped to UniProt IDs through
    # lookup tables.
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    if ns == 'MGI':
        up_id = mouse_lookup.get(name)
        return name, ({'UP': up_id} if up_id else None)

    if ns == 'RGD':
        up_id = rat_lookup.get(name)
        return name, ({'UP': up_id} if up_id else None)

    # Map Selventa families and complexes to FamPlex
    if ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
            return name, db_refs
        db_refs['FPLX'] = indra_name
        return indra_name, db_refs

    if ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
            return name, db_refs
        db_refs['FPLX'] = indra_name
        return indra_name, db_refs

    # Map Entrez genes to HGNC/UP
    if ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is None:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            return 'E%s' % name, db_refs
        db_refs['HGNC'] = hgnc_id
        name = hgnc_client.get_hgnc_name(hgnc_id)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        else:
            logger.info(
                'HGNC entity %s with HGNC ID %s has no '
                'corresponding Uniprot ID.', name, hgnc_id)
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
        return name, db_refs

    if ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        return name, db_refs

    if ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to
        # IDs, and if that fails, INDRA's ChEBI name to ID mapping
        chebi_id = (chebi_name_id.get(name) or
                    chebi_client.get_chebi_id_from_name(name))
        if not chebi_id:
            logger.info('CHEBI name %s not found in map.' % name)
            return name, None
        return name, {'CHEBI': chebi_id}

    # These appear in the name slot but are actually IDs
    if ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        return (chebi_client.get_chebi_name_from_id(chebi_id),
                {'CHEBI': chebi_id})

    # SDIS, SCHEM, TEXT: Include the name as the ID for the namespace
    if ns in ('SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    if ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if not tid:
            logger.info('Could not get taxonomy ID for %s' % name)
            return name, None
        return name, {'TAXONOMY': tid}

    logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, None
def fix_agent(agent):
    """Fix naming and grounding issues in an Agent, changes Agent in place.

    Name spaces in the Agent's db_refs are normalized first, then the
    name is standardized based on the FPLX, HGNC or UP grounding, in
    that order of precedence. Nothing is done if the Agent is None.
    """
    if agent is None:
        return

    # Name spaces that are renamed with the ID kept unchanged
    renamed_ns = {'FA': 'NXPFA', 'IPR': 'IP', 'PCID': 'PUBCHEM'}
    fixed_refs = copy(agent.db_refs)
    for ns, ref_id in agent.db_refs.items():
        if ns == 'CHEBI':
            # Make sure CHEBI prefix is there
            if not ref_id.startswith('CHEBI:'):
                fixed_refs['CHEBI'] = 'CHEBI:%s' % ref_id
        elif ns == 'GO':
            # Make sure GO prefix is there
            fixed_refs['GO'] = \
                ref_id if ref_id.startswith('GO:') else 'GO:' + ref_id
        elif ns == 'XFAM':
            # Change XFAM name space, keeping only the part of the ID
            # before the first period
            fixed_refs.pop('XFAM', None)
            fixed_refs['PF'] = ref_id.split('.')[0]
        elif ns in renamed_ns:
            fixed_refs.pop(ns, None)
            fixed_refs[renamed_ns[ns]] = ref_id
    agent.db_refs = fixed_refs

    # Handle old BE mappings and check if we have a FPLX entry
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    be_id = agent.db_refs.get('FPLX')
    # Otherwise try to map to FPLX from any of the other groundings,
    # taking the first one that has a mapping
    if not be_id:
        for grounding in agent.db_refs.items():
            be_id = famplex_map.get(grounding)
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            target_ns, target_id = target[0], target[1]
            agent.db_refs[target_ns] = target_id
    # If the name is an UP ID, add it as UP grounding
    if agent.name and 'UP' not in agent.db_refs \
            and 'FPLX' not in agent.db_refs:
        if uniprot_client.get_gene_name(agent.name, web_fallback=False):
            agent.db_refs['UP'] = agent.name

    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    if not be_id and not hgnc_id and up_id == '':
        # This is a special case that happens sometimes where
        # db_refs['UP'] is an empty string and there is no other
        # grounding. In this case, we remove the empty UP grounding and
        # reset the name to the agent text.
        agent.name = agent.db_refs.get('TEXT', agent.name)
        agent.db_refs.pop('UP')
    elif be_id:
        # FPLX takes precedence if we have it
        agent.db_refs['FPLX'] = be_id
        agent.name = be_id
    elif hgnc_id:
        symbol = hgnc_client.get_hgnc_name(hgnc_id)
        if symbol:
            agent.name = symbol
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                # If multiple comma-separated IDs are given, keep only
                # the first one
                agent.db_refs['UP'] = up_id.split(', ')[0]
    elif up_id:
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            agent.db_refs['HGNC'] = hgnc_id
            agent.name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            symbol = uniprot_client.get_gene_name(up_id,
                                                  web_fallback=False)
            # If it doesn't have a gene name, it's better to just
            # use the raw string name, otherwise Sparser sets
            # Uniprot IDs or mnemonics as the name
            agent.name = symbol if symbol \
                else agent.db_refs.get('TEXT', agent.name)
def _get_agent_from_ref(self, ref):
    """Return an Agent constructed from a reference XML element.

    Parameters
    ----------
    ref : xml.etree.ElementTree.Element
        The element to process. Its 'category' attribute and the text of
        its ``var`` children named 'uid' and 'raw-text' are used.
        NOTE(review): the element type is inferred from the use of
        ``attrib`` and ``find`` - confirm against the caller.

    Returns
    -------
    Agent or None
        None is returned for collections and when neither a standard
        name nor a raw text is available to name the Agent.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        return None

    # Find the uid and raw-text tags and get their text content if
    # available
    uid_tag = ref.find("var/[@name='uid']")
    text_tag = ref.find("var/[@name='raw-text']")
    uid = uid_tag.text if uid_tag is not None and uid_tag.text else None
    raw_text = \
        text_tag.text if text_tag is not None and text_tag.text else None

    # TODO: factor this out and reuse fix_agents
    db_refs = {}
    # Save raw text if available
    if raw_text:
        db_refs['TEXT'] = raw_text
    agent_name = raw_text

    # If we have a proper UID, i.e., exactly one name space and one ID,
    # then we try to reconstruct an Agent from that
    uid_parts = uid.split(':') if uid is not None else []
    if len(uid_parts) == 2:
        db_ns, db_id = uid_parts
        be_id = famplex_map.get((db_ns, db_id))
        if be_id:
            db_refs[db_ns] = db_id
            db_refs['FPLX'] = be_id
            agent_name = be_id
        elif db_ns in ['UP', 'Uniprot']:
            # The ID may be given as a mnemonic, in which case we map it
            # to the actual ID
            id_from_mnemonic = uniprot_client.get_id_from_mnemonic(db_id)
            if id_from_mnemonic:
                db_id = id_from_mnemonic
            db_refs['UP'] = db_id
            hgnc_id = uniprot_client.get_hgnc_id(db_id)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
                agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
        elif db_ns == 'NCIT':
            db_refs['NCIT'] = db_id
            target = ncit_map.get(db_id)
            if target:
                db_refs[target[0]] = target[1]
                if target[0] == 'HGNC':
                    up_id = hgnc_client.get_uniprot_id(target[1])
                    agent_name = hgnc_client.get_hgnc_name(target[1])
                    if up_id:
                        db_refs['UP'] = up_id
                elif target[0] == 'UP':
                    agent_name = uniprot_client.get_gene_name(target[1])
                    if agent_name:
                        hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                        if hgnc_id:
                            db_refs['HGNC'] = hgnc_id
        elif db_ns == 'FA':
            db_refs['NXP'] = 'FA:' + db_id
        elif db_ns == 'XFAM':
            db_refs['PF'] = db_id.split('.')[0]
        elif db_ns == 'CHEBI':
            db_refs['CHEBI'] = 'CHEBI:' + db_id
        elif db_ns in ['GO', 'MESH', 'FPLX']:
            db_refs[db_ns] = db_id
        # Handle old BE mappings and add them as FPLX
        elif db_ns == 'BE':
            db_refs['FPLX'] = db_id
        elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
            db_refs[db_ns] = db_id
        else:
            logger.warning('Unknown database name space %s' % db_ns)

    # A name lookup above may have come back empty, in which case we
    # fall back on the raw text, which is never empty if it is set
    if not agent_name:
        if raw_text is None:
            return None
        agent_name = raw_text

    return Agent(agent_name, db_refs=db_refs)
def generate_uniprot_terms(download=False, organisms=None):
    """Generate name and synonym Terms for reviewed UniProt entries.

    The UniProt table is cached as ``up_synonyms.tsv`` under
    ``resource_dir``. It is downloaded when the file does not exist or
    when ``download`` is True.

    Parameters
    ----------
    download : bool
        If True, the resource file is downloaded again even if a cached
        copy exists.
    organisms : list of str or None
        Organism identifiers joined with ' OR ' to form the organism
        filter of the download. If not given, ``popular_organisms`` is
        used. This only has an effect when a download happens.

    Returns
    -------
    list
        For each row with a standard name: a 'synonym' Term per protein
        name (skipping EC codes and the standard name itself), a 'name'
        Term for the standard name, and a 'synonym' Term per gene
        synonym. Terms are grounded to HGNC when the UniProt ID maps to
        an HGNC ID, and to UP otherwise.

    Raises
    ------
    requests.HTTPError
        If a download is attempted and the server answers with an error
        status.
    """
    if not organisms:
        organisms = popular_organisms
    path = os.path.join(resource_dir, 'up_synonyms.tsv')
    if not os.path.exists(path) or download:
        org_filter_str = ' OR '.join(organisms)
        url = (f'https://www.uniprot.org/uniprot/?format=tab&columns=id,'
               f'genes(PREFERRED),genes(ALTERNATIVE),protein%20names,'
               f'organism-id&sort=score&'
               f'query=reviewed:yes&fil=organism:{org_filter_str}')
        logger.info('Downloading UniProt resource file')
        res = requests.get(url)
        # Fail before writing so that an error page is never cached as
        # the resource file
        res.raise_for_status()
        with open(path, 'w') as fh:
            fh.write(res.text)
    terms = []
    for row in read_csv(path, delimiter='\t', header=True):
        up_id = row['Entry']
        organism = row['Organism ID']
        protein_names = parse_uniprot_synonyms(row['Protein names'])
        # NOTE(review): confirm the two gene name keys below match the
        # header of the downloaded file exactly, including the number of
        # spaces
        primary_gene_name = row['Gene names (primary )'].strip()
        if primary_gene_name == ';':
            primary_gene_name = None
        gene_synonyms_str = row['Gene names (synonym )'].strip()
        if gene_synonyms_str == ';':
            gene_synonyms_str = None

        # We generally use the gene name as the standard name
        # except when there are multiple gene names (separated by
        # semi-colons) in which case we take the first protein name.
        if primary_gene_name and ';' not in primary_gene_name:
            standard_name = primary_gene_name
        else:
            # There may be no protein names either, in which case the
            # entry is skipped below instead of raising an IndexError
            standard_name = protein_names[0] if protein_names else None
        # We skip a small number of not critical entries that don't have
        # standard names
        if not standard_name:
            continue

        # Ground to UniProt by default and switch to HGNC when a mapping
        # is available
        ns = 'UP'
        term_id = up_id
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            ns = 'HGNC'
            term_id = hgnc_id
            standard_name = hgnc_client.get_hgnc_name(hgnc_id)

        for name in protein_names:
            # Skip names that are EC codes, and the standard name which
            # gets its own Term below
            if name.startswith('EC ') or name == standard_name:
                continue
            terms.append(Term(normalize(name), name, ns, term_id,
                              standard_name, 'synonym', 'uniprot',
                              organism))
        terms.append(Term(normalize(standard_name), standard_name, ns,
                          term_id, standard_name, 'name', 'uniprot',
                          organism))

        if gene_synonyms_str:
            # This is to deal with all the variations in which
            # synonyms are listed, including degenerate strings
            # like "; ;"
            for synonym_group in gene_synonyms_str.split('; '):
                for synonym in synonym_group.split(' '):
                    if not synonym or synonym == ';':
                        continue
                    terms.append(Term(normalize(synonym), synonym, ns,
                                      term_id, standard_name, 'synonym',
                                      'uniprot', organism))
    return terms
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        A dict with the keys 'entityText' (the raw text of the entity),
        'entityId' (None or a list of dicts, each with a 'source' and an
        'idString' key), 'charStart' and 'charEnd'.

    Returns
    -------
    agent : Agent or None
        The Agent with the groundings and mutations found among the
        entries. None if there is more than one Entrez or more than one
        UniProt entry.
    raw_coords : tuple or None
        The 'charStart' and 'charEnd' values of the entity, or None if
        no Agent is returned.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    entries = entity_info['entityId']
    if entries is None:
        entries = []

    # Entities with ambiguous gene or protein groundings are skipped
    ref_counts = Counter(entry['source'] for entry in entries)
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping',
                        raw_text, count, source)
            return None, None

    muts = []
    for id_dict in entries:
        source = id_dict['source']
        id_string = id_dict['idString']
        if source == 'Entrez':
            refs['EGID'] = id_string
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_string)
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        logger.info('HGNC:%s previously set does not'
                                    ' match HGNC:%s from EGID:%s',
                                    refs['HGNC'], hgnc_id, refs['EGID'])
                else:
                    refs['HGNC'] = hgnc_id
        elif source == 'UniProt':
            refs['UP'] = id_string
            hgnc_id = uniprot_client.get_hgnc_id(id_string)
            if hgnc_id:
                # Check to see if we have a conflict with an HGNC id
                # found from the Entrez id. If so, overwrite with this
                # one, in which we have greater faith.
                if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                    # At this point refs['HGNC'] still holds the ID from
                    # Entrez while hgnc_id is the one inferred from
                    # UniProt
                    logger.info('Inferred HGNC:%s from UP:%s does not'
                                ' match HGNC:%s from EGID:%s',
                                hgnc_id, refs['UP'], refs['HGNC'],
                                refs.get('EGID'))
                refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(id_string)
                if gene_name is not None:
                    name = gene_name
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_string
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_string
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_string
        # Handle mutations; entries without an entity type fall through
        # to the unhandled case below
        elif source == 'Unk' and \
                id_dict.get('entityType') == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_string.startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_string.split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    muts.append(MutCondition(pos, residue_from,
                                             residue_to))
                # Deliberately broad: a mutation that can't be parsed or
                # represented is logged and left out rather than failing
                # the whole entity
                except Exception:
                    logger.info('Could not process mutation %s',
                                id_string)
            else:
                logger.info('Unhandled mutation: %s', id_string)
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords