def _check_agent_mod(self, agent, mods, do_methionine_offset=True,
                     do_orthology_mapping=True, do_isoform_mapping=True):
    """Check an agent for invalid sites and look for mappings.

    Each modification site is first looked up in ``self._cache``; if it is
    not cached, it is verified against Uniprot. A site that fails
    verification is mapped, in order of preference, through PhosphoSitePlus
    (alternative isoform, mouse ortholog, rat ortholog, methionine offset)
    and finally through the curated site map (``self.site_map``).

    Parameters
    ----------
    agent : :py:class:`indra.statements.Agent`
        Agent to check for invalid modification sites.
    mods : list of :py:class:`indra.statements.ModCondition`
        Modifications to check for validity and map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.

    Returns
    -------
    list
        A list of invalid sites, where each entry in the list has two
        elements: ((gene_name, residue, position), mapped_site).
        If no mapping was found for the invalid position, mapped_site is
        None; otherwise it is a tuple consisting of
        (residue, position, comment). An empty list is returned when no
        Uniprot ID can be determined for the agent.
    """
    invalid_sites = []
    up_id = _get_uniprot_id(agent)
    # If the uniprot entry is not found, let it pass
    if not up_id:
        logger.debug("No uniprot ID for %s", agent.name)
        return []  # Same effect as valid sites

    def _infer_site(residue, position):
        """Return a PhosphoSitePlus-derived mapping for the site, or None."""
        # First, look for other entries in phosphosite for this protein
        # where this sequence position is legit (i.e., other isoforms)
        if do_isoform_mapping:
            human_pos = phosphosite_client.map_to_human_site(
                up_id, residue, position)
            if human_pos:
                return (residue, human_pos, 'INFERRED_ALTERNATIVE_ISOFORM')
        # Try looking for mouse, then rat sites. The rat ID is only
        # requested if the mouse lookup did not produce a mapping.
        if do_orthology_mapping:
            orthologs = ((uniprot_client.get_mouse_id, 'INFERRED_MOUSE_SITE'),
                         (uniprot_client.get_rat_id, 'INFERRED_RAT_SITE'))
            for get_ortholog_id, comment in orthologs:
                ortholog_id = get_ortholog_id(up_id)
                # Without an ortholog ID there is nothing to look up
                if not ortholog_id:
                    continue
                human_pos = phosphosite_client.map_to_human_site(
                    ortholog_id, residue, position)
                if human_pos:
                    return (residue, human_pos, comment)
        # Check for methionine offset (off by one)
        if do_methionine_offset:
            try:
                offset_pos = str(int(position) + 1)
            except ValueError:
                # A non-integer position only rules out the offset check;
                # the caller still consults the curated site map.
                logger.warning("Invalid position: %s", position)
            else:
                human_pos = phosphosite_client.map_to_human_site(
                    up_id, residue, offset_pos)
                if human_pos:
                    return (residue, human_pos,
                            'INFERRED_METHIONINE_CLEAVAGE')
        return None

    # Look up all of the modifications in uniprot, and add them to the list
    # of invalid sites if they are missing
    for old_mod in mods:
        position = old_mod.position
        residue = old_mod.residue
        # If no site information for this residue, skip
        if position is None or residue is None:
            continue
        site_key = (agent.name, residue, position)
        # Increase our count for this site
        self._sitecount[site_key] = self._sitecount.get(site_key, 0) + 1
        # First, check the cache to potentially avoid a costly sequence
        # lookup. A cached value of None (no mapping found earlier) is
        # indistinguishable from a miss here, so such sites are re-checked.
        cached_site = self._cache.get(site_key)
        if cached_site is not None:
            if cached_site != 'VALID':
                invalid_sites.append((site_key, cached_site))
            continue
        # Not cached: look up the residue/position in uniprot
        if uniprot_client.verify_location(up_id, residue, position):
            self._cache[site_key] = 'VALID'
            continue
        # The site is invalid. Keep using the Uniprot ID resolved above:
        # re-reading agent.db_refs['UP'] here would discard that value for
        # this and every following modification.
        hgnc_id = agent.db_refs.get('HGNC')
        if not hgnc_id:
            logger.debug("No HGNC ID for %s, only curated sites will be "
                         "mapped", agent.name)
        mapped_site = None
        # NOTE: The following lookups can only be performed if the
        # Phosphosite Data is available.
        if phosphosite_client.has_data() and hgnc_id:
            mapped_site = _infer_site(residue, position)
        # Now check the site map; it yields None if there is no entry
        if mapped_site is None:
            mapped_site = self.site_map.get(site_key)
        self._cache[site_key] = mapped_site
        invalid_sites.append((site_key, mapped_site))
    return invalid_sites
def test_rat_from_human():
    """The human Uniprot ID P04049 should resolve to the rat ID P11345."""
    human_id = 'P04049'
    expected_rat_id = 'P11345'
    rat_id = uniprot_client.get_rat_id(human_id)
    assert rat_id == expected_rat_id