Пример #1
0
    def _get_name_by_id(self, entity_id):
        entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
        if entity_term is None:
            logger.debug('Term %s for entity not found' % entity_id)
            return None
        name = entity_term.find("name")
        if name is None:
            logger.debug('Entity without a name')
            return None
        try:
            dbid = entity_term.attrib["dbid"]
        except:
            #logger.debug('No grounding information for %s' % name.text)
            return self._get_valid_name(name.text)

        dbids = dbid.split('|')
        hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
        up_ids = [i for i in dbids if i.startswith('UP')]


        #TODO: handle protein families like 14-3-3 with IDs like
        # XFAM:PF00244.15, FA:00007
        if hgnc_ids:
            if len(hgnc_ids) > 1:
                lisp_str = entity_term.attrib.get('lisp')
                if lisp_str is None:
                    hgnc_id = re.match(r'HGNC\:([0-9]*)',
                                       hgnc_ids[0]).groups()[0]
                else:
                    parts = lisp_str.split('(TERM :ID ')
                    scores = {}
                    for p in parts:
                        res = re.findall('HGNC::\|(.*)\|', p)
                        if res:
                            hgnc_id = res[0]
                            score = re.findall(':SCORE ([^ ]+)', p)[0]
                            scores[hgnc_id] = float(score)
                    if scores:
                        sorted_ids = sorted(scores.items(), key=operator.itemgetter(1))
                        hgnc_id = sorted_ids[-1][0]
            else:
                hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
            hgnc_name = self._get_hgnc_name(hgnc_id)
            return self._get_valid_name(hgnc_name)
        elif up_ids:
            if len(hgnc_ids) > 1:
                logger.debug('%d UniProt IDs reported.' % len(up_ids))
            up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
            # First try to get HGNC name
            hgnc_name = up_client.get_hgnc_name(up_id)
            if hgnc_name is not None:
                return self._get_valid_name(hgnc_name)
            # Next, try to get the gene name
            gene_name = up_client.get_gene_name(up_id)
            if gene_name is not None:
                return self._get_valid_name(gene_name)
        # By default, return the text of the name tag
        name_txt = name.text.strip('|')
        return self._get_valid_name(name_txt)
Пример #2
0
 def _get_name_by_id(self, entity_id):
     entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
     name = entity_term.find("name")
     if name is None:
         warnings.warn('Entity without a name')
         return ''
     try:
         dbid = entity_term.attrib["dbid"]
     except:
         warnings.warn('No grounding information for %s' % name.text)
         return self._get_valid_component_name(name.text)
     dbids = dbid.split('|')
     hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
     up_ids = [i for i in dbids if i.startswith('UP')]
     #TODO: handle protein families like 14-3-3 with IDs like
     # XFAM:PF00244.15, FA:00007
     if hgnc_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids))
         hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
         hgnc_name = self._get_hgnc_name(hgnc_id)
         return self._get_valid_component_name(hgnc_name)
     elif up_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d UniProt IDs reported.' % len(up_ids))
         up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
         up_rdf = up_client.query_protein(up_id)
         # First try to get HGNC name
         hgnc_name = up_client.get_hgnc_name(up_rdf)
         if hgnc_name is not None:
             return self._get_valid_component_name(hgnc_name)
         # Next, try to get the gene name
         gene_name = up_client.get_gene_name(up_rdf)
         if gene_name is not None:
             return self._get_valid_component_name(gene_name)
     # By default, return the text of the name tag
     name_txt = name.text.strip('|')
     return self._get_valid_component_name(name_txt)
Пример #3
0
 def _get_name_by_id(self, entity_id):
     entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
     name = entity_term.find("name")
     if name is None:
         warnings.warn('Entity without a name')
         return ''
     try:
         dbid = entity_term.attrib["dbid"]
     except:
         warnings.warn('No grounding information for %s' % name.text)
         return self._get_valid_component_name(name.text)
     dbids = dbid.split('|')
     hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
     up_ids = [i for i in dbids if i.startswith('UP')]
     #TODO: handle protein families like 14-3-3 with IDs like
     # XFAM:PF00244.15, FA:00007
     if hgnc_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids))
         hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
         hgnc_name = self._get_hgnc_name(hgnc_id)
         return self._get_valid_component_name(hgnc_name)
     elif up_ids:
         if len(hgnc_ids) > 1:
             warnings.warn('%d UniProt IDs reported.' % len(up_ids))
         up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
         up_rdf = up_client.query_protein(up_id)
         # First try to get HGNC name
         hgnc_name = up_client.get_hgnc_name(up_rdf)
         if hgnc_name is not None:
             return self._get_valid_component_name(hgnc_name)
         # Next, try to get the gene name
         gene_name = up_client.get_gene_name(up_rdf)
         if gene_name is not None:
             return self._get_valid_component_name(gene_name)
     # By default, return the text of the name tag
     name_txt = name.text.strip('|')
     return self._get_valid_component_name(name_txt)
Пример #4
0
    def _get_agent_from_entity(self, entity_id):
        qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % entity_id
        res = self.tree.execute(qstr)
        if res is None:
            return None
        try:
            entity_term = res.next()
        except StopIteration:
            logger.debug(' %s is not an entity' % entity_id)
            return None
        # This is the default name, which can be overwritten 
        # below for specific database entries
        agent_name = self._get_valid_name(entity_term['text'])
        db_refs = {}
        for xr in entity_term['xrefs']:
            ns = xr['namespace']
            if ns == 'uniprot':
                up_id = xr['id']
                db_refs['UP'] = up_id
                # Look up official names in UniProt
                hgnc_name = up_client.get_hgnc_name(up_id)
                if hgnc_name is not None:
                    agent_name = self._get_valid_name(hgnc_name)
                else:
                    gene_name = up_client.get_gene_name(up_id)
                    if gene_name is not None:
                        agent_name = self._get_valid_name(gene_name)
            elif ns == 'interpro':
                db_refs['IP'] = xr['id']
            elif ns == 'chebi':
                db_refs['CHEBI'] = xr['id'][6:]
            elif ns == 'go':
                db_refs['GO'] = xr['id'][3:]
            elif ns == 'hmdb':
                db_refs['HMDB'] = xr['id'][4:]
        db_refs['TEXT'] = entity_term['text']

        mod_terms = entity_term.get('modifications')
        mods = []
        muts = []
        if mod_terms is not None:
            for m in mod_terms:
                if m['type'].lower() == 'mutation':
                    # Evidence is usualy something like "V600E"
                    # We could parse this to get the amino acid
                    # change that happened.
                    mutation_str = m.get('evidence')
                    # TODO: sometimes mutation_str is "mutant", "Mutant",
                    # "mutants" - this indicates that there is a mutation
                    # but not the specific type. We should encode this
                    # somehow as a "blank" mutation condition
                    mut = self._parse_mutation(mutation_str)
                    if mut is not None:
                        muts.append(mut)
                elif m['type'].lower() == 'phosphorylation' or\
                     m['type'].lower() == 'phosphorylated':
                    site = m.get('site')
                    if site is not None:
                        mod_res, mod_pos = self._parse_site_text(site)
                        mod = ModCondition('phosphorylation', mod_res, mod_pos)
                        mods.append(mod)
                    else:
                        mods.append(ModCondition('phosphorylation'))
                elif m['type'].lower() == 'ubiquitination':
                    mods.append(ModCondition('ubiquitination'))
                else:
                    logger.warning('Unhandled entity modification type: %s' %
                                   m['type'])

        agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
        return agent
Пример #5
0
def test_get_hgnc_name_nonhuman():
    """get_hgnc_name returns None for 'P31938' (the non-human case)."""
    assert uniprot_client.get_hgnc_name('P31938') is None
Пример #6
0
def test_get_hgnc_name_human():
    """get_hgnc_name returns 'EGFR' for 'P00533' (the human case)."""
    looked_up = uniprot_client.get_hgnc_name('P00533')
    assert looked_up == 'EGFR'
Пример #7
0
def test_query_protein_deprecated():
    """query_protein still returns a result and the HGNC lookup works.

    Checks 'Q8NHX1': query_protein must not return None, and
    get_hgnc_name must give 'MAPK3' for the same id.
    """
    query_result = uniprot_client.query_protein('Q8NHX1')
    assert query_result is not None
    resolved_name = uniprot_client.get_hgnc_name('Q8NHX1')
    assert resolved_name == 'MAPK3'