예제 #1
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Reconcile the UP/HGNC entries of a grounding and assign it to an agent.

     The 'HGNC' value of ``map_db_refs`` is interpreted as a gene symbol on
     input. Depending on which of 'UP' and 'HGNC' are given, the missing
     entry is filled in where possible and the symbol is replaced by the
     HGNC ID. The dict is modified in place and becomes ``agent.db_refs``.

     Parameters
     ----------
     agent :
         Object whose ``db_refs`` (and, when renaming, ``name``) are set.
     map_db_refs : dict
         Grounding to standardize and assign to the agent.
     do_rename : bool
         If True, the agent is renamed to its FPLX entry, else to the given
         HGNC symbol, else to the gene name looked up from UniProt.

     Raises
     ------
     ValueError
         If only a symbol is given and it has no HGNC ID, if both entries
         are given and no gene name is found for the UniProt ID, or if that
         gene name differs from the given symbol.
     """
     up_id = map_db_refs.get('UP')
     hgnc_sym = map_db_refs.get('HGNC')
     gene_name = None

     if up_id and hgnc_sym:
         # Both given: the symbol has to agree with the UniProt gene name
         gene_name = uniprot_client.get_gene_name(up_id)
         if not gene_name:
             raise ValueError('No gene name found for Uniprot ID %s '
                              '(expected %s)' % (up_id, hgnc_sym))
         if gene_name != hgnc_sym:
             raise ValueError('Gene name %s for Uniprot ID %s does not match '
                              'HGNC symbol %s given in grounding map.'
                              % (gene_name, up_id, hgnc_sym))
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if hgnc_id:
             map_db_refs['HGNC'] = hgnc_id
         else:
             # A missing ID is only logged here, the symbol stays in place
             logger.error('No HGNC ID corresponding to gene symbol %s in '
                          'grounding map.' % hgnc_sym)
     elif up_id:
         # Only UniProt given: try to derive the HGNC ID from the gene name
         gene_name = uniprot_client.get_gene_name(up_id, False)
         if gene_name:
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 map_db_refs['HGNC'] = hgnc_id
     elif hgnc_sym:
         # Only a symbol given: it must resolve to an HGNC ID
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if not hgnc_id:
             raise ValueError('No HGNC ID corresponding to gene symbol %s in '
                              'grounding map.' % hgnc_sym)
         map_db_refs['HGNC'] = hgnc_id
         # Complete the grounding with the UniProt ID of the gene
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             map_db_refs['UP'] = up_id

     # The (possibly updated) grounding map entry becomes the agent grounding
     agent.db_refs = map_db_refs
     if not do_rename:
         return

     fplx_id = agent.db_refs.get('FPLX')
     if fplx_id:
         # A FamPlex ID takes precedence over any gene-level name
         agent.name = fplx_id
     elif hgnc_sym is not None:
         agent.name = hgnc_sym
     elif gene_name is not None:
         agent.name = gene_name
     return
예제 #2
0
 def test_specific_query(self):
     """Test whether we can get a "fully" specified statement.

     The query fixes subject, object and statement type; the response is
     handed to ``_check_stmt_agents`` together with the HGNC IDs looked up
     for both gene symbols.
     """
     resp = self.__check_good_statement_query(object='MAP2K1',
                                              subject='MAPK1',
                                              type='Phosphorylation')
     expected = [(position, 'HGNC', hgnc_client.get_hgnc_id(symbol))
                 for position, symbol in enumerate(('MAPK1', 'MAP2K1'))]
     _check_stmt_agents(resp, agents=expected)
예제 #3
0
def read_phosphosite(fname):
    """Build Phosphorylation statements from a kinase-substrate CSV table.

    Parameters
    ----------
    fname : str or file-like
        Anything accepted by ``pandas.read_csv``. Each row is read through
        the columns SUB_ID, SUB_GENE, Actual_site, phosphosite, KIN_ID and
        KINASE_GENE_SYMBOL.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row, except rows whose kinase
        UniProt ID is reported as non-human, which are skipped.
    antibody_map : dict
        Maps each value of the ``phosphosite`` column to the list of
        distinct phosphorylated substrate agents seen for it. The map is
        filled in before the kinase check, so it also covers skipped rows.
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Ground the substrate from its UniProt ID if available, otherwise
        # from its gene symbol. Both branches define the same two names
        # (symbol and HGNC ID), which the Agent below relies on.
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            sub_hgnc_id = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc_id)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc_id})

        # The first character is the residue, the remainder the position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        # Record the modified substrate as a readout of this antibody
        # unless an equivalent one is already registered
        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]
        readouts = antibody_map.setdefault(row['phosphosite'], [])
        already_known = any(
            p.name == sub.name and p.mods[0].residue == residue
            and p.mods[0].position == position
            for p in readouts
        )
        if not already_known:
            readouts.append(sub_readout)

        # Ground the kinase the same way as the substrate
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            kin_hgnc_id = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc_id)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc_id})

        ev = Evidence(source_api='phosphosite')
        statements.append(
            Phosphorylation(kin, sub, residue, position, evidence=[ev]))
    return statements, antibody_map
예제 #4
0
 def test_query_with_two_agents(self):
     """Test a query where the roles of the agents are not given.

     Both agents are expected with ``None`` in the first slot of the
     tuples passed to ``_check_stmt_agents``.
     """
     resp = self.__check_good_statement_query(agent0='MAP2K1',
                                              agent1='MAPK1',
                                              type='Phosphorylation')
     expected = [(None, 'HGNC', hgnc_client.get_hgnc_id(symbol))
                 for symbol in ('MAPK1', 'MAP2K1')]
     _check_stmt_agents(resp, agents=expected)
예제 #5
0
    def get_agent(self, acsn_agent: str) -> Union[Agent, None]:
        """Return an INDRA Agent corresponding to an ACSN agent.

        Parameters
        ----------
        acsn_agent :
            Agent extracted from the relations statement data frame

        Returns
        -------
        :
            An agent grounded to HGNC when the ACSN agent maps to a single
            entry that has an HGNC ID, or to FamPlex when it maps to
            several entries that are found in the FamPlex lookup. None is
            returned when no grounding is available.
        """
        mapping = self.correspondence_dict.get(acsn_agent)
        if not mapping:
            return None

        if len(mapping) != 1:
            # Several members: look the sorted member tuple up as a family
            fplx_id = self.fplx_lookup.get(tuple(sorted(mapping)))
            if not fplx_id:
                return None
            return get_standard_agent(fplx_id, db_refs={'FPLX': fplx_id})

        # Exactly one member: treat it as a gene symbol
        gene_symbol = mapping[0]
        hgnc_id = get_hgnc_id(gene_symbol)
        if not hgnc_id:
            return None
        return get_standard_agent(gene_symbol, db_refs={'HGNC': hgnc_id})
예제 #6
0
    def _get_complex_agents(self, complex_id):
        """Returns a list of agents corresponding to each of the constituents
        in a SIGNOR complex."""
        agents = []
        components = self._recursively_lookup_complex(complex_id)

        for c in components:
            db_refs = {}
            name = uniprot_client.get_gene_name(c)
            if name is None:
                db_refs['SIGNOR'] = c
            else:
                db_refs['UP'] = c
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id

            famplex_key = ('SIGNOR', c)
            if famplex_key in famplex_map:
                db_refs['FPLX'] = famplex_map[famplex_key]
                if not name:
                    name = db_refs['FPLX']  # Set agent name to Famplex name if
                                            # the Uniprot name is not available
            elif not name:
                # We neither have a Uniprot nor Famplex grounding
                logger.info('Have neither a Uniprot nor Famplex grounding ' + \
                            'for ' + c)
                if not name:
                    name = db_refs['SIGNOR']  # Set the agent name to the
                                              # Signor name if neither the
                                              # Uniprot nor Famplex names are
                                              # available
            assert(name is not None)
            agents.append(Agent(name, db_refs=db_refs))
        return agents
예제 #7
0
파일: test_signor.py 프로젝트: budakn/INDRA
def test_get_agent():
    """Check SignorProcessor._get_agent against hand-built reference agents."""
    # An empty Signor processor is sufficient to call _get_agent
    processor = SignorProcessor([])

    # Protein/gene
    expected = Agent('RELA',
                     db_refs={'HGNC': hgnc_client.get_hgnc_id('RELA'),
                              'UP': 'Q04206'})
    actual = processor._get_agent(test_row.ENTITYA, test_row.TYPEA,
                                  test_row.IDA, test_row.DATABASEA)
    assert expected.matches(actual)

    # Chemical
    expected = Agent('AZD1480', db_refs={'PUBCHEM': 'CID:16659841'})
    actual = processor._get_agent('AZD1480', 'chemical', 'CID:16659841',
                                  'PUBCHEM')
    assert expected.matches(actual)

    # Signor phenotype
    expected = Agent('Cell cycle progr.', db_refs={'SIGNOR': 'SIGNOR-PH42'})
    actual = processor._get_agent('Cell cycle progr.', 'phenotype',
                                  'SIGNOR-PH42', 'SIGNOR')
    assert expected.matches(actual)

    # Ungrounded -- couldn't find a real example in the dataset
    expected = Agent('Foobar', db_refs={})
    for entity_type in ('pathway', 'antibody'):
        actual = processor._get_agent('Foobar', entity_type, None, None)
        assert expected.matches(actual)
예제 #8
0
 def _add_node(self, agent):
     node_key = agent.name
     node_id = self._existing_nodes.get(node_key)
     if node_id is not None:
         return node_id
     db_refs = _get_db_refs(agent)
     node_id = self._get_new_id()
     self._existing_nodes[node_key] = node_id
     node_name = agent.name
     node_name = node_name.replace('_', ' ')
     expanded_families = expander.get_children(agent, ns_filter='HGNC')
     members = {}
     for member in expanded_families:
         hgnc_symbol = member[1]
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
         if hgnc_id:
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             member_agent = Agent(hgnc_symbol,
                                  db_refs={'HGNC': hgnc_id,
                                           'UP': up_id})
             member_db_refs = _get_db_refs(member_agent)
         else:
             member_db_refs = {}
         members[member[1]] = {
                 'mutation': None,
                 'expression': None,
                 'db_refs': member_db_refs
                 }
     node = {'data': {'id': node_id, 'name': node_name,
                      'db_refs': db_refs, 'parent': '',
                      'members': members}}
     self._nodes.append(node)
     return node_id
예제 #9
0
def fix_stmts(stmts):
    """Clean up statements in place and return the ones worth keeping.

    For every statement, evidence PMIDs carrying a literal 'PMID' prefix
    are reduced to the part after the prefix. RegulateActivity statements
    without a subject and Translocation statements without any location
    are dropped. Agents with a UP grounding are renamed to the UniProt
    gene name and, for human proteins, receive the matching HGNC ID.

    Parameters
    ----------
    stmts : list
        Statements to process; their evidence and agents are modified.

    Returns
    -------
    list
        The statements that were not dropped, in the original order.
    """
    pmid_prefix = 'PMID'
    new_stmts = []
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.pmid and ev.pmid.startswith(pmid_prefix):
                # Strip the prefix itself; slicing from the end would cut
                # off the last digits of the identifier instead
                ev.pmid = ev.pmid[len(pmid_prefix):]
        # Skip if no subject
        if isinstance(stmt, RegulateActivity) and stmt.subj is None:
            continue
        # Skip if no locations
        if isinstance(stmt, Translocation) and \
                not (stmt.from_location or stmt.to_location):
            continue
        for agent in stmt.agent_list():
            if agent is None:
                continue
            upid = agent.db_refs.get('UP')
            if not upid:
                continue
            gene_name = uniprot_client.get_gene_name(upid)
            if not gene_name:
                continue
            agent.name = gene_name
            # HGNC IDs are only looked up for human proteins
            if uniprot_client.is_human(upid):
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
        new_stmts.append(stmt)
    return new_stmts
예제 #10
0
def get_grounding_from_name(name):
    """Return grounding given an agent name.

    The name is tried, in this order, as an HGNC gene symbol, as a key of
    the grounding map ``gm``, as a ChEBI name and as a MeSH name. The
    first hit is returned as a ``(namespace, identifier)`` tuple; None is
    returned when nothing matches.
    """
    # See if it's a gene name
    hgnc_id = get_hgnc_id(name)
    if hgnc_id:
        return ('HGNC', hgnc_id)

    # Check if it's in the grounding map; the first non-TEXT entry wins
    try:
        refs = gm[name]
    except KeyError:
        refs = None
    if isinstance(refs, dict):
        for db_ns, db_id in refs.items():
            if db_ns != 'TEXT':
                return (db_ns, db_id)

    # Otherwise search by name in ChEBI, then in MeSH
    chebi_id = get_chebi_id_from_name(name)
    if chebi_id:
        return ('CHEBI', f'CHEBI:{chebi_id}')

    mesh_id, _ = get_mesh_id_name(name)
    return ('MESH', mesh_id) if mesh_id else None
def get_mappings() -> Iterable[PredictionTuple]:
    """Iterate high-confidence lexical mappings between MeSH and UniProt human proteins.

    A MeSH name yields a mapping when it matches ``MESH_PROTEIN_RE``, the
    first captured group resolves to an HGNC ID, and that ID has a single
    UniProt ID (no comma in the returned value).
    """
    provenance_url = get_script_url(__file__)
    for mesh_name, mesh_id in mesh_client.mesh_name_to_id.items():
        matched = MESH_PROTEIN_RE.match(mesh_name)
        if not matched:
            continue
        gene_name = matched.group(1)
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
        # Skip missing and multi-valued (comma separated) UniProt entries
        if not uniprot_id or "," in uniprot_id:
            continue
        yield PredictionTuple(
            "mesh",
            mesh_id,
            mesh_name,
            "skos:exactMatch",
            "uniprot",
            uniprot_id,
            gene_name,
            "lexical",
            0.999,
            provenance_url,
        )
예제 #12
0
    def get_relevant_nodes(self, pct_heat_threshold):
        """Return a list of the relevant nodes in the prior.

        Heat diffusion is applied to the prior network based on initial
        heat on nodes that are mutated according to patient statistics.

        Parameters
        ----------
        pct_heat_threshold :
            Percentile (as passed to ``np.percentile``) of the diffused
            heats used as the cutoff.

        Returns
        -------
        list
            Nodes whose diffused heat is at least the cutoff, ordered by
            decreasing heat.
        """
        logger.info('Setting heat for relevant nodes in prior network')
        # Initial heat per node key, only for genes with a truthy value
        initial_heat = {
            'HGNC:%s' % get_hgnc_id(gene_name): muts
            for gene_name, muts in self.norm_mutations.items()
            if muts
        }
        heats = np.zeros(len(self.prior_graph))
        for idx, node in enumerate(self.prior_graph.nodes()):
            if node in initial_heat:
                heats[idx] = initial_heat[node]

        logger.info('Calculating Laplacian matrix')
        laplacian = nx.normalized_laplacian_matrix(self.prior_graph,
                                                   weight='weight')
        logger.info('Diffusing heat')
        gamma = -0.1
        diffused = expm_multiply(gamma * laplacian, heats)
        heat_thresh = np.percentile(diffused, pct_heat_threshold)
        logger.info('Filtering to relevant nodes with heat threshold %.2f '
                    '(%s percentile)' % (heat_thresh, pct_heat_threshold))
        # Pair the nodes with their heats and order them hottest first
        ranked = sorted(zip(self.prior_graph.nodes(), diffused),
                        key=lambda pair: pair[1], reverse=True)
        return [node for node, heat in ranked if heat >= heat_thresh]
예제 #13
0
파일: bot.py 프로젝트: pagreene/indrabot
def get_grounding_from_name(name):
    """Return a ``(namespace, identifier)`` grounding for a name.

    The name is tried as an HGNC gene symbol, then as a key of the
    grounding map ``gm``, and finally sent to TRIPS, from whose first term
    an HGNC or FPLX grounding is taken. ``('TEXT', name)`` is returned
    when none of these gives a result or the TRIPS lookup raises.
    """
    # See if it's a gene name
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        return ('HGNC', hgnc_id)

    # Check if it's in the grounding map; the first non-TEXT entry wins
    try:
        refs = gm[name]
    except KeyError:
        refs = None
    if isinstance(refs, dict):
        for db_ns, db_id in refs.items():
            if db_ns != 'TEXT':
                return (db_ns, db_id)

    # If none of these, we try TRIPS
    try:
        print('Looking up %s with TRIPS' % name)
        tp = trips.process_text(name, service_endpoint='drum-dev')
        terms = tp.tree.findall('TERM')
        if terms:
            agent = tp._get_agent_by_id(terms[0].attrib['id'], None)
            for db_ns in ('HGNC', 'FPLX'):
                if db_ns in agent.db_refs:
                    return (db_ns, agent.db_refs[db_ns])
    except Exception as e:
        # Any failure of the lookup is reported and treated as "not found"
        print(e)
    return ('TEXT', name)
예제 #14
0
def get_all_gene_names(data):
    """Return the sorted valid gene names listed for the antibodies.

    Parameters
    ----------
    data : dict
        ``data['antibody']`` must provide 'Gene Name' and 'UniProt ID'
        entries of equal length, each element being a comma-separated
        string.

    Returns
    -------
    list
        Sorted gene names, taken both from the given names and from the
        names looked up for the UniProt IDs, excluding every given name
        that has no HGNC ID.
    """
    gene_names = data['antibody']['Gene Name']
    uniprot_ids = data['antibody']['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A failed lookup gives None, which join() cannot handle, so
            # every entry is rendered through str()
            print('- Genes from uniprot IDs: %s'
                  % ','.join(str(x) for x in names_from_ids))
        # Add both the gene names and the gene names derived from UniProt
        # IDs; failed lookups are left out so that the final sort never
        # has to compare None with a string
        all_genes.update(names)
        all_genes.update(x for x in names_from_ids if x)
    # Finally remove the invalid gene names
    return sorted(all_genes - invalid_genes)
예제 #15
0
def test_get_agent():
    """Check SignorProcessor._get_agent against hand-built reference agents."""
    # An empty Signor processor is sufficient to call _get_agent
    processor = SignorProcessor([])

    # Protein/gene
    expected = Agent('RELA',
                     db_refs={'HGNC': hgnc_client.get_hgnc_id('RELA'),
                              'UP': 'Q04206'})
    actual = processor._get_agent(test_row.ENTITYA, test_row.TYPEA,
                                  test_row.IDA, test_row.DATABASEA)
    assert expected.matches(actual)

    # Chemical: the reference agent holds the bare number while the call
    # passes the 'CID: '-prefixed identifier
    expected = Agent('AZD1480', db_refs={'PUBCHEM': '16659841'})
    actual = processor._get_agent('AZD1480', 'chemical', 'CID: 16659841',
                                  'PUBCHEM')
    assert expected.matches(actual)

    # Signor phenotype
    expected = Agent('Cell cycle progr.', db_refs={'SIGNOR': 'SIGNOR-PH42'})
    actual = processor._get_agent('Cell cycle progr.', 'phenotype',
                                  'SIGNOR-PH42', 'SIGNOR')
    assert expected.matches(actual)

    # Ungrounded -- couldn't find a real example in the dataset
    expected = Agent('Foobar', db_refs={})
    for entity_type in ('pathway', 'antibody'):
        actual = processor._get_agent('Foobar', entity_type, None, None)
        assert expected.matches(actual)
예제 #16
0
def agent_from_gene_name(gene_name):
    """Return an Agent based on a gene name.

    The agent is grounded to HGNC when the name has an HGNC ID, and
    additionally to UP when that ID has a UniProt ID. Unresolved
    namespaces are left out of ``db_refs`` instead of being stored with a
    None value, and no UniProt lookup is attempted without an HGNC ID.
    """
    db_refs = {}
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(gene_name, db_refs=db_refs)
예제 #17
0
 def rename_agents(self, stmts):
     """Return a deep copy of the statements with standardized agent names.

     An agent with a BE (Bioentities) grounding is named after that ID.
     Otherwise, an agent with a UP grounding is named after the UniProt
     gene name (looked up without web fallback) and gets the matching
     HGNC ID if there is one. All other agents, including those that only
     have a TEXT entry, keep their name. The input is not modified.
     """
     mapped_stmts = deepcopy(stmts)
     for stmt in mapped_stmts:
         for agent in stmt.agent_list():
             if agent is None:
                 continue
             # A Bioentities ID is preferred for the name
             be_id = agent.db_refs.get('BE')
             if be_id:
                 agent.name = be_id
                 continue
             # Next choice is the gene name belonging to the UniProt ID
             up_id = agent.db_refs.get('UP')
             if not up_id:
                 continue
             gene_name = uniprot_client.get_gene_name(up_id,
                                                      web_fallback=False)
             if not gene_name:
                 # Lookup failed: the agent is left unchanged
                 continue
             agent.name = gene_name
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 agent.db_refs['HGNC'] = hgnc_id
     return mapped_stmts
예제 #18
0
def agent_from_gene_name(name):
    """Return a grounded Agent based on a gene name.

    The agent is grounded to HGNC when the name has an HGNC ID, and
    additionally to UP when that ID has a UniProt ID. Unresolved
    namespaces are left out of ``db_refs`` instead of being stored with a
    None value, and no UniProt lookup is attempted without an HGNC ID.
    """
    agent = Agent(name)
    db_refs = {}
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id:
            db_refs['UP'] = uniprot_id
    agent.db_refs = db_refs
    return agent
예제 #19
0
 def rename_agents(self, stmts):
     """Return a deep copy of the statements with standardized agent names.

     An agent with a BE (Bioentities) grounding is named after that ID.
     Otherwise, an agent with a UP grounding is named after the UniProt
     gene name (looked up without web fallback) and gets the matching
     HGNC ID if there is one. All other agents, including those that only
     have a TEXT entry, keep their name. The input is not modified.
     """
     mapped_stmts = deepcopy(stmts)
     for stmt in mapped_stmts:
         for agent in stmt.agent_list():
             if agent is None:
                 continue
             # A Bioentities ID is preferred for the name
             be_id = agent.db_refs.get('BE')
             if be_id:
                 agent.name = be_id
                 continue
             # Next choice is the gene name belonging to the UniProt ID
             up_id = agent.db_refs.get('UP')
             if not up_id:
                 continue
             gene_name = uniprot_client.get_gene_name(up_id,
                                                      web_fallback=False)
             if not gene_name:
                 # Lookup failed: the agent is left unchanged
                 continue
             agent.name = gene_name
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 agent.db_refs['HGNC'] = hgnc_id
     return mapped_stmts
예제 #20
0
    def _get_complex_agents(self, complex_id):
        """Returns a list of agents corresponding to each of the constituents
        in a SIGNOR complex."""
        agents = []
        components = self._recursively_lookup_complex(complex_id)

        for c in components:
            db_refs = {}
            name = uniprot_client.get_gene_name(c)
            if name is None:
                db_refs['SIGNOR'] = c
            else:
                db_refs['UP'] = c
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id

            famplex_key = ('SIGNOR', c)
            if famplex_key in famplex_map:
                db_refs['FPLX'] = famplex_map[famplex_key]
                if not name:
                    name = db_refs['FPLX']  # Set agent name to Famplex name if
                    # the Uniprot name is not available
            elif not name:
                # We neither have a Uniprot nor Famplex grounding
                logger.info('Have neither a Uniprot nor Famplex grounding ' + \
                            'for ' + c)
                if not name:
                    name = db_refs['SIGNOR']  # Set the agent name to the
                    # Signor name if neither the
                    # Uniprot nor Famplex names are
                    # available
            assert (name is not None)
            agents.append(Agent(name, db_refs=db_refs))
        return agents
예제 #21
0
파일: processor.py 프로젝트: jmuhlich/indra
 def _get_db_refs(bpe):
     """Return the db_refs dict for a BioPAX physical entity.

     Proteins get HGNC and UP entries, with a missing one derived from the
     other where possible (HGNC is only derived for human UniProt IDs).
     Small molecules get a CHEBI entry. Any other entity gets whichever
     of CHEBI, HGNC and UP can be read from it. Namespaces whose
     identifier is None are omitted.
     """
     if _is_protein(bpe):
         hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
         uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
         # Handle missing HGNC/UP ids; at most one of the two can apply
         if hgnc_id and not uniprot_id:
             uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
         elif uniprot_id and not hgnc_id \
                 and uniprot_client.is_human(uniprot_id):
             hgnc_name = uniprot_client.get_gene_name(uniprot_id, False)
             if hgnc_name:
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
         candidates = [('HGNC', hgnc_id), ('UP', uniprot_id)]
     elif _is_small_molecule(bpe):
         candidates = [('CHEBI', BiopaxProcessor._get_chebi_id(bpe))]
     else:
         candidates = [('CHEBI', BiopaxProcessor._get_chebi_id(bpe)),
                       ('HGNC', BiopaxProcessor._get_hgnc_id(bpe)),
                       ('UP', BiopaxProcessor._get_uniprot_id(bpe))]
     return {db_ns: db_id for db_ns, db_id in candidates
             if db_id is not None}
예제 #22
0
def update_kinases():
    """Download the kinase table from UniProt and add extra kinases to it.

    The query result is saved as ``kinases.tsv`` under ``path``. A fixed
    list of additional kinases is then looked up via HGNC/UniProt and
    added as new rows before the file is written back in place.
    """
    logger.info('--Updating kinase list------')
    url = ('http://www.uniprot.org/uniprot/?'
           'sort=entry_name&desc=no&compress=no&query=database:(type:'
           'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:'
           '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no'
           '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name')
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = [
        'PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2', 'PDK3', 'PDK4',
        'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE'
    ]
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({
            'Entry': up_id,
            'Gene names  (primary )': kinase,
            'Organism ID': '9606',
            'Entry name': up_mnemonic,
        })
    # DataFrame.append no longer exists in pandas 2.x; a single concat adds
    # all rows at once and also avoids copying the frame once per kinase
    df = pandas.concat([df, pandas.DataFrame(extra_rows)], ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
예제 #23
0
    def _extract_protein(self, line):
        # Extract key information from the lines.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Get available db-refs.
        db_refs = {}
        if prot_id:
            db_refs.update(self._lc.get_protein_refs(prot_id))
            # Since the resource only gives us an UP ID (not HGNC), we
            # try to get that and standardize the name to the gene name
            up_id = db_refs.get('UP')
            if up_id:
                gene_name = uniprot_client.get_gene_name(up_id)
                if gene_name:
                    prot_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
        # In some cases lines are missing protein information in which
        # case we return None
        else:
            return None

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)
예제 #24
0
 def test_query_with_other(self):
     """Test that we can get an ActiveForm.

     The query names a single agent without a role; its HGNC ID is
     expected in slot 0 of the tuples passed to ``_check_stmt_agents``.
     """
     resp = self.__check_good_statement_query(agent='MAPK1',
                                              type='ActiveForm')
     mapk1_id = hgnc_client.get_hgnc_id('MAPK1')
     _check_stmt_agents(resp, agents=[(0, 'HGNC', mapk1_id)])
예제 #25
0
 def test_object_only_query(self):
     """Test whether we can get an object only statement.

     Only the object is given; its HGNC ID is expected in slot 1 of the
     tuples passed to ``_check_stmt_agents``.
     """
     resp = self.__check_good_statement_query(object='GLUL',
                                              type='IncreaseAmount')
     glul_id = hgnc_client.get_hgnc_id('GLUL')
     _check_stmt_agents(resp, agents=[(1, 'HGNC', glul_id)])
예제 #26
0
def test_active_form():
    """Check that each ActiveForm statement assembles into a 2-node graph."""
    phospho_t185 = ModCondition('phosphorylation', 'T', '185')
    phospho_y187 = ModCondition('phosphorylation', 'Y', '187')

    kras_g12v = Agent('KRAS', mutations=[MutCondition('12', 'G', 'V')],
                      db_refs={'HGNC': '6407'})
    # Singly and doubly phosphorylated forms of the same protein
    map2k1_p = Agent('MAP2K1', mods=[phospho_t185],
                     db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})
    map2k1_pp = Agent('MAP2K1', mods=[phospho_t185, phospho_y187],
                      db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})

    active_forms = [
        ActiveForm(kras_g12v, 'gtpbound', True),
        ActiveForm(map2k1_p, 'kinase', True),
        ActiveForm(map2k1_pp, 'kinase', True),
    ]
    for stmt in active_forms:
        # Every statement is assembled on its own
        belgraph = pa.PybelAssembler([stmt]).make_model()
        assert len(belgraph) == 2
예제 #27
0
 def test_bad_camel(self):
     """Test that a type can be poorly formatted and resolve correctly.

     The statement type is given with scrambled capitalization.
     """
     resp = self.__check_good_statement_query(agent='MAPK1',
                                              type='acTivefOrm')
     mapk1_id = hgnc_client.get_hgnc_id('MAPK1')
     _check_stmt_agents(resp, agents=[(0, 'HGNC', mapk1_id)])
예제 #28
0
 def test_query_with_hgnc_symbol_ns(self):
     """Test specifying HGNC-SYMBOL as a namespace.

     The subject is written as ``<symbol>@HGNC-SYMBOL``; the HGNC ID of
     that symbol is expected in slot 0.
     """
     resp = self.__check_good_statement_query(subject='MAPK1@HGNC-SYMBOL',
                                              type='Phosphorylation')
     mapk1_id = hgnc_client.get_hgnc_id('MAPK1')
     _check_stmt_agents(resp, agents=[(0, 'HGNC', mapk1_id)])
예제 #29
0
def test_active_form():
    """Check that each ActiveForm statement assembles into a 2-node graph."""
    phospho_t185 = ModCondition('phosphorylation', 'T', '185')
    phospho_y187 = ModCondition('phosphorylation', 'Y', '187')

    kras_g12v = Agent('KRAS', mutations=[MutCondition('12', 'G', 'V')],
                      db_refs={'HGNC': '6407'})
    # Singly and doubly phosphorylated forms of the same protein
    map2k1_p = Agent('MAP2K1', mods=[phospho_t185],
                     db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})
    map2k1_pp = Agent('MAP2K1', mods=[phospho_t185, phospho_y187],
                      db_refs={'HGNC': hgnc_client.get_hgnc_id('MAP2K1')})

    active_forms = [
        ActiveForm(kras_g12v, 'gtpbound', True),
        ActiveForm(map2k1_p, 'kinase', True),
        ActiveForm(map2k1_pp, 'kinase', True),
    ]
    for stmt in active_forms:
        # Every statement is assembled on its own
        belgraph = pa.PybelAssembler([stmt]).make_model()
        assert len(belgraph) == 2
예제 #30
0
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : dict
        Node data, used for the default name and for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. For namespaces other
        than HGNC and UP this is the name stored in ``node_data``. None if
        an HGNC or UP identifier cannot be resolved to a name.
    db_refs : dict
        The grounding for the given entity. None if the namespace is not
        handled or the identifier cannot be resolved to a name.
    """
    name = node_data.get(pc.NAME)
    db_refs = None
    if ns == 'HGNC':
        name = hgnc_client.get_hgnc_name(ident)
        if not name:
            return None, None
        db_refs = {'HGNC': ident}
        up_id = _get_up_id(ident)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(ident)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns == 'UP':
        db_refs = {'UP': ident}
        name = uniprot_client.get_gene_name(ident)
        if not name:
            return None, None
        # An HGNC ID is only looked up for human proteins
        if uniprot_client.is_human(ident):
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'MIRBASE':
        db_refs = {'MIRBASE': ident}
    elif ns in ('MGI', 'RGD', 'CHEBI', 'HMDB', 'MESH'):
        # These namespaces are passed through unchanged
        db_refs = {ns: ident}
    elif ns == 'PUBCHEM.COMPOUND':
        db_refs = {'PUBCHEM': ident}
    else:
        # Log the identifier that was passed in: node_data is accessed as
        # a dict above, and a plain dict has no `identifier` attribute,
        # so reading it here would raise instead of logging
        logger.info("Unhandled namespace %s with name %s and "
                    "identifier %s (%s)." % (ns, name, ident, node_data))
    return name, db_refs
예제 #31
0
def get_target_agent(target):
    """Return an Agent for a target gene given its HGNC symbol.

    Parameters
    ----------
    target : str
        The HGNC symbol of the target gene; also used as the Agent name.

    Returns
    -------
    Agent
        An Agent named after the symbol. Its db_refs get an 'HGNC' entry
        only if the symbol resolves to an HGNC ID, and a 'UP' entry only if
        that HGNC ID resolves to a UniProt ID.
    """
    db_refs = {}
    hgnc_id = hgnc_client.get_hgnc_id(target)
    # Failed lookups are left out of db_refs instead of being stored as
    # None, and UniProt is only queried with a real HGNC ID.
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
    return Agent(target, db_refs=db_refs)
예제 #32
0
def get_gene_agent(name, gene_entrez_id):
    """Return an Agent for a gene given its name and Entrez gene ID.

    The Entrez ID is always stored under 'EGID'; an 'HGNC' entry is added
    when `name` resolves to an HGNC ID. The grounding is then passed through
    standardize_name_db_refs, and the standardized name replaces `name`
    when one is returned.
    """
    hgnc_id = hgnc_client.get_hgnc_id(name)
    grounding = {'EGID': gene_entrez_id}
    if hgnc_id:
        grounding['HGNC'] = hgnc_id
    standard_name, grounding = standardize_name_db_refs(grounding)
    agent_name = standard_name if standard_name else name
    return Agent(agent_name, db_refs=grounding)
def validate(db_ns, db_id):
    """Validate identifier, accepting HGNC name or ID"""
    # An HGNC entry that is not purely numeric is treated as a gene symbol
    # and is valid exactly when it resolves to an HGNC ID.
    if db_ns == 'HGNC' and not db_id.isdigit():
        return get_hgnc_id(db_id) is not None
    return validate_id(db_ns, db_id)
예제 #34
0
def get_ras220_hgnc_ids():
    """Return the HGNC ID lookups for the RAS 220 gene table.

    Reads the tab-delimited file at a path relative to the current working
    directory and looks up the first column of every row as an HGNC symbol.
    The result has one entry per row, in file order, holding whatever
    hgnc_client.get_hgnc_id returned for that row.
    """
    table_path = '../../indra/data/ras_pathway_proteins.csv'
    with open(table_path, 'r') as table:
        return [hgnc_client.get_hgnc_id(record[0])
                for record in csv.reader(table, delimiter='\t')]
예제 #35
0
 def _get_agent_from_gene_name(gene_name):
     """Return an Agent named `gene_name` with HGNC/UP grounding if found."""
     grounding = {}
     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
     # UniProt is only looked up once the symbol resolved to an HGNC ID
     uniprot_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
     if hgnc_id:
         grounding['HGNC'] = hgnc_id
     if uniprot_id:
         grounding['UP'] = uniprot_id
     return Agent(gene_name, db_refs=grounding)
예제 #36
0
def get_dark_kinase_hgnc_ids():
    """Return the HGNC ID lookups for the dark kinase table.

    Reads the comma-delimited file at a path relative to the current
    working directory and looks up the second column of every row as an
    HGNC symbol. The result has one entry per row, in file order, holding
    whatever hgnc_client.get_hgnc_id returned for that row.
    """
    table_path = '../../indra_analysis/Table_005_IDG_dark_kinome.csv'
    with open(table_path, 'r') as table:
        return [hgnc_client.get_hgnc_id(record[1])
                for record in csv.reader(table)]
예제 #37
0
 def get_agent(concept, entity):
     """Return an Agent for a concept URI, grounded to HGNC when possible.

     The Agent name is taken from `concept` via gene_name_from_uri. An
     'HGNC' grounding is added only when the namespace of `entity` is
     'HGNC' and the name resolves to an HGNC ID.
     """
     name = gene_name_from_uri(concept)
     grounding = {}
     if namespace_from_uri(entity) == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is not None:
             grounding['HGNC'] = hgnc_id
     return Agent(name, db_refs=grounding)
예제 #38
0
파일: api.py 프로젝트: johnbachman/indra
 def _get_agent_ref(agent):
     """Get the preferred ref for an agent for db web api."""
     if agent is None:
         return None
     ag_hgnc_id = hgnc_client.get_hgnc_id(agent.name)
     if ag_hgnc_id is not None:
         return ag_hgnc_id + "@HGNC"
     db_refs = agent.db_refs
     for namespace in ['HGNC', 'FPLX', 'CHEBI', 'TEXT']:
         if namespace in db_refs.keys():
             return '%s@%s' % (db_refs[namespace], namespace)
     return '%s@%s' % (agent.name, 'TEXT')
예제 #39
0
    def rename_agents(self, stmts):
        """Return a deep copy of the statements with standardized agent names.

        The input list and its statements are left untouched. For each
        non-None agent of each copied statement:

        - if its db_refs contain a FamPlex ('FPLX') ID, that ID becomes the
          agent name;
        - otherwise, if its db_refs contain a Uniprot ('UP') ID and a gene
          name can be found for it (without web fallback), the gene name
          becomes the agent name and, when the gene name resolves to an
          HGNC ID, that ID is stored under 'HGNC' in the db_refs;
        - otherwise the agent keeps its original name.

        Parameters
        ----------
        stmts : list of :py:class:`indra.statements.Statement`
            List of statements whose Agents need their names updated.

        Returns
        -------
        mapped_stmts : list of :py:class:`indra.statements.Statement`
            A new list of Statements with updated Agent names
        """
        mapped_stmts = deepcopy(stmts)
        for stmt in mapped_stmts:
            for agent in stmt.agent_list():
                if agent is None:
                    continue
                # FamPlex grounding takes precedence for naming
                fplx_id = agent.db_refs.get('FPLX')
                if fplx_id:
                    agent.name = fplx_id
                    continue
                up_id = agent.db_refs.get('UP')
                if not up_id:
                    continue
                gene_name = uniprot_client.get_gene_name(up_id,
                                                         web_fallback=False)
                # Without a gene name the agent is left as it is
                if not gene_name:
                    continue
                agent.name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
        return mapped_stmts
예제 #40
0
 def _initialize_node_agents(self):
     """Initialize internal dicts containing node information.

     Fills self._node_names and self._node_agents, keyed by each node's
     '@id'. Nodes with a 'UP' alias are named after the UniProt gene name;
     all other nodes use their 'n' entry and are grounded through HGNC.
     Nodes whose name has no HGNC ID get an Agent only when
     self.require_grounding is false, and are reported in one log message.
     """
     ungrounded_names = []
     for node in _get_dict_from_list('nodes', self.cx):
         node_id = node['@id']
         up_id = self.get_aliases(node).get('UP')
         if up_id:
             gene_name = uniprot_client.get_gene_name(up_id)
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             agent = Agent(gene_name,
                           db_refs={'UP': up_id, 'HGNC': hgnc_id,
                                    'TEXT': gene_name})
             self._node_names[node_id] = gene_name
             self._node_agents[node_id] = agent
             continue
         node_name = node['n']
         self._node_names[node_id] = node_name
         hgnc_id = hgnc_client.get_hgnc_id(node_name)
         grounding = {'TEXT': node_name}
         if hgnc_id:
             grounding['HGNC'] = hgnc_id
             # It's possible that a valid HGNC ID will not have a
             # Uniprot ID, as in the case of HOTAIR (HOX transcript
             # antisense RNA, HGNC:33510)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 grounding['UP'] = up_id
             self._node_agents[node_id] = Agent(node_name, db_refs=grounding)
             continue
         # No HGNC ID: keep a TEXT-only Agent unless grounding is required
         if not self.require_grounding:
             self._node_agents[node_id] = Agent(node_name, db_refs=grounding)
         ungrounded_names.append(node_name)
     if ungrounded_names:
         verb = 'Skipped' if self.require_grounding else 'Included'
         logger.info('%s invalid gene symbols: %s' %
                     (verb, ', '.join(ungrounded_names)))
예제 #41
0
def get_grounded_agent(gene_name):
    """Return a grounded Agent based on an HGNC symbol.

    The symbol as passed in is kept under 'TEXT'. If hgnc_map has an entry
    for it, the mapped symbol is used for the HGNC lookup and as the Agent
    name.
    """
    grounding = {'TEXT': gene_name}
    symbol = hgnc_map[gene_name] if gene_name in hgnc_map else gene_name
    hgnc_id = hgnc_client.get_hgnc_id(symbol)
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id) if hgnc_id else None
    if hgnc_id:
        grounding['HGNC'] = hgnc_id
    if uniprot_id:
        grounding['UP'] = uniprot_id
    return Agent(symbol, db_refs=grounding)
예제 #42
0
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return a single Agent for a SIGNOR entity.

     A SIGNOR complex (an ID found in self.complex_map) is returned as its
     first component Agent with the remaining components attached as
     BoundConditions.

     Parameters
     ----------
     ent_name :
         The entity name from SIGNOR; used as the Agent name whenever no
         standardized name is available.
     ent_type :
         The SIGNOR entity type, e.g. 'proteinfamily'.
     id : str
         The identifier of the entity in `database`.
     database :
         The name of the database the identifier comes from.

     Returns
     -------
     Agent
         The Agent representing the entity.

     Raises
     ------
     KeyError
         If (ent_type, database) is not a key of _type_db_map.
     ValueError
         If the entity has a grounding type that is not handled earlier
         and `database` is not PUBCHEM, SIGNOR, ChEBI or miRBase.
     """
     if database == 'SIGNOR' and id in self.complex_map:
         agents = self._get_complex_agents(id)
         # Return the first agent with the remaining agents as a bound
         # condition
         agent = agents[0]
         agent.bound_conditions = \
                 [BoundCondition(a, True) for a in agents[1:]]
         return agent

     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         db_refs = {'UP': id}
         name = uniprot_client.get_gene_name(id)
         if name:
             hgnc_id = hgnc_client.get_hgnc_id(name)
             if hgnc_id:
                 db_refs['HGNC'] = hgnc_id
         else:
             # No gene name for this Uniprot ID: use the SIGNOR name
             # rather than creating an Agent without a name
             name = ent_name
     # Map SIGNOR protein families to FamPlex families
     elif ent_type == 'proteinfamily':
         db_refs = {database: id} # Keep the SIGNOR family ID in db_refs
         key = (database, id)
         # Use SIGNOR name unless we have a mapping in FamPlex
         name = ent_name
         famplex_id = famplex_map.get(key)
         if famplex_id is None:
             logger.info('Could not find %s in FamPlex map' %
                         str(key))
         else:
             db_refs['FPLX'] = famplex_id
             name = famplex_id
     # Other possible groundings are PUBCHEM, SIGNOR, etc.
     elif gnd_type is not None:
         if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase'):
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # We take off the CID: prefix plus fix an issue with
             # SIGNOR's format in which it leaves extra spaces around
             # the ID, as in 'CID: 923'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
         name = ent_name
     # If no grounding, include as an untyped/ungrounded node
     else:
         name = ent_name
         db_refs = {}
     return Agent(name, db_refs=db_refs)
예제 #43
0
def get_gene_agents(gene_names):
    """Return Agents for the given HGNC gene symbols.

    Symbols that do not resolve to an HGNC ID are logged as a warning and
    left out of the result. Each returned Agent is named after its symbol
    and grounded to HGNC, plus UP when the HGNC ID maps to a UniProt ID.
    """
    agents = []
    for symbol in gene_names:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id:
            grounding = {'HGNC': hgnc_id}
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                grounding['UP'] = uniprot_id
            agents.append(Agent(symbol, db_refs=grounding))
        else:
            logger.warning('Invalid HGNC gene symbol: %s' % symbol)
    return agents
예제 #44
0
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent named `ag_id` and grounded in name space `ag_ns`.

    `ag_id` is always stored under 'TEXT'. For the 'HGNC' name space it is
    looked up as a gene symbol, adding 'HGNC' and, if available, 'UP'
    entries. For any other name space a non-None `ag_id` is stored under
    `ag_ns` unchanged.
    """
    grounding = {'TEXT': ag_id}
    if ag_ns != 'HGNC':
        if ag_id is not None:
            grounding[ag_ns] = ag_id
        return Agent(ag_id, db_refs=grounding)
    hgnc_id = hgnc_client.get_hgnc_id(ag_id)
    if hgnc_id is not None:
        grounding['HGNC'] = hgnc_id
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id is not None:
            grounding['UP'] = uniprot_id
    return Agent(ag_id, db_refs=grounding)
예제 #45
0
def _get_agent_ref(agent):
    """Get the preferred ref for an agent for db web api."""
    if agent is None:
        return None

    # TODO: This will no longer be needed when the database is refreshed.
    ag_hgnc_id = hgnc_client.get_hgnc_id(agent.name)
    if ag_hgnc_id is not None:
        return ag_hgnc_id + "@HGNC"
    db_refs = agent.db_refs
    for namespace in ['HGNC', 'FPLX', 'CHEBI', 'TEXT']:
        if namespace in db_refs.keys():
            return '%s@%s' % (db_refs[namespace], namespace)
    return '%s@%s' % (agent.name, 'TEXT')
예제 #46
0
파일: __init__.py 프로젝트: jmuhlich/indra
def get_kinase_activities():
    """Return HasActivity statements for the kinases in kinases.tsv.

    The table is read from '../../resources/kinases.tsv' relative to this
    module. Its first line is skipped; every other line is stripped and
    must split into exactly four tab-separated fields, of which the first
    is used as the UniProt ID and the second as the HGNC name.

    Returns
    -------
    list
        One HasActivity(agent, 'kinase', True) statement per table row, in
        file order.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    kinase_file = os.path.join(module_dir, '../../resources/kinases.tsv')
    with open(kinase_file, 'rt') as fh:
        rows = [raw.strip() for raw in fh]
    kinases = []
    for row in rows[1:]:
        up_id, hgnc_name, _, _ = row.split('\t')
        grounding = {'UP': up_id,
                     'HGNC': hgnc_client.get_hgnc_id(hgnc_name)}
        kinases.append(Agent(hgnc_name, db_refs=grounding))
    from indra.statements import HasActivity
    return [HasActivity(kinase, 'kinase', True) for kinase in kinases]
예제 #47
0
def get_all_gene_names(data, out_file='prior_genes.txt'):
    """Return all gene names corresponding to all ABs.

    Gene names are collected from the antibody table (both the listed
    names and the names derived from the listed UniProt IDs), minus the
    names without an HGNC ID, and then extended with the unannotated
    antibody genes, the drug target genes and the RAS pathway genes.
    Invalid and inconsistent entries are reported with print.

    Parameters
    ----------
    data : dict
        Must have an 'antibody' entry that can be indexed by the columns
        'Protein Data ID', 'Gene Name' and 'UniProt ID' (presumably a
        pandas DataFrame - confirm against the caller).
    out_file : str
        Path of the file the gene names are written to, one name per
        line, UTF-8 encoded.

    Returns
    -------
    list of str
        The sorted, de-duplicated gene names.
    """
    filt = pandas.notnull(data['antibody']['Protein Data ID'])
    data_filt = data['antibody'][filt]
    gene_names = data_filt['Gene Name']
    uniprot_ids = data_filt['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A UniProt ID without a gene name yields None, which
            # str.join rejects, so stringify each entry for the report
            print('- Genes from uniprot IDs: %s' %
                  ','.join(str(n) for n in names_from_ids))
        # Add both the gene names and the gene names derived from UniProt
        # IDs; failed lookups (None) are not gene names and would break
        # the sort below
        all_genes.update(names)
        all_genes.update(n for n in names_from_ids if n is not None)
    # Finally remove the invalid gene names
    all_genes = list(all_genes.difference(invalid_genes))
    # Add the unannotated genes
    unannotated_ab_genes = get_unannotated_antibody_genes(data)
    all_genes += unannotated_ab_genes
    # Add drug target genes
    drug_targets = get_drug_targets()
    for targets in drug_targets.values():
        all_genes += targets
    # Add other important genes, for now, the RAS pathway
    all_genes += get_ras227_genes()
    all_genes = sorted(set(all_genes))
    print('%d genes in total' % len(all_genes))
    with open(out_file, 'wb') as fh:
        for gene in all_genes:
            fh.write(('%s\n' % gene).encode('utf-8'))
    return all_genes
예제 #48
0
def get_ids_for_gene(hgnc_name, **kwargs):
    """Get the curated set of articles for a gene in the Entrez database.

    Search parameters for the Gene database query can be passed in as
    keyword arguments; they are merged over the default 'db', 'retmode'
    and 'id' parameters.

    Parameters
    ----------
    hgnc_name : string
        The HGNC name of the gene. This is used to obtain the HGNC ID
        (using the hgnc_client module) and in turn used to obtain the Entrez
        ID associated with the gene. Entrez is then queried for that ID.

    Returns
    -------
    list
        The de-duplicated text of all PubMedId elements in the response,
        in no particular order. Empty if the request returned nothing or
        the response contains an ERROR element (which is logged).

    Raises
    ------
    ValueError
        If no HGNC ID is found for the name, or no Entrez ID is found for
        the HGNC ID.
    """
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
    if hgnc_id is None:
        raise ValueError('Invalid HGNC name.')
    entrez_id = hgnc_client.get_entrez_id(hgnc_id)
    if entrez_id is None:
        raise ValueError('Entrez ID not found in HGNC table.')
    # Query the Entrez Gene database; caller-supplied parameters win
    params = dict(db='gene', retmode='xml', id=entrez_id)
    params.update(kwargs)
    tree = send_request(pubmed_fetch, params)
    if tree is None:
        return []
    error = tree.find('ERROR')
    if error is not None:
        logger.error(error.text)
        return []
    pmid_terms = tree.findall('.//PubMedId')
    if pmid_terms is None:
        return []
    # Collect into a set to remove duplicate IDs
    return list(set(term.text for term in pmid_terms))
예제 #49
0
def update_kinases():
    """Refresh kinases.tsv and append a fixed list of additional kinases.

    The table is fetched from UniProt with save_from_http into
    'kinases.tsv' under the module-level `path`, read back with pandas,
    extended with one row per gene in the hard-coded list below, and
    written back to the same file as tab-separated text without an index.
    """
    logger.info('--Updating kinase list------')
    url = ('http://www.uniprot.org/uniprot/?'
           'sort=entry_name&desc=no&compress=no&query=database:(type:'
           'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:'
           '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no'
           '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name')
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    # Collect the extra rows first and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({'Entry': up_id,
                           'Gene names  (primary )': kinase,
                           'Organism ID': '9606',
                           'Entry name': up_mnemonic})
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
예제 #50
0
 def _add_node(self, agent, uuid=None):
     node_key = agent.name
     node_id = self._existing_nodes.get(node_key)
     # if the node already exists we do not want to add it again
     # we must however add its uuid
     if node_id is not None:
         # fetch the appropriate node
         n = [x for x in self._nodes if x['data']['id'] == node_id][0]
         uuid_list = n['data']['uuid_list']
         if uuid not in uuid_list:
             uuid_list.append(uuid)
         return node_id
     db_refs = _get_db_refs(agent)
     node_id = self._get_new_id()
     self._existing_nodes[node_key] = node_id
     node_name = agent.name
     node_name = node_name.replace('_', ' ')
     expanded_families = expander.get_children(agent, ns_filter='HGNC')
     members = {}
     for member in expanded_families:
         hgnc_symbol = member[1]
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
         if hgnc_id:
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             member_agent = Agent(hgnc_symbol,
                                  db_refs={'HGNC': hgnc_id,
                                           'UP': up_id})
             member_db_refs = _get_db_refs(member_agent)
         else:
             member_db_refs = {}
         members[member[1]] = {'db_refs': member_db_refs}
     node = {'data': {'id': node_id, 'name': node_name,
                      'db_refs': db_refs, 'parent': '',
                      'members': members, 'uuid_list': [uuid]}}
     self._nodes.append(node)
     return node_id
예제 #51
0
파일: processor.py 프로젝트: jmuhlich/indra
 def get_agent(concept, entity):
     """Return an Agent for a BEL term, grounded according to its namespace.

     The term is extracted from `concept` with term_from_uri and the
     namespace from `entity` with namespace_from_uri. The namespace decides
     which db_refs are set and which name the Agent gets:

     - 'HGNC': the term is looked up as a gene symbol, adding 'HGNC' and
       'UP' entries when found.
     - 'MGI', 'RGD': the term is stored under the namespace.
     - 'PFH', 'SFAM', 'NCH', 'SCOMP': the term is stored under the
       namespace; if bel_to_indra has a mapping, it is stored under 'BE',
       the term under 'TEXT', and the mapped name is used as Agent name.
     - 'CHEBI': the term is looked up in chebi_name_id.
     - 'EGID': the term is stored under 'EGID' and mapped to HGNC and
       UniProt when possible; unmapped IDs are named 'E' plus the term.

     Anything else is logged, printed, and returned as an ungrounded Agent.
     Failed lookups are logged as warnings.
     """
     name = term_from_uri(concept)
     namespace = namespace_from_uri(entity)
     db_refs = {}
     # The term itself is the name unless a branch below overrides it
     agent_name = name
     if namespace == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is None:
             logger.warning("Couldn't get HGNC ID for HGNC symbol %s" %
                            name)
         else:
             db_refs['HGNC'] = str(hgnc_id)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if not up_id:
                 logger.warning('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
             else:
                 db_refs['UP'] = up_id
     elif namespace in ('MGI', 'RGD'):
         db_refs[namespace] = name
     elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
         # Families and complexes are handled alike apart from the warning
         db_refs[namespace] = name
         indra_name = bel_to_indra.get(name)
         if indra_name is not None:
             db_refs['BE'] = indra_name
             db_refs['TEXT'] = name
             agent_name = indra_name
         else:
             if namespace in ('PFH', 'SFAM'):
                 msg = 'Could not find mapping for BEL family: %s' % name
             else:
                 msg = 'Could not find mapping for BEL complex: %s' % name
             logger.warning(msg)
     elif namespace == 'CHEBI':
         chebi_id = chebi_name_id.get(name)
         if not chebi_id:
             logger.warning('CHEBI name %s not found in map.' % name)
         else:
             db_refs['CHEBI'] = chebi_id
     elif namespace == 'EGID':
         hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
         db_refs['EGID'] = name
         if hgnc_id is None:
             logger.warning('Could not map EGID%s to HGNC.' % name)
             agent_name = 'E%s' % name
         else:
             db_refs['HGNC'] = str(hgnc_id)
             agent_name = hgnc_client.get_hgnc_name(hgnc_id)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if not up_id:
                 logger.warning('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
             else:
                 db_refs['UP'] = up_id
     else:
         logger.warning('Unhandled entity namespace: %s' % namespace)
         print('%s, %s' % (concept, entity))
     return Agent(agent_name, db_refs=db_refs)
예제 #52
0
def id(gene_name):
    """Return what hgnc_client.get_hgnc_id gives for the gene name."""
    # NOTE: this function shadows the builtin `id` in its module; the name
    # is kept because callers use it.
    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
    return hgnc_id
예제 #53
0
 def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
     """Reconcile the UP/HGNC entries of a grounding and assign it to an agent.

     `map_db_refs` is modified in place and becomes the agent's db_refs.
     Its 'HGNC' entry is expected to hold a gene symbol on input and is
     replaced by the HGNC ID when one is found.

     Parameters
     ----------
     agent : Agent
         The agent whose db_refs (and optionally name) are set.
     map_db_refs : dict
         The grounding map entry to standardize.
     do_rename : bool
         If True, rename the agent to its FPLX ID if there is one, else to
         the HGNC symbol from the map, else to the gene name obtained from
         Uniprot. If none is available the name is left unchanged.

     Raises
     ------
     ValueError
         If only an HGNC symbol is given and it has no HGNC ID; or if both
         entries are given and the Uniprot ID has no gene name or its gene
         name differs from the HGNC symbol.
     """
     gene_name = None
     up_id = map_db_refs.get('UP')
     hgnc_sym = map_db_refs.get('HGNC')
     if up_id and hgnc_sym:
         # Both given: the symbol has to agree with the Uniprot gene name
         gene_name = uniprot_client.get_gene_name(up_id)
         if not gene_name:
             raise ValueError('No gene name found for Uniprot '
                              'ID %s (expected %s)' %
                              (up_id, hgnc_sym))
         if gene_name != hgnc_sym:
             raise ValueError('Gene name %s for Uniprot ID '
                              '%s does not match HGNC '
                              'symbol %s given in grounding '
                              'map.' %
                              (gene_name, up_id, hgnc_sym))
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if hgnc_id:
             map_db_refs['HGNC'] = hgnc_id
         else:
             logger.error('No HGNC ID corresponding to gene '
                          'symbol %s in grounding map.' % hgnc_sym)
     elif up_id:
         # Only Uniprot given: try to add the HGNC ID of its gene name
         gene_name = uniprot_client.get_gene_name(up_id, False)
         if gene_name:
             hgnc_id = hgnc_client.get_hgnc_id(gene_name)
             if hgnc_id:
                 map_db_refs['HGNC'] = hgnc_id
     elif hgnc_sym:
         # Only the symbol given: override it with the HGNC ID, then try
         # to add the Uniprot ID for the gene
         hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
         if not hgnc_id:
             raise ValueError('No HGNC ID corresponding to gene '
                              'symbol %s in grounding map.' %
                              hgnc_sym)
         map_db_refs['HGNC'] = hgnc_id
         up_id = hgnc_client.get_uniprot_id(hgnc_id)
         if up_id:
             map_db_refs['UP'] = up_id
     # Assign the DB refs from the grounding map to the agent
     agent.db_refs = map_db_refs
     if not do_rename:
         return
     fplx_id = agent.db_refs.get('FPLX')
     if fplx_id:
         agent.name = fplx_id
     elif hgnc_sym is not None:
         agent.name = hgnc_sym
     elif gene_name is not None:
         agent.name = gene_name
예제 #54
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                db_refs['UP'] = db_id
                gene_name = uniprot_client.get_gene_name(db_id)
                if gene_name:
                    agent_name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert(agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent
예제 #55
0
def _fix_agent(agent):
    if agent is None:
        return
    # First we fix some name spaces
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        # Change FA name space
        if db_ns == 'FA':
            db_refs_tmp.pop('FA', None)
            db_refs_tmp['NXPFA'] = db_id
        # Change IPR name space
        elif db_ns == 'IPR':
            db_refs_tmp.pop('IPR', None)
            db_refs_tmp['IP'] = db_id
        # Change XFAM name space
        elif db_ns == 'XFAM':
            db_refs_tmp.pop('XFAM', None)
            db_refs_tmp['PF'] = db_id.split('.')[0]
        elif db_ns == 'GO':
            if db_id.startswith('GO:'):
                db_refs_tmp['GO'] = db_id
            else:
                db_refs_tmp['GO'] = 'GO:' + db_id
        # Change PCID name space
        elif db_ns == 'PCID':
            db_refs_tmp.pop('PCID', None)
            db_refs_tmp['PUBCHEM'] = db_id
    agent.db_refs = db_refs_tmp
    # Check if we have a FPLX entry and handle old BE mappings
    if 'BE' in agent.db_refs:
        agent.db_refs['FPLX'] = agent.db_refs.pop('BE')
    be_id = agent.db_refs.get('FPLX')
    # Try to map to FPLX from NXP, IPR, PF, NCIT
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                break
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            agent.db_refs[target[0]] = target[1]
    # Check what entries we have
    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')
    # FPLX takes precedence if we have it
    if be_id:
        agent.db_refs['FPLX'] = be_id
        agent.name = be_id
    elif hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
    elif up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
        # If it doesn't have a gene name, it's better to just
        # use the raw string name otherwise Sparser sets
        # has Uniprot IDs or mnemonics as the name
        else:
            name = agent.db_refs.get('TEXT', agent.name)
            agent.name = name
예제 #56
0
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent for a BEL graph node, or None if it is not handled.

    Parameters
    ----------
    node_data : dict
        Node data, read through the keys defined in ``pc`` (function,
        fusion, members, name, namespace, identifier).
        NOTE(review): ``pc`` is presumably ``pybel.constants`` -- confirm.
    node_modifier_data : object or None
        Optional modifier data for the node. It is only passed on to the
        activity, translocation and unhandled-modifier helpers.

    Returns
    -------
    Agent or None
        The constructed Agent. None is returned when the node type is not
        handled, the node is a fusion, a complex has no members or a member
        cannot be converted, no identifier information can be derived, or
        the node has unhandled modifiers.

    Raises
    ------
    ValueError
        If the node has an identifier in the MGI or RGD namespace.
    AssertionError
        If the node has no identifier and no name, or if a UP identifier
        has no gene name.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s", node_data)
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # A complex without members (missing key or empty list) is assumed
        # to be a named complex, which is not handled here
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        # Convert every remaining member before checking for failures
        member_agents = [get_agent(m) for m in members[1:]]
        if any(member is None for member in member_agents):
            return None
        main_agent.bound_conditions = [BoundCondition(member, True)
                                       for member in member_agents]
        # Get activity of main agent
        main_agent.activity = _get_activity_condition(node_modifier_data)
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # db_refs stays None when no identifier information could be derived
    db_refs = None
    # No ID present, get identifier using the name, namespace
    if not ident:
        # NOTE: kept as an assert so that callers see the same exception
        # type as before
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)", name, node_data)
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)', name, node_data)
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.', name)
                # Without an HGNC mapping, use the prefixed Entrez ID as name
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.', name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Logged rather than printed, like every other diagnostic here
            logger.info("Unhandled namespace: %s: %s (%s)",
                        ns, name, node_data)
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s', name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            logger.info("Unhandled namespace with identifier: %s: %s (%s)",
                        ns, name, node_data)
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    return Agent(name, db_refs=db_refs, mods=mods, mutations=muts,
                 activity=ac, location=to_loc)
예제 #57
0
 def map_agents(self, stmts, do_rename=True):
     """Return copies of the statements with agents re-grounded by the map.

     Each statement is deep-copied and every agent whose ``TEXT`` db_refs
     entry is a key of ``self.gm`` is processed. If the mapped value is
     None the whole statement is filtered out. Otherwise the agent's
     db_refs are replaced by a copy of the mapped entry, in which an HGNC
     symbol is replaced by the corresponding HGNC ID and a missing UP or
     HGNC entry is filled in from the other where possible.

     Parameters
     ----------
     stmts : iterable
         Statements to process; each must provide ``agent_list()``. The
         input statements are not modified.
     do_rename : bool
         If True, mapped agents are renamed to the ``BE`` entry if there is
         one, otherwise to the HGNC symbol from the map, otherwise to the
         gene name obtained from the Uniprot ID.

     Returns
     -------
     list
         The mapped copies of the statements that were not filtered out.

     Raises
     ------
     ValueError
         If an HGNC symbol in the map has no HGNC ID, if a Uniprot ID
         given together with an HGNC symbol has no gene name, or if that
         gene name does not match the symbol.
     """
     mapped_stmts = []
     num_skipped = 0
     for stmt in stmts:
         # Work on a copy so that the input statement is left untouched
         mapped_stmt = deepcopy(stmt)
         skip_stmt = False
         for agent in mapped_stmt.agent_list():
             if agent is None:
                 continue
             agent_text = agent.db_refs.get('TEXT')
             if agent_text is None:
                 continue
             # Look this string up in the grounding map
             # If not in the map, leave agent alone and continue
             try:
                 gm_entry = self.gm[agent_text]
             except KeyError:
                 continue
             # If it's in the map but it maps to None, then filter out
             # this statement by skipping it
             if gm_entry is None:
                 # Increase counter if this statement has not already
                 # been skipped via another agent
                 if not skip_stmt:
                     num_skipped += 1
                 logger.debug("Skipping %s", agent_text)
                 skip_stmt = True
                 continue
             # Copy the entry so the grounding map itself is never changed
             map_db_refs = deepcopy(gm_entry)
             gene_name = None
             up_id = map_db_refs.get('UP')
             hgnc_sym = map_db_refs.get('HGNC')
             if up_id and not hgnc_sym:
                 # Only a Uniprot ID: try to add the HGNC ID
                 gene_name = uniprot_client.get_gene_name(up_id, False)
                 if gene_name:
                     hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                     if hgnc_id:
                         map_db_refs['HGNC'] = hgnc_id
             elif hgnc_sym and not up_id:
                 # Override the HGNC symbol entry from the grounding
                 # map with an HGNC ID
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                 if not hgnc_id:
                     raise ValueError('No HGNC ID corresponding to gene '
                                      'symbol %s in grounding map.' %
                                      hgnc_sym)
                 map_db_refs['HGNC'] = hgnc_id
                 # Now get the Uniprot ID for the gene
                 up_id = hgnc_client.get_uniprot_id(hgnc_id)
                 if up_id:
                     map_db_refs['UP'] = up_id
             elif up_id and hgnc_sym:
                 # If we have both, check the gene symbol against the
                 # mapping from Uniprot
                 gene_name = uniprot_client.get_gene_name(up_id)
                 if not gene_name:
                     raise ValueError('No gene name found for Uniprot '
                                      'ID %s (expected %s)' %
                                      (up_id, hgnc_sym))
                 if gene_name != hgnc_sym:
                     raise ValueError('Gene name %s for Uniprot ID '
                                      '%s does not match HGNC '
                                      'symbol %s given in grounding '
                                      'map.' %
                                      (gene_name, up_id, hgnc_sym))
                 hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                 if not hgnc_id:
                     raise ValueError('No HGNC ID '
                                      'corresponding to gene '
                                      'symbol %s in grounding '
                                      'map.' % hgnc_sym)
                 # Store the ID here too, as the other branches do, so
                 # that db_refs never keeps the raw symbol under 'HGNC'
                 map_db_refs['HGNC'] = hgnc_id
             # Assign the DB refs from the grounding map to the agent
             agent.db_refs = map_db_refs
             # Are we renaming right now?
             if do_rename:
                 be_id = agent.db_refs.get('BE')
                 # If there's a Bioentities ID, prefer that for the name
                 if be_id:
                     agent.name = be_id
                 # Get the HGNC symbol or gene name (retrieved above)
                 elif hgnc_sym is not None:
                     agent.name = hgnc_sym
                 elif gene_name is not None:
                     agent.name = gene_name
         # Check if we should skip the statement
         if not skip_stmt:
             mapped_stmts.append(mapped_stmt)
     logger.info('%s statements filtered out', num_skipped)
     return mapped_stmts
예제 #58
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity description. The keys read are ``entityText``, ``entityId``
        (a list of dicts with ``source`` and ``idString``, plus
        ``entityType`` for mutation entries), ``charStart`` and
        ``charEnd``.

    Returns
    -------
    tuple
        ``(agent, (charStart, charEnd))``, or ``(None, None)`` if the
        entity has more than one Entrez or more than one UniProt entry.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    # Entities with ambiguous gene/protein grounding are skipped altogether
    ref_counts = Counter(entry['source'] for entry in entity_info['entityId'])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping',
                        raw_text, count, source)
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        source = id_dict['source']
        if source == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' not in refs:
                    refs['HGNC'] = hgnc_id
                elif refs['HGNC'] != hgnc_id:
                    logger.info('HGNC:%s previously set does not'
                                ' match HGNC:%s from EGID:%s',
                                refs['HGNC'], hgnc_id, refs['EGID'])
        elif source == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the one inferred from UniProt, while
                        # refs['HGNC'] is the one set earlier from Entrez
                        logger.info('Inferred HGNC:%s from UP:%s does not'
                                    ' match HGNC:%s from EGID:%s',
                                    hgnc_id, refs['UP'], refs['HGNC'],
                                    refs.get('EGID'))
                    refs['HGNC'] = hgnc_id
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_dict['idString']
        # Handle mutations; an entry without entityType falls through to
        # the unhandled-id warning below instead of raising KeyError
        elif source == 'Unk' and \
                id_dict.get('entityType') == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    muts.append(MutCondition(pos, residue_from, residue_to))
                # Deliberately broad: a malformed mutation string is
                # logged and skipped rather than failing the whole entity
                except Exception:
                    logger.info('Could not process mutation %s',
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s', id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords