Пример #1
0
    def _make_db_refs(self, entrez_id, text_id):
        """Build the grounding for a gene starting from its Entrez ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name, or None if not listed in the
            Biogrid data. The placeholder '-' is handled like None.

        Returns
        -------
        hgnc_name : str
            The result of the HGNC name lookup for the gene.
        db_refs : dict
            db_refs grounding dictionary, used when constructing the Agent
            object. Holds TEXT, HGNC and UP entries where available.
        """
        has_text = text_id is not None and text_id != '-'
        grounding = {'TEXT': text_id} if has_text else {}

        gene_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        # The name lookup is made even when no HGNC ID was found
        symbol = hgnc_client.get_hgnc_name(gene_id)
        if gene_id is None:
            return (symbol, grounding)

        grounding['HGNC'] = gene_id
        uniprot = hgnc_client.get_uniprot_id(gene_id)
        if uniprot is not None:
            grounding['UP'] = uniprot
        return (symbol, grounding)
Пример #2
0
    def _make_db_refs(self, entrez_id, text_id):
        """Collect the HGNC name and the grounding for an Entrez gene ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name, or None if not listed in the
            Biogrid data. A value of '-' is ignored as well.

        Returns
        -------
        hgnc_name : str
            Whatever the HGNC name lookup returns for the gene.
        db_refs : dict
            db_refs grounding dictionary, used when constructing the Agent
            object. May contain TEXT, HGNC and UP keys.
        """
        refs = {}
        if text_id is not None and text_id != '-':
            refs['TEXT'] = text_id

        hgnc = hgnc_client.get_hgnc_from_entrez(entrez_id)
        # Looked up unconditionally, i.e. also when hgnc is None
        name = hgnc_client.get_hgnc_name(hgnc)
        if hgnc is None:
            return (name, refs)

        refs['HGNC'] = hgnc
        up = hgnc_client.get_uniprot_id(hgnc)
        if up is not None:
            refs['UP'] = up
        return (name, refs)
Пример #3
0
 def read_variant_pairs(self):
     """Group the patient's protein changes by gene name.

     Returns a list of (gene name, list of protein changes) pairs built
     from self.patient.mutations.
     """
     changes_by_gene = defaultdict(list)
     for record in self.patient.mutations:
         entrez_id = str(record['entrezGeneId'])
         symbol = get_hgnc_name(get_hgnc_from_entrez(entrez_id))
         protein_change = record['proteinChange']
         changes_by_gene[symbol].append(protein_change)
     return list(changes_by_gene.items())
Пример #4
0
def get_agent(raw_name, entrez_id):
    """Return a standardized Agent for a raw name and an Entrez gene ID.

    The db_refs always carry the raw name as TEXT and the Entrez ID as
    EGID; an HGNC entry is added when the Entrez ID maps to an HGNC ID.
    The Agent is passed through standardize_agent_name before it is
    returned.
    """
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    grounding = {'TEXT': raw_name, 'EGID': entrez_id}
    mapped_hgnc = hgc.get_hgnc_from_entrez(entrez_id)
    if mapped_hgnc:
        grounding['HGNC'] = mapped_hgnc
    result = Agent(raw_name, db_refs=grounding)
    standardize_agent_name(result, standardize_refs=True)
    return result
Пример #5
0
 def _extract_protein(self, name, gene_id):
     """Return an Agent for a gene given its name and Entrez gene ID.

     The Entrez ID (and the HGNC ID, if one is found) is standardized with
     standardize_name_db_refs. The standard name replaces the given name
     whenever one is returned.
     """
     mapped_hgnc = hgnc_client.get_hgnc_from_entrez(gene_id)
     raw_refs = {'EGID': gene_id}
     if mapped_hgnc is not None:
         raw_refs['HGNC'] = mapped_hgnc
     standard_name, db_refs = standardize_name_db_refs(raw_refs)
     return Agent(standard_name or name, db_refs=db_refs)
Пример #6
0
def get_agent(raw_name, entrez_id):
    """Return an Agent for a gene given its raw name and Entrez gene ID.

    Parameters
    ----------
    raw_name : str
        Name of the gene as given by the source. Stored as the TEXT entry
        of db_refs and used as the Agent name when no HGNC symbol can be
        obtained.
    entrez_id : str
        Entrez gene ID used to look up the HGNC ID.

    Returns
    -------
    Agent
        Agent named by the HGNC symbol if available, otherwise by the raw
        name. Its db_refs contain TEXT and, if one is found, UP.
    """
    db_refs = {'TEXT': raw_name}
    logger.debug('Looking up grounding data for Entrez #%s' % entrez_id)
    name = raw_name
    hgnc_id = hgc.get_hgnc_from_entrez(entrez_id)
    if hgnc_id is not None:
        # Record a UniProt reference only if the lookup produced one, so
        # that db_refs never carries a None value
        up_id = hgc.get_uniprot_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        # Keep the raw name if no HGNC symbol is available
        name = hgc.get_hgnc_name(hgnc_id) or raw_name
    return Agent(name, db_refs=db_refs)
Пример #7
0
 def _extract_protein(self, name, gene_id):
     """Return an Agent for a gene given its name and Entrez gene ID.

     Without an HGNC ID the Agent keeps the given name and is grounded to
     the Entrez ID only. Otherwise HGNC (and UP, if found) entries are
     added and the HGNC name lookup result becomes the Agent name.
     """
     hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
     if hgnc_id is None:
         return Agent(name, db_refs={'EGID': gene_id})

     grounding = {'EGID': gene_id, 'HGNC': hgnc_id}
     uniprot = hgnc_client.get_uniprot_id(hgnc_id)
     if uniprot:
         grounding['UP'] = uniprot
     # With an HGNC ID at hand, the gene name is standardized
     standard_name = hgnc_client.get_hgnc_name(hgnc_id)
     return Agent(standard_name, db_refs=grounding)
Пример #8
0
def align_identifiers_urls(indra_groundings, dm_urls):
    """Match identifiers.org URLs against a collection of INDRA groundings.

    Parameters
    ----------
    indra_groundings : collection of tuple
        (namespace, ID) pairs; candidate pairs derived from each URL are
        tested for membership in this collection.
    dm_urls : iterable of str
        URLs expected to consist of the 'https://identifiers.org/' prefix
        followed by a 'namespace:id' entity.

    Returns
    -------
    list of tuple
        One (dm_url, matched_url) pair for each URL that is not a DOI or
        PubMed reference. matched_url is the result of get_identifiers_url
        for the matching grounding, or None if nothing matched.
    """
    prefix_len = len('https://identifiers.org/')
    aligned = []
    for dm_url in dm_urls:
        # The prefix is sliced off rather than split on slashes because
        # DOIs contain extra slashes
        db_ns, db_id = dm_url[prefix_len:].split(':', maxsplit=1)

        # Literature references are not entities, so they are skipped
        if db_ns in {'doi', 'pubmed'}:
            continue

        if db_ns == 'CHEBI':
            chebi_ref = {'CHEBI': '%s:%s' % (db_ns, db_id)}
            candidates = [standardize_db_refs(chebi_ref)]
        elif db_ns == 'hgnc':
            candidates = [standardize_db_refs({'HGNC': db_id})]
        elif db_ns == 'hgnc.symbol':
            current_id = hgnc_client.get_current_hgnc_id(db_id)
            candidates = [standardize_db_refs({'HGNC': current_id})]
        elif db_ns == 'pubchem.compound':
            candidates = [standardize_db_refs({'PUBCHEM': db_id})]
        elif db_ns == 'uniprot':
            candidates = [standardize_db_refs({'UP': db_id})]
        elif db_ns == 'bigg.metabolite':
            chebi_ids = bigg_to_chebi.get(db_id)
            if not chebi_ids:
                candidates = [{}]
            else:
                candidates = []
                for chebi_id in chebi_ids:
                    candidates.append(standardize_db_refs({'CHEBI': chebi_id}))
        elif db_ns == 'ncbigene':
            mapped_hgnc = hgnc_client.get_hgnc_from_entrez(db_id)
            if not mapped_hgnc:
                candidates = [{}]
            else:
                candidates = [standardize_db_refs({'HGNC': mapped_hgnc})]
        else:
            print('Unhandled namespace %s' % db_ns)
            candidates = {}

        hit = None
        for candidate in candidates:
            # The break only ends the scan of this candidate; candidates
            # that follow are still examined and may replace the hit
            for pair in candidate.items():
                if pair in indra_groundings:
                    hit = pair
                    break

        matched_url = get_identifiers_url(*hit) if hit else None
        aligned.append((dm_url, matched_url))
    return aligned
Пример #9
0
 def get_db_refs(egid):
     """Return the HGNC symbol and HGNC/UP grounding for an Entrez gene ID.

     Whenever the HGNC ID, the HGNC name or the Uniprot ID cannot be
     obtained, the reason is logged and (None, {}) is returned.
     """
     gene_id = hgnc_client.get_hgnc_from_entrez(egid)
     if not gene_id:
         logger.info("No HGNC ID for Entrez ID: %s" % egid)
         return (None, {})

     symbol = hgnc_client.get_hgnc_name(gene_id)
     if not symbol:
         logger.info("No HGNC name for HGNC ID: %s" % gene_id)
         return (None, {})

     uniprot = hgnc_client.get_uniprot_id(gene_id)
     if uniprot:
         return (symbol, {'HGNC': gene_id, 'UP': uniprot})

     logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                 "%s / %s / %s" % (egid, gene_id, symbol))
     return (None, {})
Пример #10
0
 def read_cna_pairs(self):
     """Return (gene name, alteration label) pairs for the patient's CNAs.

     The integer alteration of each entry in self.patient.cnas is turned
     into a label: -2 'DEL', -1 'del', 0 'neu', 1 'amp', 2 'AMP'. Any
     other alteration value raises a KeyError.
     """
     labels = {-2: 'DEL', -1: 'del', 0: 'neu', 1: 'amp', 2: 'AMP'}
     return [
         (get_hgnc_name(get_hgnc_from_entrez(str(entry['entrezGeneId']))),
          labels[entry['alteration']])
         for entry in self.patient.cnas
     ]
Пример #11
0
 def get_db_refs(egid):
     """Look up the HGNC symbol and HGNC/UP grounding for an Entrez gene ID.

     Returns (symbol, {'HGNC': ..., 'UP': ...}) on success. If the HGNC
     ID, the HGNC name or the Uniprot ID is missing, the reason is logged
     and (None, {}) is returned instead.
     """
     hgnc = hgnc_client.get_hgnc_from_entrez(egid)
     if not hgnc:
         logger.info("No HGNC ID for Entrez ID: %s" % egid)
         return (None, {})

     name = hgnc_client.get_hgnc_name(hgnc)
     if not name:
         logger.info("No HGNC name for HGNC ID: %s" % hgnc)
         return (None, {})

     up = hgnc_client.get_uniprot_id(hgnc)
     if up:
         return (name, {'HGNC': hgnc, 'UP': up})

     logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                 "%s / %s / %s" % (egid, hgnc, name))
     return (None, {})
Пример #12
0
def map_entrez_human(entrez_ids):
    """Return references based on human Entrez gene IDs.

    Each returned dict holds the Entrez ID under EGID plus the entries
    provided by _refs_from_hgnc_id. Entrez IDs without an HGNC ID (logged
    as a warning) or for which _refs_from_hgnc_id returns None are left
    out.
    """
    mapped = []
    for egid in entrez_ids:
        hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
        if hgnc_id is None:
            logger.warning("Could not find HGNC ID for Entrez ID %s" %
                           egid)
            continue
        extra_refs = _refs_from_hgnc_id(hgnc_id)
        if extra_refs is not None:
            entry = {'EGID': egid}
            entry.update(extra_refs)
            mapped.append(entry)
    return mapped
Пример #13
0
def test_entrez_hgnc_none():
    """An Entrez ID that does not exist must not map to an HGNC ID."""
    assert hgnc_client.get_hgnc_from_entrez('xxx') is None
Пример #14
0
def test_entrez_hgnc():
    """Entrez ID 653509 must map to HGNC ID 10798."""
    expected_hgnc_id = '10798'
    assert hgnc_client.get_hgnc_from_entrez('653509') == expected_hgnc_id
Пример #15
0
 def _extract_protein(self, name, gene_id):
     """Return a standard Agent for a gene name and its Entrez gene ID.

     The Agent is created by get_standard_agent from the Entrez ID and,
     if the Entrez ID maps to one, the HGNC ID.
     """
     mapped_hgnc = hgnc_client.get_hgnc_from_entrez(gene_id)
     if mapped_hgnc is None:
         grounding = {'EGID': gene_id}
     else:
         grounding = {'EGID': gene_id, 'HGNC': mapped_hgnc}
     return get_standard_agent(name, db_refs=grounding)
Пример #16
0
 def _make_agent(self, hprd_id, refseq_id=None):
     """Return an Agent for an HPRD ID, or None if it cannot be grounded.

     Parameters
     ----------
     hprd_id : str or None
         HPRD ID used to index self.id_df (presumably a string; confirm
         against the caller). None and NaN produce None.
     refseq_id : str or None
         RefSeq protein ID used to look up an isoform-specific Uniprot ID.
         If None, the RefSeq ID is read from self.id_df and the Uniprot ID
         obtained via HGNC is used.

     Returns
     -------
     Agent or None
         Agent named by the HGNC symbol, with HGNC, UP, EGID and
         REFSEQ_PROT db_refs. None if the HPRD ID is missing or not in the
         mappings table, or if no Entrez ID, HGNC ID or Uniprot ID can be
         found. Failed lookups are recorded in self.no_hgnc_for_egid,
         self.no_up_for_hgnc, self.no_up_for_refseq and
         self.many_ups_for_refseq.
     """
     # NaN is the only value that differs from itself. Unlike an identity
     # check against one particular nan object, this recognizes every
     # float NaN, whichever object it happens to be.
     if hprd_id is None or hprd_id != hprd_id:
         return None
     # Get the basic info (HGNC name/symbol, Entrez ID) from the
     # ID mappings dataframe
     try:
         egid = self.id_df.loc[hprd_id].EGID
     except KeyError:
         logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
         return None
     if not egid:
         logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
         return None
     # Get the HGNC ID
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     # If we couldn't get an HGNC ID for the Entrez ID, this means that
     # the Entrez ID has been discontinued or replaced.
     if not hgnc_id:
         self.no_hgnc_for_egid.append(egid)
         return None
     # Get the (possibly updated) HGNC Symbol
     hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
     assert hgnc_name is not None
     # See if we can get a Uniprot ID from the HGNC symbol--if there is
     # a RefSeq ID we will also try to use it to get an isoform specific
     # UP ID, but we will have this one to fall back on. But if we can't
     # get one here, then we skip the Statement
     up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
     if not up_id_from_hgnc:
         self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
         return None
     # If we have provided the RefSeq ID, it's because we need to make
     # sure that we are getting the right isoform-specific ID (for sequence
     # positions of PTMs). Here we try to get the Uniprot ID from the
     # Refseq->UP mappings in the protmapper.uniprot_client.
     if refseq_id is not None:
         # Get the Uniprot IDs from the uniprot client
         up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                     reviewed_only=True)
         # Nothing for this RefSeq ID (quite likely because the RefSeq ID
         # is obsolete); take the UP ID from HGNC
         if len(up_ids) == 0:
             self.no_up_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # More than one reviewed entry--no thanks, we'll take the one from
         # HGNC instead
         elif len(up_ids) > 1:
             self.many_ups_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # We got a unique, reviewed UP entry for the RefSeq ID
         else:
             up_id = up_ids[0]
             # If it's the canonical isoform, strip off the '-1'
             if up_id.endswith('-1'):
                 up_id = up_id.split('-')[0]
     # For completeness, get the Refseq ID from the HPRD ID table
     else:
         refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
         up_id = up_id_from_hgnc
     # Make db_refs, return Agent
     db_refs = {
         'HGNC': hgnc_id,
         'UP': up_id,
         'EGID': egid,
         'REFSEQ_PROT': refseq_id
     }
     return Agent(hgnc_name, db_refs=db_refs)
Пример #17
0
 def get_agent(concept, entity):
     """Return an Agent for a concept URI, grounded by its entity namespace.

     The name is taken from the concept URI and the namespace from the
     entity URI. The HGNC, MGI, RGD, PFH, SFAM, NCH, SCOMP, CHEBI and EGID
     namespaces are handled; any other namespace is logged and printed,
     and the Agent is returned without grounding.
     """
     name = term_from_uri(concept)
     namespace = namespace_from_uri(entity)
     db_refs = {}
     # The term itself is the Agent name unless a branch replaces it
     agent_name = name

     if namespace == 'HGNC':
         hgnc_id = hgnc_client.get_hgnc_id(name)
         if hgnc_id is None:
             logger.warning("Couldn't get HGNC ID for HGNC symbol %s" %
                            name)
         else:
             db_refs['HGNC'] = str(hgnc_id)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 db_refs['UP'] = up_id
             else:
                 logger.warning('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
     elif namespace in ('MGI', 'RGD'):
         db_refs[namespace] = name
     elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
         # Families and complexes are mapped the same way and differ only
         # in the warning issued when no mapping exists
         if namespace in ('PFH', 'SFAM'):
             missing_msg = 'Could not find mapping for BEL family: %s'
         else:
             missing_msg = 'Could not find mapping for BEL complex: %s'
         indra_name = bel_to_indra.get(name)
         db_refs[namespace] = name
         if indra_name is None:
             logger.warning(missing_msg % name)
         else:
             db_refs['BE'] = indra_name
             db_refs['TEXT'] = name
             agent_name = indra_name
     elif namespace == 'CHEBI':
         chebi_id = chebi_name_id.get(name)
         if chebi_id:
             db_refs['CHEBI'] = chebi_id
         else:
             logger.warning('CHEBI name %s not found in map.' % name)
     elif namespace == 'EGID':
         hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
         db_refs['EGID'] = name
         if hgnc_id is None:
             logger.warning('Could not map EGID%s to HGNC.' % name)
             agent_name = 'E%s' % name
         else:
             db_refs['HGNC'] = str(hgnc_id)
             agent_name = hgnc_client.get_hgnc_name(hgnc_id)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 db_refs['UP'] = up_id
             else:
                 logger.warning('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
     else:
         logger.warning('Unhandled entity namespace: %s' % namespace)
         print('%s, %s' % (concept, entity))

     return Agent(agent_name, db_refs=db_refs)
Пример #18
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity annotation with the keys 'entityText', 'entityId' (a list
        of dicts that each have 'source' and 'idString', and 'entityType'
        for entries whose source is 'Unk'), 'charStart' and 'charEnd'.

    Returns
    -------
    agent : Agent or None
        Agent named by the UniProt gene name if one is found, otherwise
        by the entity text, with the collected db_refs and mutations.
        None if the entity has more than one Entrez or UniProt entry.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entity, or None if the
        entity was skipped.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter([entry['source'] for entry in
                          entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the ID inferred from UniProt, while
                        # refs['HGNC'] still holds the one from Entrez
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Best effort: a mutation that cannot be processed is
                # logged and left out rather than failing the whole entity
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
Пример #19
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict or None
        The grounding for the given entity, or None if the namespace is
        not handled or the name could not be grounded.
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            # The name may be a mnemonic rather than a Uniprot ID
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        # The lookup returns an (ID, name) pair; only the ID is the
        # grounding, and a missing ID signals that nothing was found
        mesh_id, _ = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID fro %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            # Return the same (name, db_refs) pair as every other failed
            # lookup so that callers can always unpack the result
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
Пример #20
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        Entity annotation with the keys 'entityText', 'entityId' (a list
        of dicts that each have 'source' and 'idString', and 'entityType'
        for entries whose source is 'Unk'), 'charStart' and 'charEnd'.

    Returns
    -------
    agent : Agent or None
        Agent named by the UniProt gene name if one is found, otherwise
        by the entity text, with the collected db_refs and mutations.
        None if the entity has more than one Entrez or UniProt entry.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entity, or None if the
        entity was skipped.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter(
        [entry['source'] for entry in entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping' %
                        (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # hgnc_id is the ID inferred from UniProt, while
                        # refs['HGNC'] still holds the one from Entrez
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Best effort: a mutation that cannot be processed is
                # logged and left out rather than failing the whole entity
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning(
                "Unhandled id type: {source}={idString}".format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
Пример #21
0
def _urn_to_db_refs(urn):
    """Map a Medscan URN to INDRA grounding and a standard name.

    Parameters
    ----------
    urn : str or None
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        Grounding that maps database names to identifiers. An empty
        dictionary if urn is None or nothing could be grounded; None if
        the URN does not match URN_PATT.
    db_name : str or None
        The Famplex name if a Famplex grounding was found; otherwise the
        GO label if there is a GO grounding; otherwise the name obtained
        while grounding (ChEBI, HGNC or MESH), or None.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    grounding = {}
    label = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # A CAS identifier, which is converted to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            grounding['CHEBI'] = 'CHEBI:%s' % chebi_id
            label = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # An Entrez ID, which is converted to HGNC and from there to
        # Uniprot; the HGNC name lookup result becomes the name
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            grounding['HGNC'] = hgnc_id
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                grounding['UP'] = uniprot_id
            label = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        looks_like_umls = urn_id.startswith('C') and urn_id[1:].isdigit()
        if looks_like_umls:
            grounding['UMLS'] = urn_id
        else:
            # Otherwise the identifier is taken to be a quoted MESH name
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if not mesh_id:
                label = urn_mesh_name
            else:
                grounding['MESH'] = mesh_id
                label = mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Both URN types carry a GO identifier
        grounding['GO'] = 'GO:%s' % urn_id

    # A GO or MESH grounding may have a corresponding Famplex grounding
    for db in ('GO', 'MESH'):
        if db not in grounding:
            continue
        famplex_key = (db, grounding[db])
        if famplex_key in famplex_map:
            grounding['FPLX'] = famplex_map[famplex_key]

    # An eccode URN is grounded to Famplex if the eccode is in the
    # Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        famplex_key = ('ECCODE', urn.split(':')[2])
        if famplex_key in famplex_map:
            grounding['FPLX'] = famplex_map[famplex_key]

    # The Medscan URN itself may map to a Famplex ID
    famplex_key = ('MEDSCAN', urn)
    if famplex_key in famplex_map:
        grounding['FPLX'] = famplex_map[famplex_key]

    # A Famplex grounding takes precedence for the entity name
    if 'FPLX' in grounding:
        label = grounding['FPLX']
    elif 'GO' in grounding:
        label = go_client.get_go_label(grounding['GO'])

    return grounding, label
Пример #22
0
 def _make_agent(self, hprd_id, refseq_id=None):
     """Return an Agent for an HPRD ID, or None if it cannot be grounded.

     Parameters
     ----------
     hprd_id : str or None
         HPRD ID used to index self.id_df (presumably a string; confirm
         against the caller). None and NaN produce None.
     refseq_id : str or None
         RefSeq protein ID used to look up an isoform-specific Uniprot ID.
         If None, the RefSeq ID is read from self.id_df and the Uniprot ID
         obtained via HGNC is used.

     Returns
     -------
     Agent or None
         Agent named by the HGNC symbol, with HGNC, UP, EGID and
         REFSEQ_PROT db_refs. None if the HPRD ID is missing or not in the
         mappings table, or if no Entrez ID, HGNC ID or Uniprot ID can be
         found. Failed lookups are recorded in self.no_hgnc_for_egid,
         self.no_up_for_hgnc, self.no_up_for_refseq and
         self.many_ups_for_refseq.
     """
     # NaN is the only value that differs from itself. Unlike an identity
     # check against one particular nan object, this recognizes every
     # float NaN, whichever object it happens to be.
     if hprd_id is None or hprd_id != hprd_id:
         return None
     # Get the basic info (HGNC name/symbol, Entrez ID) from the
     # ID mappings dataframe
     try:
         egid = self.id_df.loc[hprd_id].EGID
     except KeyError:
         logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
         return None
     if not egid:
         logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
         return None
     # Get the HGNC ID
     hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
     # If we couldn't get an HGNC ID for the Entrez ID, this means that
     # the Entrez ID has been discontinued or replaced.
     if not hgnc_id:
         self.no_hgnc_for_egid.append(egid)
         return None
     # Get the (possibly updated) HGNC Symbol
     hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
     assert hgnc_name is not None
     # See if we can get a Uniprot ID from the HGNC symbol--if there is
     # a RefSeq ID we will also try to use it to get an isoform specific
     # UP ID, but we will have this one to fall back on. But if we can't
     # get one here, then we skip the Statement
     up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
     if not up_id_from_hgnc:
         self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
         return None
     # If we have provided the RefSeq ID, it's because we need to make
     # sure that we are getting the right isoform-specific ID (for sequence
     # positions of PTMs). Here we try to get the Uniprot ID from the
     # Refseq->UP mappings in the protmapper.uniprot_client.
     if refseq_id is not None:
         # Get the Uniprot IDs from the uniprot client
         up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                     reviewed_only=True)
         # Nothing for this RefSeq ID (quite likely because the RefSeq ID
         # is obsolete); take the UP ID from HGNC
         if len(up_ids) == 0:
             self.no_up_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # More than one reviewed entry--no thanks, we'll take the one from
         # HGNC instead
         elif len(up_ids) > 1:
             self.many_ups_for_refseq.append(refseq_id)
             up_id = up_id_from_hgnc
         # We got a unique, reviewed UP entry for the RefSeq ID
         else:
             up_id = up_ids[0]
             # If it's the canonical isoform, strip off the '-1'
             if up_id.endswith('-1'):
                 up_id = up_id.split('-')[0]
     # For completeness, get the Refseq ID from the HPRD ID table
     else:
         refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
         up_id = up_id_from_hgnc
     # Make db_refs, return Agent
     db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid,
                'REFSEQ_PROT': refseq_id}
     return Agent(hgnc_name, db_refs=db_refs)
Пример #23
0
def _urn_to_db_refs(urn):
    """Convert a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str or None
        A Medscan URN, expected to have the form
        ``urn:<type>:<identifier>``.

    Returns
    -------
    db_refs : dict or None
        A dictionary with grounding information, mapping databases to database
        identifiers. An empty dictionary is returned if `urn` is None, or if
        the URN type is not handled and no Famplex mapping applies. None is
        returned if `urn` does not match the ``urn:<type>:<identifier>``
        pattern.
    db_name : str or None
        The Famplex name, if available; otherwise the HGNC name if available;
        otherwise None.
    """
    if urn is None:
        return {}, None

    # Split the URN into its type and identifier parts; a string that does
    # not follow the pattern is signalled to the caller with (None, None)
    m = re.match(r'urn:([^:]+):([^:]+)', urn)
    if m is None:
        return None, None
    urn_type, urn_id = m.groups()

    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id

            # Convert the HGNC ID to a Uniprot ID; only record it when the
            # lookup succeeds so that db_refs never carries a None grounding
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id:
                db_refs['UP'] = uniprot_id

            # Try to look up the HGNC name; if it's available, it becomes
            # the agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ('agi-ncimorgan', 'agi-ncimcelltype', 'agi-ncimtissue'):
        # Identifier is MESH
        db_refs['MESH'] = urn_id
    elif urn_type == 'agi-meshdis':
        # Identifier is MESH, stored under its own MESHDIS key
        db_refs['MESHDIS'] = urn_id
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # If we have a GO or MESH grounding, see if there is a corresponding
    # Famplex grounding
    for db in ('GO', 'MESH'):
        if db in db_refs:
            key = (db, db_refs[db])
            if key in famplex_map:
                db_refs['FPLX'] = famplex_map[key]

    # If the urn corresponds to an eccode, ground to Famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        eccode = urn.split(':')[2]
        key = ('ECCODE', eccode)
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    key = ('MEDSCAN', urn)
    if key in famplex_map:
        db_refs['FPLX'] = famplex_map[key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']

    return db_refs, db_name
Пример #24
0
def get_agent(node_data, node_modifier_data=None):
    """Build an Agent from BEL node data.

    Parameters
    ----------
    node_data : dict
        Node data, read through the ``pc.FUNCTION``, ``pc.NAMESPACE``,
        ``pc.NAME``, ``pc.IDENTIFIER``, ``pc.MEMBERS`` and ``pc.FUSION``
        keys.
    node_modifier_data : object or None
        Optional modifier data for the node; it is only passed on to the
        activity, translocation and unhandled-modifier helpers.
        NOTE(review): presumably a dict -- confirm against callers.

    Returns
    -------
    Agent or None
        The Agent for the node. None is returned if the node function is
        not handled, the node is a fusion, a complex has no members or a
        member that cannot be converted, an HGNC name is invalid, no
        grounding could be obtained for the namespace, or the node has
        unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if no gene
        name is found for a UP identifier.
    ValueError
        If the node has an identifier in the MGI or RGD namespace.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way,
        # since there is no main agent to build from it.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        main_agent.activity = _get_activity_condition(node_modifier_data)
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # db_refs stays None when no grounding can be obtained for the namespace
    db_refs = None
    # No ID present, get identifier using the name, namespace
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD by putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace: %s: %s (%s)" %
                        (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace with identifier: %s: %s (%s)" %
                        (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    return Agent(name, db_refs=db_refs, mods=mods, mutations=muts,
                 activity=ac, location=to_loc)
Пример #25
0
def _get_agent(node_data, node_modifier_data=None):
    """Build an Agent from BEL node data.

    Parameters
    ----------
    node_data : dict
        Node data, read through the ``pc.FUNCTION``, ``pc.NAMESPACE``,
        ``pc.NAME``, ``pc.IDENTIFIER``, ``pc.MEMBERS`` and ``pc.FUSION``
        keys.
    node_modifier_data : dict or None
        Optional modifier data for the node. Its ``pc.CNAME`` entry is used
        when logging an unhandled node type; otherwise it is passed on to
        the activity, translocation and unhandled-modifier helpers.

    Returns
    -------
    Agent or None
        The Agent for the node. None is returned if the node function is
        not handled, the node is a fusion, a complex has no members or a
        member that cannot be converted, an HGNC name is invalid, no
        grounding could be obtained for the namespace, or the node has
        unhandled modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if no gene
        name is found for a UP identifier.
    ValueError
        If the node has an identifier in the MGI or RGD namespace.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = ('No node data' if not node_modifier_data else
                    node_modifier_data.get(pc.CNAME))
        logger.info("Nodes of type %s not handled: %s" % (node_func, mod_data))
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" %
                    str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way,
        # since there is no main agent to build from it.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = _get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [
            BoundCondition(_get_agent(m), True) for m in members[1:]
        ]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        main_agent.activity = _get_activity_condition(node_modifier_data)
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # db_refs stays None when no grounding can be obtained for the namespace
    db_refs = None
    # No ID present, get identifier using the name, namespace
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD by putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' % (name, hgnc_id))
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace: %s: %s (%s)" %
                        (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            # Reported through the logger like every other skipped node
            logger.info("Unhandled namespace with identifier: %s: %s (%s)" %
                        (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s' %
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    return Agent(name,
                 db_refs=db_refs,
                 mods=mods,
                 mutations=muts,
                 activity=ac,
                 location=to_loc)
Пример #26
0
def get_db_refs_by_name(ns, name, node_data):
    """Look up the standard name and grounding for a name in a namespace.

    Parameters
    ----------
    ns : str
        The namespace in which `name` is to be interpreted.
    name : str
        The name (for some namespaces, an identifier) to be grounded.
    node_data : dict
        Node data, used only in log messages.

    Returns
    -------
    name : str
        The standardized name for the entity; the given name is passed
        through when no standard name is looked up.
    db_refs : dict or None
        The grounding for the entity, or None if none could be obtained.

    """
    if ns == 'HGNC':
        # The name is assumed to be an HGNC symbol
        current_id = hgnc_client.get_current_hgnc_id(name)
        if not current_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(current_id, list):
            # Several current IDs were returned: take the first one
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, current_id[0]))
            current_id = current_id[0]
        name = hgnc_client.get_hgnc_name(current_id)
        grounding = {'HGNC': current_id}
        uniprot_id = _get_up_id(current_id)
        if uniprot_id:
            grounding['UP'] = uniprot_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(current_id)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns in ('UNIPROT', 'UP'):
        # If a mnemonic can be found for the name, the name is taken to be
        # a UniProt ID; otherwise the name is tried as a mnemonic itself
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            uniprot_id = name
        else:
            uniprot_id = uniprot_client.get_id_from_mnemonic(name)
        if not uniprot_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        grounding = {'UP': uniprot_id}
        hgnc_id = uniprot_client.get_hgnc_id(uniprot_id)
        if hgnc_id:
            grounding['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(uniprot_id)
        return name, grounding

    if ns == 'FPLX':
        return name, {'FPLX': name}

    if ns in ('GO', 'GOBP', 'GOCC'):
        # This label is replaced before the lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        return go_client.get_go_label(go_id), {'GO': go_id}

    if ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        return mesh_name, {'MESH': mesh_id}

    # MGI/RGD names are mapped to UniProt IDs through lookup tables; the
    # name itself is returned unchanged
    if ns == 'MGI':
        uniprot_id = mouse_lookup.get(name)
        return name, ({'UP': uniprot_id} if uniprot_id else None)

    if ns == 'RGD':
        uniprot_id = rat_lookup.get(name)
        return name, ({'UP': uniprot_id} if uniprot_id else None)

    # Selventa families and complexes are mapped to FamPlex
    if ns == 'SFAM':
        grounding = {'SFAM': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
            return name, grounding
        grounding['FPLX'] = fplx_name
        return fplx_name, grounding

    if ns == 'SCOMP':
        grounding = {'SCOMP': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
            return name, grounding
        grounding['FPLX'] = fplx_name
        return fplx_name, grounding

    # Entrez genes are mapped to HGNC/UP
    if ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        grounding = {'EGID': name}
        if hgnc_id is None:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            return 'E%s' % name, grounding
        grounding['HGNC'] = hgnc_id
        name = hgnc_client.get_hgnc_name(hgnc_id)
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id:
            grounding['UP'] = uniprot_id
        else:
            logger.info(
                'HGNC entity %s with HGNC ID %s has no '
                'corresponding Uniprot ID.', name, hgnc_id)
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            grounding['MIRBASE'] = mirbase_id
        return name, grounding

    if ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        grounding = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            grounding['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        return name, grounding

    if ns == 'CHEBI':
        # BEL's own ChEBI name-to-ID map is consulted first, then INDRA's
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if not chebi_id:
            logger.info('CHEBI name %s not found in map.' % name)
            return name, None
        return name, {'CHEBI': chebi_id}

    # In this namespace the name slot actually holds an ID
    if ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        return (chebi_client.get_chebi_name_from_id(chebi_id),
                {'CHEBI': chebi_id})

    # For these namespaces the name doubles as the ID
    if ns in ('SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    if ns == 'TAX':
        taxonomy_id = taxonomy_client.get_taxonomy_id(name)
        if not taxonomy_id:
            logger.info('Could not get taxonomy ID for %s' % name)
            return name, None
        return name, {'TAXONOMY': taxonomy_id}

    logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, None
Пример #27
0
"""Reads a list of kinases annotated with which ones are considered "dark"
kinases, and retrieves a list of statements associated with them from the INDRA
database."""

import pickle
import pandas as pd
from indra.db import client as dbc
from indra.databases import hgnc_client

dk_col = 'in_IDG_darkkinases'
kinase_file = 'Table_001_all_kinases.csv'

if __name__ == '__main__':
    # Load the kinase table and keep only the rows flagged in the dk_col
    # column
    kinases = pd.read_csv(kinase_file, delimiter=',', header=0)
    dark_kinases = kinases[kinases[dk_col]]

    # Maps HGNC gene symbol -> statements retrieved for that gene
    results = {}
    for egid in dark_kinases.gene_id:
        # The Entrez ID is passed to the HGNC client as a string
        hgnc_id = hgnc_client.get_hgnc_from_entrez(str(egid))
        if hgnc_id is None:
            print("No HGNC id for Entrez Gene id %s" % egid)
            continue
        gene_sym = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_sym is None:
            print("No symbol for gene id %s" % hgnc_id)
            # Skip the gene: without a symbol the statements would be
            # stored under the key None, and each further symbol-less gene
            # would overwrite the previous one
            continue
        stmts = dbc.get_statements_by_gene_role_type(agent_id=hgnc_id,
                                                     agent_ns='HGNC')
        results[gene_sym] = stmts
    # Persist the collected statements for later use
    with open('dark_kinase_stmts.pkl', 'wb') as f:
        pickle.dump(results, f)