예제 #1
0
def generate_uniprot_terms(download=False):
    """Return grounding Terms for the synonyms of human UniProt entries.

    The synonym table is read from ``up_synonyms.tsv`` in the resource
    directory. The table is downloaded from UniProt first if the file does
    not exist yet or if a download is explicitly requested.

    Parameters
    ----------
    download : bool
        If True, the resource file is downloaded again even if a local copy
        already exists. Default: False

    Returns
    -------
    list
        One Term per protein synonym. Terms are grounded to HGNC when
        the UniProt entry maps to an HGNC ID and to UniProt otherwise.

    Raises
    ------
    requests.HTTPError
        If the resource file has to be downloaded and the server responds
        with an error status.
    """
    path = os.path.join(resource_dir, 'up_synonyms.tsv')
    if not os.path.exists(path) or download:
        url = ('https://www.uniprot.org/uniprot/?format=tab&columns=id,'
               'genes(PREFERRED),protein%20names&sort=score&'
               'fil=organism:"Homo%20sapiens%20(Human)%20[9606]"'
               '%20AND%20reviewed:yes')
        logger.info('Downloading UniProt resource file')
        res = requests.get(url)
        # Fail loudly instead of caching an HTTP error page as the resource
        res.raise_for_status()
        with open(path, 'w') as fh:
            fh.write(res.text)
    terms = []
    for row in read_csv(path, delimiter='\t', header=True):
        up_id = row['Entry']
        standard_name = row['Gene names  (primary )']
        # We skip a small number of not critical entries that don't have
        # standard names
        if not standard_name:
            continue
        names = parse_uniprot_synonyms(row['Protein names'])
        # Ground to UniProt by default and switch to HGNC if a mapping exists
        ns = 'UP'
        db_id = up_id
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            ns = 'HGNC'
            db_id = hgnc_id
            standard_name = hgnc_client.get_hgnc_name(hgnc_id)
        for name in names:
            # Skip names that are EC codes
            if name.startswith('EC '):
                continue
            terms.append(Term(normalize(name), name, ns, db_id,
                              standard_name, 'synonym', 'uniprot'))
    return terms
예제 #2
0
파일: processor.py 프로젝트: maparent/indra
    def _extract_protein(self, line):
        # Extract key information from the lines.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Get available db-refs.
        db_refs = {}
        if prot_id:
            db_refs.update(self._lc.get_protein_refs(prot_id))
            # Since the resource only gives us an UP ID (not HGNC), we
            # try to get that and standardize the name to the gene name
            up_id = db_refs.get('UP')
            if up_id:
                hgnc_id = uniprot_client.get_hgnc_id(up_id)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
                    prot_name = hgnc_client.get_hgnc_name(hgnc_id)
                else:
                    gene_name = uniprot_client.get_gene_name(up_id)
                    if gene_name:
                        prot_name = gene_name
        # In some cases lines are missing protein information in which
        # case we return None
        else:
            return None

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)
예제 #3
0
파일: processor.py 프로젝트: maparent/indra
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return an Agent for a SIGNOR entity.

     Parameters
     ----------
     ent_name : str
         The name of the entity as given by SIGNOR.
     ent_type : str
         The SIGNOR entity type, used together with the database to look
         up the grounding name space.
     id : str
         The identifier of the entity in the given database.
     database : str
         The database that the identifier belongs to.

     Returns
     -------
     Agent
         The Agent for the entity. For a SIGNOR complex this is the first
         constituent with the remaining constituents attached as
         BoundConditions.

     Raises
     ------
     ValueError
         If the entity has a grounding type but comes from an unexpected
         database.
     """
     if database == 'SIGNOR' and id in self.complex_map:
         agents = self._get_complex_agents(id)
         # Return the first agent with the remaining agents as a bound
         # condition
         agent = agents[0]
         agent.bound_conditions = \
                 [BoundCondition(a, True) for a in agents[1:]]
         return agent

     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         up_id = id
         db_refs = {'UP': up_id}
         hgnc_id = uniprot_client.get_hgnc_id(up_id)
         if hgnc_id:
             db_refs['HGNC'] = hgnc_id
             name = hgnc_client.get_hgnc_name(hgnc_id)
         else:
             name = uniprot_client.get_gene_name(up_id)
         # If no standard gene name is available, use the SIGNOR name
         # rather than creating an Agent without a name
         if not name:
             name = ent_name
     # Map SIGNOR protein families to FamPlex families
     elif ent_type == 'proteinfamily':
         # Keep the SIGNOR family ID in db_refs
         db_refs = {database: id}
         key = (database, id)
         # Use SIGNOR name unless we have a mapping in FamPlex
         name = ent_name
         famplex_id = famplex_map.get(key)
         if famplex_id is None:
             logger.info('Could not find %s in FamPlex map' % str(key))
         else:
             db_refs['FPLX'] = famplex_id
             name = famplex_id
     # Other possible groundings are PUBCHEM, SIGNOR, etc.
     elif gnd_type is not None:
         if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                             'DRUGBANK'):
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # We take off the CID: prefix plus fix an issue with
             # SIGNOR's format in which it leaves extra spaces around
             # the ID, as in 'CID: 923'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
         name = ent_name
     # If no grounding, include as an untyped/ungrounded node
     else:
         name = ent_name
         db_refs = {}
     return Agent(name, db_refs=db_refs)
예제 #4
0
def generate_famplex_terms(ignore_mappings=False):
    """Return grounding Terms derived from the FamPlex grounding map.

    Each row of ``grounding_map.csv`` starts with an entity text followed
    by alternating name space / identifier columns. Exactly one Term is
    generated per row from the first name space found in the order FPLX,
    HGNC, UP, CHEBI, GO, MESH. Rows without any of these are skipped.

    Parameters
    ----------
    ignore_mappings : bool
        If True, MESH groundings are kept in the MESH name space even if
        an entry for them exists in ``mesh_mappings``. Default: False

    Returns
    -------
    list
        The list of generated Terms.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        txt = row[0]
        norm_txt = normalize(txt)
        # Pair up the name space and identifier columns, dropping empty ones
        groundings = {}
        for ref_ns, ref_id in zip(row[1::2], row[2::2]):
            if ref_ns and ref_id:
                groundings[ref_ns] = ref_id

        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            terms.append(Term(norm_txt, txt, 'FPLX', fplx_id, fplx_id,
                              'assertion', 'famplex'))
            continue

        if 'HGNC' in groundings:
            # The grounding map lists the gene symbol here, so the actual
            # HGNC ID is looked up from it
            symbol = groundings['HGNC']
            terms.append(Term(norm_txt, txt, 'HGNC',
                              hgnc_client.get_hgnc_id(symbol), symbol,
                              'assertion', 'famplex', '9606'))
            continue

        if 'UP' in groundings:
            db_ns = 'UP'
            db_id = groundings['UP']
            std_name = db_id
            organism = None
            if uniprot_client.is_human(db_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(db_id)
                if hgnc_id:
                    # Human proteins with an HGNC mapping are grounded
                    # to HGNC instead of UniProt
                    std_name = hgnc_client.get_hgnc_name(hgnc_id)
                    db_ns, db_id = 'HGNC', hgnc_id
                else:
                    logger.warning('No gene name for %s' % db_id)
            terms.append(Term(norm_txt, txt, db_ns, db_id, std_name,
                              'assertion', 'famplex', organism))
            continue

        if 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The name lookup is done with the first 6 characters of the
            # ID (presumably the 'CHEBI:' prefix) cut off
            chebi_name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            terms.append(Term(norm_txt, txt, 'CHEBI', chebi_id, chebi_name,
                              'assertion', 'famplex'))
            continue

        if 'GO' in groundings:
            go_id = groundings['GO']
            terms.append(Term(norm_txt, txt, 'GO', go_id,
                              go_client.get_go_label(go_id),
                              'assertion', 'famplex'))
            continue

        if 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mapped = mesh_mappings.get(mesh_id)
            if mapped and not ignore_mappings:
                db_ns, db_id, std_name = mapped
            else:
                db_ns = 'MESH'
                db_id = mesh_id
                std_name = mesh_client.get_mesh_name(mesh_id)
            terms.append(Term(norm_txt, txt, db_ns, db_id, std_name,
                              'assertion', 'famplex'))
            continue

        # TODO: handle HMDB, PUBCHEM, CHEMBL
    return terms
예제 #5
0
파일: processor.py 프로젝트: maparent/indra
def _agent_from_id(db_id):
    """Return an Agent for an Ensembl protein ID or a UniProt ID.

    Parameters
    ----------
    db_id : str
        An identifier; anything not starting with 'ENSP' is treated as a
        UniProt ID.

    Returns
    -------
    Agent or None
        The Agent for the identifier, or None if it is handled as a
        UniProt ID for which no gene name is available.
    """
    # There are some Ensembl protein IDs which we currently can't normalize
    # to anything else (unlike ENSG), so the ID doubles as the name.
    if db_id.startswith('ENSP'):
        return Agent(db_id, db_refs={'ENSEMBL': db_id})

    # All other entries are UniProt IDs
    gene_name = uniprot_client.get_gene_name(db_id)
    if not gene_name:
        return None
    refs = {'UP': db_id}
    hgnc_id = uniprot_client.get_hgnc_id(db_id)
    if hgnc_id:
        refs['HGNC'] = hgnc_id
    return Agent(gene_name, db_refs=refs)
예제 #6
0
 def _initialize_node_agents(self):
     """Fill the node name and node Agent lookups from the CX nodes.

     Nodes with a UniProt alias are named after the corresponding gene.
     All other nodes keep their own name, which is interpreted as a gene
     symbol; if it is not one, an Agent is only created for the node when
     grounding is not required.
     """
     nodes = _get_dict_from_list('nodes', self.cx)
     invalid_genes = []
     for node in nodes:
         node_id = node['@id']
         aliases = self.get_aliases(node)
         label = node['n']
         up_id = aliases.get('UP')

         # Case 1: the node comes with a UniProt ID among its aliases
         if up_id:
             refs = {'UP': up_id, 'TEXT': label}
             hgnc_id = uniprot_client.get_hgnc_id(up_id)
             if hgnc_id:
                 refs['HGNC'] = hgnc_id
                 std_name = hgnc_client.get_hgnc_name(hgnc_id)
             else:
                 std_name = uniprot_client.get_gene_name(up_id)
             node_agent = Agent(std_name, db_refs=refs)
             self._node_names[node_id] = std_name
             self._node_agents[node_id] = node_agent
             continue

         # Case 2: the node name is looked up as a gene symbol
         self._node_names[node_id] = label
         hgnc_id = hgnc_client.get_hgnc_id(label)
         refs = {'TEXT': label}
         if hgnc_id:
             refs['HGNC'] = hgnc_id
             # It's possible that a valid HGNC ID will not have a
             # Uniprot ID, as in the case of HOTAIR (HOX transcript
             # antisense RNA, HGNC:33510)
             up_id = hgnc_client.get_uniprot_id(hgnc_id)
             if up_id:
                 refs['UP'] = up_id
             self._node_agents[node_id] = Agent(label, db_refs=refs)
         else:
             # Ungrounded nodes only get an Agent if grounding is optional
             if not self.require_grounding:
                 self._node_agents[node_id] = Agent(label, db_refs=refs)
             invalid_genes.append(label)

     if invalid_genes:
         if self.require_grounding:
             verb = 'Skipped'
         else:
             verb = 'Included'
         symbols = ', '.join(invalid_genes)
         logger.info('%s invalid gene symbols: %s' % (verb, symbols))
예제 #7
0
    def _get_complex_agents(self, complex_id):
        """Return a list of Agents, one for each constituent of a complex.

        Parameters
        ----------
        complex_id : str
            The ID of the SIGNOR complex to look up.

        Returns
        -------
        list
            One Agent per component returned by the recursive complex
            lookup. Each Agent is named, in order of preference, by its
            standard name, its FamPlex ID or the component ID itself.
        """
        agents = []
        components = self._recursively_lookup_complex(complex_id)

        for c in components:
            db_refs = {}
            if c.startswith('CHEBI'):
                db_refs['CHEBI'] = c
                # Use the ID as the name if the ChEBI name lookup fails so
                # that the Agent always has a name
                name = chebi_client.get_chebi_name_from_id(c) or c
            else:
                name = uniprot_client.get_gene_name(c)
                if name is None:
                    # Not a known UniProt entry, keep it as a SIGNOR ID
                    db_refs['SIGNOR'] = c
                else:
                    db_refs['UP'] = c
                    hgnc_id = uniprot_client.get_hgnc_id(c)
                    if hgnc_id:
                        name = hgnc_client.get_hgnc_name(hgnc_id)
                        db_refs['HGNC'] = hgnc_id

                famplex_key = ('SIGNOR', c)
                if famplex_key in famplex_map:
                    db_refs['FPLX'] = famplex_map[famplex_key]
                    if not name:
                        # Set agent name to Famplex name if
                        # the Uniprot name is not available
                        name = db_refs['FPLX']
                elif not name:
                    # We neither have a Uniprot nor Famplex grounding
                    logger.info('Have neither a Uniprot nor Famplex grounding '
                                'for "%s" in complex %s' % (c, complex_id))
                    # Set the agent name to the Signor ID if neither the
                    # Uniprot nor Famplex names are available
                    name = c
            agents.append(Agent(name, db_refs=db_refs))
        return agents
예제 #8
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If no standard name
        can be determined, the given name is returned.
    db_refs : dict or None
        The grounding for the given entity, or None if the name could not
        be grounded in the given name space.

    """
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            # Keep the given name if the entry has no gene name, rather
            # than returning None as the name
            name = uniprot_client.get_gene_name(up_id) or name
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This label is explicitly replaced by another one before lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, MGI/RGD names are only grounded to a UniProt ID via the
    # mouse_lookup/rat_lookup tables, and the name is left unchanged
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns in ('SFAM', 'SCOMP'):
        db_refs = {ns: name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            kind = 'family' if ns == 'SFAM' else 'complex'
            logger.info('Could not find mapping for BEL/%s %s: '
                        '%s (%s)' % (ns, kind, name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            # Unmapped Entrez IDs get a name made up of 'E' plus the ID
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        # Keep the given ID as the name if no ChEBI name is available
        name = chebi_client.get_chebi_name_from_id(chebi_id) or name
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, db_refs
예제 #9
0
def fix_agent(agent):
    """Fix naming and grounding issues in an Agent, changes Agent in place.

    The db_refs name spaces are normalized first, then a FamPlex, HGNC or
    UniProt grounding (in this order of precedence) is used to set the
    standard name of the Agent.

    Parameters
    ----------
    agent : Agent or None
        The Agent to fix. Nothing is done if None is given.
    """
    if agent is None:
        return

    # First we fix some name spaces. These are simply moved to a new key.
    moved_name_spaces = {'FA': 'NXPFA', 'IPR': 'IP', 'PCID': 'PUBCHEM'}
    fixed_refs = copy(agent.db_refs)
    for ref_ns, ref_id in agent.db_refs.items():
        if ref_ns in moved_name_spaces:
            fixed_refs.pop(ref_ns, None)
            fixed_refs[moved_name_spaces[ref_ns]] = ref_id
        elif ref_ns == 'XFAM':
            # XFAM becomes PF, keeping only the part before the first dot
            fixed_refs.pop('XFAM', None)
            fixed_refs['PF'] = ref_id.split('.')[0]
        elif ref_ns == 'CHEBI':
            # Make sure CHEBI prefix is there
            if not ref_id.startswith('CHEBI:'):
                fixed_refs['CHEBI'] = 'CHEBI:%s' % ref_id
        elif ref_ns == 'GO':
            # Make sure GO prefix is there
            if not ref_id.startswith('GO:'):
                fixed_refs['GO'] = 'GO:' + ref_id
    agent.db_refs = fixed_refs
    refs = agent.db_refs

    # Check if we have a FPLX entry and handle old BE mappings
    if 'BE' in refs:
        refs['FPLX'] = refs.pop('BE')
    be_id = refs.get('FPLX')
    # Try to map to FPLX from NXP, IPR, PF, NCIT
    if not be_id:
        candidates = (famplex_map.get(ref_item) for ref_item in refs.items())
        be_id = next((mapped for mapped in candidates if mapped), None)
    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in refs:
        target = ncit_map.get(refs['NCIT'])
        if target:
            refs[target[0]] = target[1]
    # If the name is an UP ID, change it
    has_protein_ref = 'UP' in refs or 'FPLX' in refs
    if agent.name and not has_protein_ref and \
            uniprot_client.get_gene_name(agent.name, web_fallback=False):
        refs['UP'] = agent.name

    # Check what entries we have
    up_id = refs.get('UP')
    hgnc_id = refs.get('HGNC')

    # FPLX takes precedence if we have it
    if be_id:
        refs['FPLX'] = be_id
        agent.name = be_id
        return

    # Next comes HGNC, which also gives us a UniProt ID if one is missing
    if hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                # If multiple IDs are listed, we take the first one
                refs['UP'] = up_id.split(', ')[0]
        return

    # This is a special case that happens sometimes where db_refs['UP'] is
    # an empty string, and there is no other grounding. In this case, we
    # remove the empty UP grounding and reset the name to the agent text.
    if up_id == '':
        agent.name = refs.get('TEXT', agent.name)
        refs.pop('UP')
        return

    if not up_id:
        return

    # Finally, we only have a UniProt ID to work with
    hgnc_id = uniprot_client.get_hgnc_id(up_id)
    if hgnc_id:
        refs['HGNC'] = hgnc_id
        agent.name = hgnc_client.get_hgnc_name(hgnc_id)
        return
    gene_name = uniprot_client.get_gene_name(up_id, web_fallback=False)
    # If it doesn't have a gene name, it's better to just use the raw
    # string name, otherwise the Agent has Uniprot IDs or mnemonics as
    # the name
    agent.name = gene_name if gene_name else refs.get('TEXT', agent.name)
예제 #10
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                id_from_mnemonic = uniprot_client.get_id_from_mnemonic(db_id)
                if id_from_mnemonic:
                    db_id = id_from_mnemonic
                db_refs['UP'] = db_id
                hgnc_id = uniprot_client.get_hgnc_id(db_id)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
                    agent_name = hgnc_client.get_hgnc_name(hgnc_id)
                else:
                    gene_name = uniprot_client.get_gene_name(db_id)
                    if gene_name:
                        agent_name = gene_name
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert (agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent
예제 #11
0
def generate_uniprot_terms(download=False, organisms=None):
    """Return grounding Terms for the names and synonyms of UniProt entries.

    The table of names is read from ``up_synonyms.tsv`` in the resource
    directory. The table is downloaded from UniProt first if the file does
    not exist yet or if a download is explicitly requested.

    Parameters
    ----------
    download : bool
        If True, the resource file is downloaded again even if a local copy
        already exists. Default: False
    organisms : list of str or None
        The organisms to restrict the download to. If not given, the
        module-level ``popular_organisms`` are used.

    Returns
    -------
    list
        For each entry, one 'name' Term for the standard name and
        'synonym' Terms for the other protein names and gene synonyms.
        Terms are grounded to HGNC when the entry maps to an HGNC ID and
        to UniProt otherwise.

    Raises
    ------
    requests.HTTPError
        If the resource file has to be downloaded and the server responds
        with an error status.
    """
    if not organisms:
        organisms = popular_organisms
    path = os.path.join(resource_dir, 'up_synonyms.tsv')
    if not os.path.exists(path) or download:
        org_filter_str = ' OR '.join(organisms)
        url = (
            'https://www.uniprot.org/uniprot/?format=tab&columns=id,'
            'genes(PREFERRED),genes(ALTERNATIVE),protein%20names,'
            'organism-id&sort=score&'
            f'query=reviewed:yes&fil=organism:{org_filter_str}')
        logger.info('Downloading UniProt resource file')
        res = requests.get(url)
        # Fail loudly instead of caching an HTTP error page as the resource
        res.raise_for_status()
        with open(path, 'w') as fh:
            fh.write(res.text)
    terms = []
    for row in read_csv(path, delimiter='\t', header=True):
        up_id = row['Entry']
        organism = row['Organism ID']
        protein_names = parse_uniprot_synonyms(row['Protein names'])
        # A lone semi-colon stands for a missing value in the name columns
        primary_gene_name = row['Gene names  (primary )'].strip()
        if primary_gene_name == ';':
            primary_gene_name = None
        gene_synonyms_str = row['Gene names  (synonym )'].strip()
        if gene_synonyms_str == ';':
            gene_synonyms_str = None
        # We generally use the gene name as the standard name
        # except when there are multiple gene names (separated by
        # semi-colons) in which case we take the first protein name.
        if primary_gene_name and ';' not in primary_gene_name:
            standard_name = primary_gene_name
        elif protein_names:
            standard_name = protein_names[0]
        else:
            # Neither a usable gene name nor any protein name is available
            standard_name = None
        # We skip a small number of not critical entries that don't have
        # standard names
        if not standard_name:
            continue
        # Ground to UniProt by default and switch to HGNC if a mapping exists
        ns = 'UP'
        db_id = up_id
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            ns = 'HGNC'
            db_id = hgnc_id
            standard_name = hgnc_client.get_hgnc_name(hgnc_id)
        for name in protein_names:
            # Skip names that are EC codes
            if name.startswith('EC '):
                continue
            # The standard name gets its own Term below
            if name == standard_name:
                continue
            terms.append(Term(normalize(name), name, ns, db_id,
                              standard_name, 'synonym', 'uniprot', organism))
        terms.append(Term(normalize(standard_name), standard_name, ns, db_id,
                          standard_name, 'name', 'uniprot', organism))
        if gene_synonyms_str:
            # This is to deal with all the variations in which
            # synonyms are listed, including degenerate strings
            # like "; ;"
            for synonym_group in gene_synonyms_str.split('; '):
                for synonym in synonym_group.split(' '):
                    if not synonym or synonym == ';':
                        continue
                    terms.append(Term(normalize(synonym), synonym, ns,
                                      db_id, standard_name, 'synonym',
                                      'uniprot', organism))

    return terms
예제 #12
0
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        A dict with the 'entityText', 'entityId', 'charStart' and
        'charEnd' entries. 'entityId' is None or a list of dicts with
        'source' and 'idString' entries describing the groundings.

    Returns
    -------
    agent : Agent or None
        The Agent for the entity, or None if the entity has more than one
        Entrez or more than one UniProt entry.
    raw_coords : tuple or None
        The (charStart, charEnd) pair of the entity, or None if no Agent
        is returned.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}
    entries = entity_info['entityId']
    if entries is None:
        entries = []
    # Ambiguous gene/protein groundings are not resolved, just skipped
    ref_counts = Counter(entry['source'] for entry in entries)
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entries:
        source = id_dict['source']
        if source == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif source == 'UniProt':
            refs['UP'] = id_dict['idString']
            hgnc_id = uniprot_client.get_hgnc_id(id_dict['idString'])
            if hgnc_id:
                # Check to see if we have a conflict with an HGNC id
                # found from the Entrez id. If so, overwrite with this
                # one, in which we have greater faith.
                if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                    # Here hgnc_id is the one inferred from UniProt while
                    # refs['HGNC'] still holds the one from Entrez
                    msg = ('Inferred HGNC:%s from UP:%s does not'
                           ' match HGNC:%s from EGID:%s') % \
                          (hgnc_id, refs['UP'], refs['HGNC'],
                           refs.get('EGID'))
                    logger.info(msg)
                refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(id_dict['idString'])
                if gene_name is not None:
                    name = gene_name
        elif source in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif source == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif source in ('MESH', 'OMIM', 'CTD'):
            refs[source] = id_dict['idString']
        # Handle mutations
        elif source == 'Unk' and \
                id_dict.get('entityType') == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                # Mutation parsing is best effort: anything that cannot be
                # processed is logged and skipped
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords