Exemplo n.º 1
0
def update_kinases():
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = [
        'PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2', 'PDK3', 'PDK4',
        'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE'
    ]
    df = pandas.read_csv(fname, sep='\t')
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        df = df.append(
            {
                'Entry': up_id,
                'Gene names  (primary )': kinase,
                'Organism ID': '9606',
                'Entry name': up_mnemonic
            },
            ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
Exemplo n.º 2
0
def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a csv file

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, db_id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
            rows.append(row)

    write_unicode_csv(filename,
                      rows,
                      delimiter=',',
                      quotechar='"',
                      quoting=csv.QUOTE_MINIMAL,
                      lineterminator='\r\n')
Exemplo n.º 3
0
def protein_map_from_twg(twg):
    """Build  map of entity texts to validate protein grounding.

    Looks at the grounding of the entity texts extracted from the statements
    and finds proteins where there is grounding to a human protein that maps to
    an HGNC name that is an exact match to the entity text. Returns a dict that
    can be used to update/expand the grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are for agent texts
        where the grounding map was able to find human protein grounded to
        this agent_text in Uniprot.
    """

    protein_map = {}
    unmatched = 0
    matched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # If 'UP' (Uniprot) not one of the grounding entries for this text,
        # then we skip it.
        if 'UP' not in [entry[0] for entry in grounding_list]:
            continue
        # Otherwise, collect all the Uniprot IDs for this protein.
        uniprot_ids = [
            entry[1] for entry in grounding_list if entry[0] == 'UP'
        ]
        # For each Uniprot ID, look up the species
        for uniprot_id in uniprot_ids:
            # If it's not a human protein, skip it
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            # Otherwise, look up the gene name in HGNC and match against the
            # agent text
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is None:
                unmatched += 1
                continue
            if agent_text.upper() == gene_name.upper():
                matched += 1
                protein_map[agent_text] = {
                    'TEXT': agent_text,
                    'UP': uniprot_id
                }
            else:
                unmatched += 1
    logger.info('Exact matches for %d proteins' % matched)
    logger.info('No match (or no gene name) for %d proteins' % unmatched)
    return protein_map
Exemplo n.º 4
0
def protein_map_from_twg(twg):
    """Build  map of entity texts to validate protein grounding.

    Looks at the grounding of the entity texts extracted from the statements
    and finds proteins where there is grounding to a human protein that maps to
    an HGNC name that is an exact match to the entity text. Returns a dict that
    can be used to update/expand the grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are for agent texts
        where the grounding map was able to find human protein grounded to
        this agent_text in Uniprot.
    """

    protein_map = {}
    unmatched = 0
    matched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # If 'UP' (Uniprot) not one of the grounding entries for this text,
        # then we skip it.
        if 'UP' not in [entry[0] for entry in grounding_list]:
            continue
        # Otherwise, collect all the Uniprot IDs for this protein.
        uniprot_ids = [entry[1] for entry in grounding_list
                       if entry[0] == 'UP']
        # For each Uniprot ID, look up the species
        for uniprot_id in uniprot_ids:
            # If it's not a human protein, skip it
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            # Otherwise, look up the gene name in HGNC and match against the
            # agent text
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is None:
                unmatched += 1
                continue
            if agent_text.upper() == gene_name.upper():
                matched += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}
            else:
                unmatched += 1
    logger.info('Exact matches for %d proteins' % matched)
    logger.info('No match (or no gene name) for %d proteins' % unmatched)
    return protein_map
Exemplo n.º 5
0
def save_base_map(filename, grouped_by_text):
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(id)
            else:
                name = ''
            row = [text_string, db, id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Exemplo n.º 6
0
def save_base_map(filename, grouped_by_text):
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(id)
            else:
                name = ''
            row = [text_string, db, id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Exemplo n.º 7
0
def sanitize_up_ids(up_ids):
    # First, we map any secondary IDs to primary IDs
    up_ids = {uniprot_client.get_primary_id(up_id) for up_id in up_ids}
    # We filter out IDs that are actually mnemonics, these are just mixed
    # in without any differentiation from other IDs
    up_ids = {up_id for up_id in up_ids if '_' not in up_id}
    # TODO: should we do anything about isoforms?
    # We separate out specific sets of IDs
    human_ids = [up_id for up_id in up_ids if uniprot_client.is_human(up_id)]
    reviewed_non_human_ids = [
        up_id for up_id in up_ids if not uniprot_client.is_human(up_id)
        # get_mnemonic is just a quick way to see if we have this entry
        and uniprot_client.get_mnemonic(up_id, web_fallback=False)
    ]
    if human_ids:
        return human_ids
    elif reviewed_non_human_ids:
        return reviewed_non_human_ids
    else:
        return []
Exemplo n.º 8
0
 def get_grounding(self):
     be = self.db_refs.get('FPLX')
     if be:
         return ('FPLX', be)
     hgnc = self.db_refs.get('HGNC')
     if hgnc:
         if isinstance(hgnc, list):
             hgnc = hgnc[0]
         return ('HGNC', hgc.get_hgnc_name(str(hgnc)))
     up = self.db_refs.get('UP')
     if up:
         if isinstance(up, list):
             up = up[0]
         up_mnemonic = upc.get_mnemonic(up)
         if up_mnemonic and up_mnemonic.endswith('HUMAN'):
             gene_name = upc.get_gene_name(up, web_fallback=False)
             if gene_name:
                 return ('HGNC', gene_name)
         else:
             return ('UP', up)
     return (None, None)
Exemplo n.º 9
0
def update_kinases():
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        df = df.append({'Entry': up_id, 'Gene names  (primary )': kinase,
                        'Organism ID': '9606', 'Entry name': up_mnemonic},
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
Exemplo n.º 10
0
def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a csv file

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, db_id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Exemplo n.º 11
0
def test_get_mnemonic():
    mnemonic = uniprot_client.get_mnemonic('Q02750')
    assert mnemonic == 'MP2K1_HUMAN'
    assert unicode_strs(mnemonic)
Exemplo n.º 12
0
def get_participant(agent):
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')
    # The Agent is not missing
    text_name = agent.db_refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {}
    participant['entity_text'] = [text_name]
    hgnc_id = agent.db_refs.get('HGNC')
    uniprot_id = agent.db_refs.get('UP')
    chebi_id = agent.db_refs.get('CHEBI')
    pfam_def_ids = agent.db_refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        uniprot_mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        pfam_def_list = []
        for p in pfam_def_ids.split('|'):
            dbname, dbid = p.split(':')
            pfam_def_list.append({dbname: dbid})
        for pdi in pfam_def_list:
            # TODO: handle non-uniprot protein IDs here
            uniprot_id = pdi.get('UP')
            if uniprot_id:
                entity_dict = {}
                uniprot_mnemonic = \
                    str(uniprot_client.get_mnemonic(uniprot_id))
                gene_name = uniprot_client.get_gene_name(uniprot_id)
                if gene_name is None:
                    gene_name = ""
                entity_dict['entity_text'] = [gene_name]
                entity_dict['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
                entity_dict['entity_type'] = 'protein'
                participant['entities'].append(entity_dict)
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        feature = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': ''
                }
            }
        if bc.is_bound:
            features.append(feature)
        else:
            not_features.append(feature)
    # Modification features
    for mc in agent.mods:
        feature = {
            'feature_type': 'modification_feature',
            'modification_type': mc.mod_type.lower(),
            }
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        if mc.residue is not None:
            feature['aa_code'] = mc.residue
        if mc.is_modified:
            features.append(feature)
        else:
            not_features.append(feature)
    # Mutation features
    for mc in agent.mutations:
        feature = {}
        feature['feature_type'] = 'mutation_feature'
        if mc.residue_from is not None:
            feature['from_aa'] = mc.residue_from
        if mc.residue_to is not None:
            feature['to_aa'] = mc.residue_to
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        features.append(feature)
    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
Exemplo n.º 13
0
def add_source_urls(stmts):
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.source_api == 'hprd':
                if ev.source_id and ev.source_id.startswith('http'):
                    ev.annotations['source_url'] = ev.source_id
            elif ev.source_api == 'signor':
                # Not clear how to use the source_id like SIGNOR-252627 to
                # link directly to the reaction
                up_id = stmt.real_agent_list()[0].db_refs.get('UP')
                if up_id:
                    ev.annotations['source_url'] = \
                        ('https://signor.uniroma2.it/relation_result.php?'
                         'id=%s' % up_id)
            elif ev.source_api == 'ctd':
                agent = stmt.real_agent_list()[0]
                egid = agent.db_refs.get('EGID')
                meshid = agent.db_refs.get('MESH')
                if egid:
                    ev.annotations['source_url'] = \
                        'http://ctdbase.org/detail.go?type=gene&acc=%s' % egid
                elif meshid:
                    ev.annotations['source_url'] = \
                        'http://ctdbase.org/detail.go?type=chem&acc=%s' % meshid
            elif ev.source_api == 'biogrid':
                ev.annotations['source_url'] = \
                    'https://thebiogrid.org/interaction/%s' % ev.source_id
            elif ev.source_api == 'phosphoelm':
                ev.annotations['source_url'] = \
                    'http://phospho.elm.eu.org/byKinase/%s.html' % \
                    ev.annotations['phosphoelm_kinase_name']
            elif ev.source_api == 'virhostnet':
                agent = stmt.real_agent_list()[0]
                upid = agent.db_refs.get('UP')
                if upid:
                    mnemonic = \
                        uniprot_client.get_mnemonic(upid, web_fallback=False)
                    if mnemonic:
                        ev.annotations['source_url'] = \
                            ('https://virhostnet.prabi.fr/pathostscape3.html'
                             '?protein=%s' % mnemonic)
            elif ev.source_api == 'drugbank':
                agent = stmt.real_agent_list()[0]
                dbid = agent.db_refs.get('DRUGBANK')
                if dbid:
                    ev.annotations['source_url'] = \
                        'https://go.drugbank.com/drugs/%s' % dbid
            elif ev.source_api == 'trrust':
                target = stmt.obj
                ev.annotations['source_url'] = \
                    ('https://www.grnpedia.org/trrust/result_tonly.php?gene=%s'
                     '&species=human') % target.name
            elif ev.source_api == 'biopax':
                if not ev.source_id:
                    continue
                elif 'phosphosite' in ev.source_id:
                    ev.annotations[
                        'source_url'] = 'https://www.phosphosite.org/'
                else:
                    ev.annotations['source_url'] = ev.source_id
            elif ev.source_api == 'tas':
                lspcid = stmt.subj.db_refs.get('LSPCI')
                if lspcid:
                    url = (
                        'https://labsyspharm.shinyapps.io/smallmoleculesuite/'
                        '?_inputs_&binding-table-selectivity_nav=%%22tas%%22&'
                        'binding-query-select_compound=%%22%s-1%%22&'
                        'tab=%%22binding%%22') % lspcid
                    ev.annotations['source_url'] = url
    return stmts
Exemplo n.º 14
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict
        The grounding for the given entity.

    """
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, db_refs
Exemplo n.º 15
0
def test_get_mnemonic():
    mnemonic = uniprot_client.get_mnemonic('Q02750')
    assert mnemonic == 'MP2K1_HUMAN'
    assert unicode_strs(mnemonic)
Exemplo n.º 16
0
def get_participant(agent):
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')
    # The Agent is not missing
    text_name = agent.db_refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {}
    participant['entity_text'] = [text_name]
    hgnc_id = agent.db_refs.get('HGNC')
    uniprot_id = agent.db_refs.get('UP')
    chebi_id = agent.db_refs.get('CHEBI')
    pfam_def_ids = agent.db_refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        uniprot_mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        pfam_def_list = []
        for p in pfam_def_ids.split('|'):
            dbname, dbid = p.split(':')
            pfam_def_list.append({dbname: dbid})
        for pdi in pfam_def_list:
            # TODO: handle non-uniprot protein IDs here
            uniprot_id = pdi.get('UP')
            if uniprot_id:
                entity_dict = {}
                uniprot_mnemonic = \
                    str(uniprot_client.get_mnemonic(uniprot_id))
                gene_name = uniprot_client.get_gene_name(uniprot_id)
                if gene_name is None:
                    gene_name = ""
                entity_dict['entity_text'] = [gene_name]
                entity_dict['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
                entity_dict['entity_type'] = 'protein'
                participant['entities'].append(entity_dict)
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        feature = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': ''
            }
        }
        if bc.is_bound:
            features.append(feature)
        else:
            not_features.append(feature)
    # Modification features
    for mc in agent.mods:
        feature = {
            'feature_type': 'modification_feature',
            'modification_type': mc.mod_type.lower(),
        }
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        if mc.residue is not None:
            feature['aa_code'] = mc.residue
        if mc.is_modified:
            features.append(feature)
        else:
            not_features.append(feature)
    # Mutation features
    for mc in agent.mutations:
        feature = {}
        feature['feature_type'] = 'mutation_feature'
        if mc.residue_from is not None:
            feature['from_aa'] = mc.residue_from
        if mc.residue_to is not None:
            feature['to_aa'] = mc.residue_to
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        features.append(feature)
    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
Exemplo n.º 17
0
def test_get_mnemonic():
    mnemonic = uniprot_client.get_mnemonic('Q02750')
    assert(mnemonic == 'MP2K1_HUMAN')