示例#1
0
def get_specific_chebi_id(chebi_ids, name):
    """Choose a single ChEBI ID for an entity out of several candidates.

    The choice is made in stages: a manual override looked up by the entity
    name, then a case-insensitive match between the entity name and the
    names of the candidate IDs, and finally whatever
    ``chebi_client.get_specific_id`` returns for the remaining candidates.

    NOTE: this function is mainly factored out to be able to use caching,
    it requires a frozenset as input to work.

    Parameters
    ----------
    chebi_ids : frozenset
        The candidate ChEBI IDs.
    name : str
        The name of the entity being grounded.

    Returns
    -------
    The manual override for the name if one exists, otherwise the ID whose
    name matches the entity name, otherwise the result of
    ``chebi_client.get_specific_id`` on the non-generic primary IDs.
    """
    # A manual override for this name wins over everything else
    override = manual_chebi_map.get(name)
    if override:
        return override

    # Map each candidate to its primary ID, which eliminates secondary IDs
    resolved_ids = {chebi_client.get_primary_id(cid) for cid in chebi_ids}
    # Occasionally, invalid ChEBI IDs are given that don't have corresponding
    # primary IDs; these come back as None and are filtered out
    resolved_ids = {pid for pid in resolved_ids if pid is not None}
    # Generic IDs are never useful for grounding, so they are removed
    candidates = resolved_ids - generic_chebi_ids

    # All candidate names are looked up first and paired with their IDs, then
    # the pairs are scanned for a name that matches the entity name
    named_candidates = [
        (chebi_client.get_chebi_name_from_id(cid), cid) for cid in candidates
    ]
    for candidate_name, candidate_id in named_candidates:
        if not candidate_name:
            continue
        if name.lower() == candidate_name.lower():
            return candidate_id

    # No name matched, so fall back to distilling the IDs down to the most
    # specific one based on the hierarchy
    return chebi_client.get_specific_id(candidates)
示例#2
0
def fix_id_standards(db_ns, db_id):
    """Normalize an identifier for the CHEBI and HGNC namespaces.

    CHEBI identifiers get a ``CHEBI:`` prefix if they lack one and are then
    passed through ``chebi_client.get_primary_id``. HGNC identifiers have a
    leading ``HGNC:`` prefix removed. Identifiers in any other namespace are
    returned unchanged.

    Parameters
    ----------
    db_ns : str
        The namespace of the identifier.
    db_id : str
        The identifier to normalize.

    Returns
    -------
    tuple
        The unchanged namespace and the normalized identifier.
    """
    hgnc_prefix = 'HGNC:'

    if db_ns == 'CHEBI':
        prefixed_id = db_id if db_id.startswith('CHEBI:') else f'CHEBI:{db_id}'
        return db_ns, chebi_client.get_primary_id(prefixed_id)

    if db_ns == 'HGNC' and db_id.startswith(hgnc_prefix):
        return db_ns, db_id[len(hgnc_prefix):]

    return db_ns, db_id
示例#3
0
def generate_chebi_terms():
    """Build the list of terms for ChEBI standard names and synonyms.

    Standard names are read from ``chebi_entries.tsv`` in the INDRA
    resources folder. Synonyms are read from ``names_3star.tsv`` in the same
    folder if it exists; otherwise the compressed file is loaded directly
    from the EBI FTP server with pandas.

    Returns
    -------
    list of Term
        One term per standard name followed by one term per distinct
        synonym entry.
    """
    fname = os.path.join(indra_resources, 'chebi_entries.tsv')
    logger.info('Loading %s', fname)
    terms = []
    for row in read_csv(fname, header=True, delimiter='\t'):
        name = row['NAME']
        term_id = 'CHEBI:' + row['CHEBI_ID']
        terms.append(Term(normalize(name), name, 'CHEBI', term_id, name,
                          'name', 'chebi'))
    logger.info('Loaded %d terms', len(terms))

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    fname = os.path.join(indra_resources, 'names_3star.tsv')
    if not os.path.exists(fname):
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.',
                    chebi_url)
        # Read every column as a plain string and disable pandas' default
        # NA detection. Otherwise literal names such as "NA" or "null" are
        # parsed as NaN and end up as the string 'nan' below, which differs
        # from what the local-file branch yields for the same rows.
        df = pd.read_csv(chebi_url, sep='\t', dtype=str,
                         keep_default_na=False)
        rows = (row for _, row in df.iterrows())
    else:
        rows = read_csv(fname, header=True, delimiter='\t')

    # Tracks the argument tuples already turned into terms so that repeated
    # synonym rows produce only one term
    added = set()
    for row in rows:
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s',
                        row['COMPOUND_ID'])
            continue
        term_id = 'CHEBI:%s' % chebi_id
        name = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s', chebi_id)
            continue

        term_args = (normalize(name), name, 'CHEBI', term_id, chebi_name,
                     'synonym', 'chebi')
        if term_args in added:
            continue
        terms.append(Term(*term_args))
        added.add(term_args)
    logger.info('Loaded %d terms', len(terms))
    return terms
示例#4
0
def generate_chebi_terms():
    """Build the list of terms for ChEBI standard names and synonyms.

    Standard names come from ``_generate_obo_terms``. Synonyms are read from
    ``names_3star.tsv`` in the INDRA resources folder if it exists;
    otherwise the compressed file is loaded directly from the EBI FTP server
    with pandas.

    Returns
    -------
    list of Term
        The terms for standard names followed by one term per distinct
        synonym entry that was not skipped.
    """
    # We can get standard names directly from the OBO
    terms = _generate_obo_terms('chebi', ignore_mappings=True, map_to_ns={})

    # Now we add synonyms
    # NOTE: this file is not in version control. The file is available
    # at ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_
    # tab_delimited/names_3star.tsv.gz, it needs to be decompressed
    # into the INDRA resources folder.
    fname = os.path.join(indra_resources, 'names_3star.tsv')
    if not os.path.exists(fname):
        import pandas as pd
        chebi_url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' \
                    'Flat_file_tab_delimited/names_3star.tsv.gz'
        logger.info('Loading %s into memory. You can download and decompress'
                    ' it in the indra/resources folder for faster access.',
                    chebi_url)
        # Read every column as a plain string and disable pandas' default
        # NA detection. Otherwise literal names such as "NA" or "null" are
        # parsed as NaN and end up as the string 'nan' below, which differs
        # from what the local-file branch yields for the same rows.
        df = pd.read_csv(chebi_url, sep='\t', dtype=str,
                         keep_default_na=False)
        rows = (row for _, row in df.iterrows())
    else:
        rows = read_csv(fname, header=True, delimiter='\t')

    # Tracks the argument tuples already turned into terms so that repeated
    # synonym rows produce only one term
    added = set()
    for row in rows:
        chebi_id = chebi_client.get_primary_id(str(row['COMPOUND_ID']))
        if not chebi_id:
            logger.info('Could not get valid CHEBI ID for %s',
                        row['COMPOUND_ID'])
            continue
        name = str(row['NAME'])
        chebi_name = \
            chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
        if chebi_name is None:
            logger.info('Could not get valid name for %s', chebi_id)
            continue
        # We skip entries of the form Glu-Lys with synonyms like EK since
        # these are highly ambiguous with other acronyms, and are unlikely
        # to be used in practice.
        if is_aa_sequence(chebi_name) and re.match(r'(^[A-Z-]+$)', name):
            continue

        term_args = (normalize(name), name, 'CHEBI', chebi_id, chebi_name,
                     'synonym', 'chebi')
        if term_args in added:
            continue
        terms.append(Term(*term_args))
        added.add(term_args)
    logger.info('Loaded %d terms', len(terms))
    return terms
示例#5
0
def sanitize_chebi_ids(chebi_ids, name):
    """Reduce a collection of ChEBI IDs to at most one primary ID.

    Each ID is given a ``CHEBI:`` prefix if it lacks one and is mapped to
    its primary ID. If more than one distinct primary ID remains,
    ``get_specific_chebi_id`` is used to pick one of them.

    Parameters
    ----------
    chebi_ids : iterable of str
        The ChEBI IDs to sanitize, with or without the ``CHEBI:`` prefix.
    name : str
        The name of the entity, passed on to ``get_specific_chebi_id``.

    Returns
    -------
    list
        An empty list if no valid ID remains, otherwise a list with exactly
        one ID. The result is a list in every case so that callers can
        handle it uniformly.
    """
    chebi_ids = {chebi_id if chebi_id.startswith('CHEBI:')
                 else 'CHEBI:%s' % chebi_id for chebi_id in chebi_ids}
    chebi_ids = {chebi_client.get_primary_id(chebi_id)
                 for chebi_id in chebi_ids}
    # Make sure we eliminate Nones here which can appear as a result
    # of failed primary ID lookups
    chebi_ids = {ci for ci in chebi_ids if ci is not None}
    # With zero or one ID left there is nothing to disambiguate
    if len(chebi_ids) <= 1:
        return list(chebi_ids)
    specific_chebi_id = get_specific_chebi_id(frozenset(chebi_ids), name)
    # NOTE(review): assumes get_specific_chebi_id may yield None when it
    # cannot settle on an ID; in that case report "no ID" as an empty list
    # rather than a list holding None -- confirm against the helper.
    if specific_chebi_id is None:
        return []
    # Wrap the single selected ID so the return type matches the other
    # branches instead of handing back a bare value
    return [specific_chebi_id]
示例#6
0
 def _get_primary_id_wrapper(chebi_id):
     """Return the result of get_primary_id with its first six characters cut.

     NOTE(review): the six characters are presumably the 'CHEBI:' prefix of
     the primary ID -- confirm against get_primary_id.
     """
     primary_id = get_primary_id(chebi_id)
     return primary_id[6:]
示例#7
0
def test_chebi_to_primary():
    """Check that get_primary_id returns the expected primary ChEBI IDs."""
    # Pairs of (input ID, expected primary ID); the first is expected to map
    # to a different ID, the second to itself
    expected_primary_ids = (
        ('CHEBI:6281', 'CHEBI:17490'),
        ('CHEBI:161680', 'CHEBI:161680'),
    )
    for chebi_id, primary_id in expected_primary_ids:
        assert chebi_client.get_primary_id(chebi_id) == primary_id