Пример #1
0
def get_std_chemical(raw_string: str, db_id: str) -> List[Agent]:
    """Build standardized Agents for a chemical name / identifier pair.

    Parameters
    ----------
    raw_string :
        Text name of the chemical; may be missing (NaN/None).
    db_id :
        One or more database identifiers joined by ``|``; may be missing
        (NaN/None).

    Returns
    -------
    :
        A list of Agents. The list is empty if both inputs are missing,
        holds a single name-only Agent if only the identifier is missing,
        and otherwise holds one standardized Agent per usable identifier.

    Raises
    ------
    ValueError
        If an identifier matches none of the known identifier patterns.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)
    # Nothing to build an Agent from
    if not has_id and not has_text:
        return []
    # The raw text, when available, is kept as the TEXT reference
    base_refs = {'TEXT': raw_string} if has_text else {}
    # Only a name is available: return a name-only Agent
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    agents = []
    for identifier in db_id.split('|'):
        refs = deepcopy(base_refs)
        # Without a raw name, the identifier itself serves as the name
        name = raw_string if has_text else identifier
        if cheby_pattern.match(identifier):
            refs['CHEBI'] = identifier
        else:
            if mesh_pattern.match(identifier):
                # Drop the 5-character prefix (presumably 'MESH:')
                mesh_id = identifier[5:]
            elif mesh_no_prefix_pattern.match(identifier):
                mesh_id = identifier
            else:
                raise ValueError('Unexpected chemical identifier: %s' %
                                 identifier)
            # MeSH IDs without a name in the offline resources are
            # treated as non-existent and skipped
            if not mesh_client.get_mesh_name(mesh_id, offline=True):
                continue
            refs['MESH'] = mesh_id
        agents.append(get_standard_agent(name, refs))
    return agents
Пример #2
0
def update_mesh_mappings():
    """Update MeSH mappings to other databases.

    The current mappings table is read from the Gilda repository, extended
    with MeSH cross-references found in the local EFO, HP and DOID JSON
    resource files, sorted by MeSH ID and written to ``mesh_mappings.tsv``
    in the resource directory.
    """
    from indra.databases import mesh_client
    url = ('https://raw.githubusercontent.com/indralab/gilda/master/gilda/'
           'resources/mesh_mappings.tsv')
    df = pandas.read_csv(url, delimiter='\t', dtype=str, header=None)
    # Columns 3 and 4 hold the mapped namespace and ID. Collecting the pairs
    # once gives a constant-time membership test instead of filtering the
    # whole table for every ontology entry.
    already_mapped = set(zip(df[3], df[4]))
    namespaces = ['efo', 'hp', 'doid']
    xref_mappings = []
    for ns in namespaces:
        db = ns.upper()
        filename = os.path.join(path, '%s.json' % ns)
        with open(filename) as file:
            entries = json.load(file)
        for entry in entries:
            db_id, name = entry['id'], entry['name']
            # Entries that already have a mapping in the table are kept as is
            if (db, db_id) in already_mapped:
                continue
            xref_dict = {xr['namespace']: xr['id'] for xr in entry['xrefs']}
            # The MeSH xref can be listed under either namespace label
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            # Only IDs starting with 'D' are mapped; a missing or empty
            # xref is skipped rather than crashing on .startswith
            if not mesh_id or not mesh_id.startswith('D'):
                continue
            mesh_name = mesh_client.get_mesh_name(mesh_id)
            if not mesh_name:
                continue
            xref_mappings.append(
                ('MESH', mesh_id, mesh_name, db, db_id, name))
    df_extend = pandas.DataFrame(xref_mappings, columns=None, dtype=str)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    df = pandas.concat([df, df_extend])
    df.sort_values(1, inplace=True)  # sort by MeSH ID
    fname = os.path.join(path, 'mesh_mappings.tsv')
    df.to_csv(fname, sep='\t', index=None, header=None)
Пример #3
0
def make_search_terms(search_strings, mesh_ids):
    """Build EMMAA SearchTerms from free-text strings and MeSH IDs.

    Parameters
    ----------
    search_strings : list of str
        Free-text queries, e.g., "diabetes", used to find papers in the
        literature.
    mesh_ids : list of str
        MeSH IDs used to search the literature via the headings annotated
        on papers.

    Returns
    -------
    list of SearchTerm
        One SearchTerm of type 'other' per search string, followed by one
        SearchTerm of type 'mesh' per MeSH ID.
    """
    # Free-text terms use the string itself as both name and query
    terms = [
        SearchTerm(type='other', name=text, db_refs={}, search_term=text)
        for text in search_strings
    ]
    for mesh_id in mesh_ids:
        mesh_name = mesh_client.get_mesh_name(mesh_id)
        # IDs starting with 'D' are queried with the [mh] tag, all others
        # with the [nm] tag
        if mesh_id.startswith('D'):
            suffix = 'mh'
        else:
            suffix = 'nm'
        terms.append(SearchTerm(type='mesh', name=mesh_name,
                                db_refs={'MESH': mesh_id},
                                search_term=f'{mesh_name} [{suffix}]'))
    return terms
Пример #4
0
def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
    """Return PMIDs that are annotated with a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, the search uses the [majr] tag instead of [mh], so only
        papers with the term annotated as a major topic are requested.
        Default: False
    **kwargs
        Any further search arguments that are passed on to get_ids.

    Returns
    -------
    list
        The IDs returned by get_ids, or an empty list if no name could be
        found for the MeSH ID.
    """
    from indra.databases import mesh_client
    mesh_name = mesh_client.get_mesh_name(mesh_id)
    if not mesh_name:
        logger.error('Could not get MeSH name for ID %s' % mesh_id)
        return []
    if major_topic:
        suffix = 'majr'
    else:
        suffix = 'mh'
    ids = get_ids('%s [%s]' % (mesh_name, suffix), use_text_word=False,
                  **kwargs)
    # The extra [nm] query is only run for IDs starting with 'C' and only
    # when the search is not restricted to major topics
    if not mesh_id.startswith('C') or major_topic:
        return ids
    supplementary_ids = get_ids('%s [nm]' % mesh_name, use_text_word=False,
                                **kwargs)
    return list(set(ids) | set(supplementary_ids))
Пример #5
0
    def standardize_agent_name(agent, standardize_refs=True):
        """Standardize the name of an Agent based on grounding information.

        The grounding returned by the Agent's get_grounding method decides
        which name is used. For FPLX the FamPlex ID itself becomes the name.
        For HGNC, UP, CHEBI, MESH and GO the name is looked up through the
        corresponding client and is only assigned if the lookup returns
        one. For any other namespace, or if there is no grounding, the name
        is not changed.

        Parameters
        ----------
        agent : indra.statements.Agent
            An INDRA Agent whose name attribute should be standardized based
            on grounding information. If None, nothing is done.
        standardize_refs : Optional[bool]
            If True, the Agent's db_refs are first replaced by the result of
            GroundingMapper.standardize_db_refs.
            Default: True
        """
        # We return immediately for None Agents
        if agent is None:
            return

        if standardize_refs:
            agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

        # We next look for prioritized grounding, if missing, we return
        db_ns, db_id = agent.get_grounding()
        if not db_ns or not db_id:
            return

        # If there's a FamPlex ID, prefer that for the name
        if db_ns == 'FPLX':
            agent.name = agent.db_refs['FPLX']
        elif db_ns == 'HGNC':
            # The lookup can fail; in that case the current name is kept
            # instead of being overwritten with None, as in the other
            # lookup branches below
            hgnc_name = hgnc_client.get_hgnc_name(db_id)
            if hgnc_name:
                agent.name = hgnc_name
        elif db_ns == 'UP':
            # Try for the gene name
            gene_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                     web_fallback=False)
            if gene_name:
                agent.name = gene_name
        elif db_ns == 'CHEBI':
            chebi_name = \
                chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
            if chebi_name:
                agent.name = chebi_name
        elif db_ns == 'MESH':
            mesh_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
            if mesh_name:
                agent.name = mesh_name
        elif db_ns == 'GO':
            go_name = go_client.get_go_label(agent.db_refs['GO'])
            if go_name:
                agent.name = go_name
        return
Пример #6
0
def generate_adeft_terms():
    """Return synonym Terms for every grounded Adeft shortform.

    Each available shortform's disambiguator is loaded and each of its
    groundings of the form ``NAMESPACE:ID`` is resolved to a standard name.
    Groundings in unknown namespaces are logged and skipped. The resulting
    Terms are de-duplicated and sorted by normalized shortform.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator

    # Name lookup per supported namespace; FPLX IDs serve as their own names
    name_lookups = {
        'HGNC': hgnc_client.get_hgnc_name,
        'GO': go_client.get_go_label,
        'MESH': mesh_client.get_mesh_name,
        'CHEBI': chebi_client.get_chebi_name_from_id,
        'FPLX': lambda fplx_id: fplx_id,
        'UP': uniprot_client.get_gene_name,
    }
    unique_args = set()
    for shortform in available_shortforms:
        disambiguator = load_disambiguator(shortform)
        for grounding in disambiguator.names:
            # Labels without a colon (including 'ungrounded') carry no
            # namespace and are skipped
            db_ns, separator, db_id = grounding.partition(':')
            if not separator:
                continue
            lookup = name_lookups.get(db_ns)
            if lookup is None:
                logger.warning('Unknown grounding namespace from Adeft: %s' %
                               db_ns)
                continue
            unique_args.add((normalize(shortform), shortform, db_ns, db_id,
                             lookup(db_id), 'synonym', 'adeft'))
    return [Term(*args)
            for args in sorted(unique_args, key=lambda x: x[0])]
Пример #7
0
def proc_mesh(mesh_list, range=100):
    """Return name, ID and count for the most frequent MeSH IDs.

    Parameters
    ----------
    mesh_list : iterable
        MeSH IDs, with repeats, whose occurrences are counted.
    range : int
        Number of most frequent IDs to look up. Default: 100

    Returns
    -------
    list of tuple
        (mesh_name, mesh_id, count) tuples ordered by decreasing count.
    """
    counts = Counter(mesh_list)
    # Stable sort: IDs with equal counts keep their first-seen order
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    print("Retrieving MESH names")
    return [(mesh_client.get_mesh_name(mesh_id), mesh_id, count)
            for mesh_id, count in ranked[:range]]
Пример #8
0
def generate_famplex_terms(ignore_mappings=False):
    """Return assertion Terms built from the FamPlex grounding map.

    Each row of the grounding map CSV holds a text followed by alternating
    namespace/ID columns. For every row, the first namespace found in the
    order FPLX, HGNC, UP, CHEBI, GO, MESH determines the Term; rows with
    none of these namespaces are skipped.

    Parameters
    ----------
    ignore_mappings : bool
        If True, MeSH groundings are kept as MeSH even if an entry exists
        for them in mesh_mappings. Default: False

    Returns
    -------
    list of Term
        The Terms generated from the grounding map.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        txt = row[0]
        norm_txt = normalize(txt)
        # Pair up the alternating namespace/ID columns, dropping empty ones
        groundings = {ns: ref for ns, ref in zip(row[1::2], row[2::2])
                      if (ns and ref)}
        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            term = Term(norm_txt, txt, 'FPLX', fplx_id, fplx_id,
                        'assertion', 'famplex')
        elif 'HGNC' in groundings:
            # The grounding map value is converted to an HGNC ID and also
            # used as the entry name
            hgnc_symbol = groundings['HGNC']
            term = Term(norm_txt, txt, 'HGNC',
                        hgnc_client.get_hgnc_id(hgnc_symbol), hgnc_symbol,
                        'assertion', 'famplex', '9606')
        elif 'UP' in groundings:
            up_id = groundings['UP']
            db, db_id, name, organism = 'UP', up_id, up_id, None
            if uniprot_client.is_human(up_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(up_id)
                if hgnc_id:
                    # Human proteins with an HGNC ID are represented as HGNC
                    name = hgnc_client.get_hgnc_name(hgnc_id)
                    db, db_id = 'HGNC', hgnc_id
                else:
                    logger.warning('No gene name for %s' % up_id)
            # TODO: should we add organism info here?
            term = Term(norm_txt, txt, db, db_id, name, 'assertion',
                        'famplex', organism)
        elif 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The first 6 characters (presumably 'CHEBI:') are dropped for
            # the name lookup
            chebi_name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            term = Term(norm_txt, txt, 'CHEBI', chebi_id, chebi_name,
                        'assertion', 'famplex')
        elif 'GO' in groundings:
            go_id = groundings['GO']
            term = Term(norm_txt, txt, 'GO', go_id,
                        go_client.get_go_label(go_id), 'assertion', 'famplex')
        elif 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mesh_mapping = mesh_mappings.get(mesh_id)
            if mesh_mapping and not ignore_mappings:
                db, db_id, name = mesh_mapping
            else:
                db, db_id, name = \
                    'MESH', mesh_id, mesh_client.get_mesh_name(mesh_id)
            term = Term(norm_txt, txt, db, db_id, name, 'assertion', 'famplex')
        else:
            # TODO: handle HMDB, PUBCHEM, CHEMBL
            continue
        terms.append(term)
    return terms
Пример #9
0
def dump_mappings(mappings, fname):
    """Write MeSH-to-target mappings to a tab-separated file.

    Parameters
    ----------
    mappings : dict
        A dict whose values are (mesh_entry, target_entry) pairs. Both
        entries are read through their ``db`` and ``id`` attributes.
    fname : str
        Path of the file to write. One row is written per mapping with the
        columns: MeSH namespace, MeSH ID, MeSH name, target namespace,
        target ID, target name. Rows are sorted by MeSH ID.

    Raises
    ------
    ValueError
        If a target entry's namespace is neither HGNC nor FPLX.
    """
    lines = []
    for me, te in sorted(mappings.values(), key=lambda x: x[0].id):
        mesh_name = mesh_client.get_mesh_name(me.id)
        if te.db == 'HGNC':
            tname = hgnc_client.get_hgnc_name(te.id)
        elif te.db == 'FPLX':
            tname = te.id
        else:
            # Without this branch the name was either undefined or silently
            # reused from the previous row
            raise ValueError('Unsupported target namespace: %s' % te.db)
        lines.append(
            '\t'.join([me.db, me.id, mesh_name, te.db, te.id, tname]) +
            '\n')
    # All rows are built before the file is opened so that a failure above
    # does not leave a truncated file behind
    with open(fname, 'w') as fh:
        fh.writelines(lines)
Пример #10
0
def make_search_terms(
    search_strings: List[str],
    mesh_ids: List[str],
) -> List[SearchTerm]:
    """Build EMMAA SearchTerms from free-text strings and MeSH IDs.

    Parameters
    ----------
    search_strings :
        Free-text queries, e.g., "diabetes", used to find papers in the
        literature.
    mesh_ids :
        MeSH IDs used to search the literature via the headings annotated
        on papers.

    Returns
    -------
    :
        One SearchTerm of type 'other' per search string, followed by one
        SearchTerm of type 'mesh' per MeSH ID.

    Raises
    ------
    ValueError
        If both search_strings and mesh_ids are empty.
    """
    if not (search_strings or mesh_ids):
        raise ValueError("Need at least one of search_strings or mesh_ids")
    # Free-text terms use the string itself as both name and query
    text_terms = [
        SearchTerm(type='other', name=text, db_refs={}, search_term=text)
        for text in search_strings
    ]
    mesh_terms = []
    for mesh_id in mesh_ids:
        mesh_name = mesh_client.get_mesh_name(mesh_id)
        # IDs starting with 'D' are queried with the [mh] tag, all others
        # with the [nm] tag
        if mesh_id.startswith('D'):
            suffix = 'mh'
        else:
            suffix = 'nm'
        mesh_terms.append(
            SearchTerm(type='mesh',
                       name=mesh_name,
                       db_refs={'MESH': mesh_id},
                       search_term=f'{mesh_name} [{suffix}]'))
    return text_terms + mesh_terms
Пример #11
0
def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
    """Return PMIDs that are annotated with a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, the search uses the [majr] tag instead of [mh], so only
        papers with the term annotated as a major topic are requested.
        Default: False
    **kwargs
        Any further search arguments that are passed on to get_ids.

    Returns
    -------
    list
        The IDs returned by get_ids, or an empty list if no name could be
        found for the MeSH ID.
    """
    mesh_name = mesh_client.get_mesh_name(mesh_id)
    if mesh_name:
        if major_topic:
            suffix = 'majr'
        else:
            suffix = 'mh'
        return get_ids('%s [%s]' % (mesh_name, suffix), use_text_word=False,
                       **kwargs)
    # Without a name there is nothing to search for
    logger.error('Could not get MeSH name for ID %s' % mesh_id)
    return []
Пример #12
0
def test_mesh_id_local_missing():
    """An ID that is not available offline yields no name."""
    # 'XXXX' is a dummy ID chosen to make sure we don't have it offline
    assert mesh_client.get_mesh_name('XXXX', offline=True) is None
Пример #13
0
def test_mesh_id_fallback_to_rest():
    """With offline=False, D015242 resolves to its name."""
    name = mesh_client.get_mesh_name('D015242', offline=False)
    assert name == 'Ofloxacin'
Пример #14
0
def test_mesh_id_lookup_local():
    """With offline=True, D005963 resolves to its name."""
    name = mesh_client.get_mesh_name('D005963', offline=True)
    assert name == 'Glucosylceramides'
Пример #15
0
def test_mesh_supplementary_id_lookup_local():
    """With offline=True, the C-prefixed ID C056331 resolves to its name."""
    name = mesh_client.get_mesh_name('C056331', offline=True)
    assert name == 'carbazomycin G'
Пример #16
0
def update_biomappings():
    """Update mappings from the BioMappings project.

    Curated and predicted BioMappings are loaded and filtered, then written
    to two resource files: ``biomappings.tsv`` with the mappings keyed by
    non-MeSH entries, and ``mesh_mappings.tsv`` with the mappings keyed by
    MeSH entries. The MeSH file is additionally extended with MeSH
    cross-references found in the EFO, HP and DOID resource files.
    """
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from biomappings.resources import load_mappings, load_predictions

    # We now construct a mapping dict of these mappings, keyed by
    # (namespace, ID, name) tuples in both directions
    biomappings = defaultdict(list)
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    mapping_sources = ((load_mappings(), 'curated'),
                       (load_predictions(), 'predicted'))
    for mapping_list, mapping_type in mapping_sources:
        for mapping in mapping_list:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            # Skip excluded name spaces that aren't relevant here
            if mapping['source prefix'] in exclude_ns or \
                    mapping['target prefix'] in exclude_ns:
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and \
                    (mapping['source prefix'] == 'ncit' or
                     mapping['target prefix'] == 'ncit'):
                continue
            source_ns, source_id = \
                get_ns_id_from_identifiers(mapping['source prefix'],
                                           mapping['source identifier'])
            target_ns, target_id = \
                get_ns_id_from_identifiers(mapping['target prefix'],
                                           mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            biomappings[(source_ns, source_id, mapping['source name'])].append(
                (target_ns, target_id, mapping['target name']))
            biomappings[(target_ns, target_id, mapping['target name'])].append(
                (source_ns, source_id, mapping['source name']))

    def _filter_ncit(values):
        # Drop NCIT targets whenever at least one other target is available
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        else:
            return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    # NOTE(review): k[1] is the identifier, so comparing it to 'MESH' looks
    # like a no-op; kept unchanged pending confirmation of the intent
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH' and k[1] != 'MESH'}
    rows = [list(k + vv) for k, v in non_mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # The keys of biomappings are (namespace, ID, name) triples, so testing
    # a (namespace, ID) pair against the dict itself can never match. The
    # pairs are collected here so that the "already mapped" check below
    # actually takes effect.
    # NOTE(review): assumes entry IDs in the resource files use the same
    # form as the IDs returned by get_ns_id_from_identifiers - confirm
    mapped_entities = {(ns, id_) for ns, id_, _ in biomappings}

    # We next look at mappings to MeSH from EFO/HP/DOID
    for ns in ['efo', 'hp', 'doid']:
        for entry in load_resource_json('%s.json' % ns):
            db, db_id, name = ns.upper(), entry['id'], entry['name']
            if (db, db_id) in mapped_entities:
                continue
            # We first need to decide if we prioritize another name space
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            if 'MESH' in xref_dict or 'MSH' in xref_dict:
                mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
                if not mesh_id.startswith('D'):
                    continue
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if not mesh_name:
                    continue
                key = ('MESH', mesh_id, mesh_name)
                # Entries whose ID starts with 'BFO' are stored under the
                # BFO namespace with the first 4 characters dropped
                if db_id.startswith('BFO'):
                    db_to_use = 'BFO'
                    db_id_to_use = db_id[4:]
                else:
                    db_to_use = db
                    db_id_to_use = db_id
                mesh_mappings.setdefault(key, []).append(
                    (db_to_use, db_id_to_use, name))

    rows = [list(k + vv) for k, v in mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
Пример #17
0
def test_mesh_id_local_missing():
    """With offline=True, D015242 is expected to have no name."""
    name = mesh_client.get_mesh_name('D015242', offline=True)
    assert name is None
Пример #18
0
def test_mesh_id_lookup_local():
    """With offline=True, D005963 resolves to its name."""
    name = mesh_client.get_mesh_name('D005963', offline=True)
    assert name == 'Glucosylceramides'
Пример #19
0
def test_mesh_id_local_missing():
    """An ID that is not available offline yields no name."""
    # 'XXXX' is a dummy ID chosen to make sure we don't have it offline
    assert mesh_client.get_mesh_name('XXXX', offline=True) is None
Пример #20
0
def _generate_obo_terms(prefix, ignore_mappings=False):
    """Return name and synonym Terms for the entries of an ontology file.

    Parameters
    ----------
    prefix : str
        Ontology prefix; entries are loaded from ``<prefix>.json`` in the
        resource folder and the upper-cased prefix is the default namespace.
    ignore_mappings : bool
        If True, the reverse MeSH mappings are not applied. Default: False

    Returns
    -------
    list of Term
        One 'name' Term per entry, followed by one 'synonym' Term per
        retained synonym of that entry.
    """
    filename = os.path.join(indra_resources, '%s.json' % prefix)
    logger.info('Loading %s', filename)
    with open(filename) as file:
        entries = json.load(file)

    formerly_pattern = re.compile(r'(.+) \(formerly')
    unquoted_pattern = re.compile(r'([^"]+)')
    # A MeSH mapping into one of these namespaces is not followed
    ontology_namespaces = {'EFO', 'HP', 'DOID'}

    terms = []
    for entry in entries:
        db, db_id, name = prefix.upper(), entry['id'], entry['name']
        # We first need to decide if we prioritize another name space
        xref_dict = {xr['namespace']: xr['id'] for xr in entry['xrefs']}
        # Handle MeSH mappings first
        auto_mesh_mapping = mesh_mappings_reverse.get((db, db_id))
        if auto_mesh_mapping and not ignore_mappings:
            db = 'MESH'
            db_id, name = auto_mesh_mapping[0], auto_mesh_mapping[1]
        elif 'MESH' in xref_dict or 'MSH' in xref_dict:
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            # Only regular MeSH terms (starting with D) are mapped;
            # supplementary terms (starting with C) are left alone
            if mesh_id.startswith('D'):
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if mesh_name:
                    # Check whether the MeSH ID is itself mapped further
                    # to another namespace
                    mesh_mapping = mesh_mappings.get(mesh_id)
                    if mesh_mapping and \
                            mesh_mapping[0] not in ontology_namespaces:
                        db, db_id, name = mesh_mapping
                    else:
                        db, db_id, name = 'MESH', mesh_id, mesh_name
        # Next we look at mappings to DOID
        # TODO: are we sure that the DOIDs that we get here (from e.g., EFO)
        # cannot be mapped further to MeSH per the DOID resource file?
        elif 'DOID' in xref_dict:
            doid = xref_dict['DOID']
            if not doid.startswith('DOID:'):
                doid = 'DOID:' + doid
            # Prefer the primary ID if this one is an alternative ID
            primary_doid = doid_client.get_doid_id_from_doid_alt_id(doid)
            if primary_doid:
                doid = primary_doid
            doid_name = doid_client.get_doid_name_from_doid_id(doid)
            # A missing name likely means the entry is obsolete, in which
            # case the mapping is not applied
            if doid_name:
                db, db_id, name = 'DOID', doid, doid_name

        # Add a term for the name first
        terms.append(Term(
            norm_text=normalize(name),
            text=name,
            db=db,
            id=db_id,
            entry_name=name,
            status='name',
            source=prefix,
        ))

        # Then add all the synonyms
        for synonym in set(entry['synonyms']):
            # Some synonyms are tagged as ambiguous, we remove these
            if 'ambiguous' in synonym.lower():
                continue
            # Some synonyms contain a "formerly" clause, which is cut off
            formerly_match = formerly_pattern.match(synonym)
            if formerly_match:
                synonym = formerly_match.group(1)
            # Some synonyms contain additional annotations after a double
            # quote, e.g. Hyperplasia of facial adipose tissue" NARROW
            # [ORCID:0000-0001-5889-4463]; only the part before the quote
            # is kept
            unquoted_match = unquoted_pattern.match(synonym)
            if unquoted_match:
                synonym = unquoted_match.group(1)

            terms.append(Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=db,
                id=db_id,
                entry_name=name,
                status='synonym',
                source=prefix,
            ))

    logger.info('Loaded %d terms from %s', len(terms), prefix)
    return terms
Пример #21
0
def test_mesh_id_local_missing():
    """With offline=True, D015242 is expected to have no name."""
    name = mesh_client.get_mesh_name('D015242', offline=True)
    assert name is None
Пример #22
0
def test_mesh_id_fallback_to_rest():
    """With offline=False, D015242 resolves to its name."""
    name = mesh_client.get_mesh_name('D015242', offline=False)
    assert name == 'Ofloxacin'