def get_std_chemical(raw_string: str, db_id: str) -> List[Agent]:
    """Build standardized Agents for a chemical entry.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        Database identifier(s) of the agent. Several identifiers may be
        given in one string, separated by ``|``.

    Returns
    -------
    :
        A list of Agents, one per usable identifier. The list is empty if
        neither a name nor an identifier is available, or if every
        identifier is a MeSH ID that cannot be resolved offline.

    Raises
    ------
    ValueError
        If an identifier matches none of the known ChEBI/MeSH patterns.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)

    # Nothing to work with at all
    if not has_id and not has_text:
        return []

    # The raw text, when available, is carried along as TEXT grounding
    base_refs = {'TEXT': raw_string} if has_text else {}

    # Only a name is known, so the Agent is built directly from it
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    standardized = []
    for identifier in db_id.split('|'):
        refs = deepcopy(base_refs)
        if cheby_pattern.match(identifier):
            refs['CHEBI'] = identifier
        else:
            if mesh_pattern.match(identifier):
                # Drop the first five characters (presumably the "MESH:"
                # prefix)
                mesh_id = identifier[5:]
            elif mesh_no_prefix_pattern.match(identifier):
                mesh_id = identifier
            else:
                raise ValueError('Unexpected chemical identifier: %s'
                                 % identifier)
            # Non-existent MeSH IDs show up in the data; they are detected
            # by a failed offline name lookup and skipped
            if not mesh_client.get_mesh_name(mesh_id, offline=True):
                continue
            refs['MESH'] = mesh_id
        # Fall back to the identifier itself when there is no raw name
        agent_name = raw_string if has_text else identifier
        standardized.append(get_standard_agent(agent_name, refs))
    return standardized
def update_mesh_mappings():
    """Update MeSH mappings to other databases.

    The current mapping table is downloaded from the Gilda repository and
    extended with MeSH cross-references found in the local EFO, HP and DOID
    JSON resource files. The combined table is sorted by MeSH ID and written
    to ``mesh_mappings.tsv`` in the resource directory.
    """
    from indra.databases import mesh_client
    url = ('https://raw.githubusercontent.com/indralab/gilda/master/gilda/'
           'resources/mesh_mappings.tsv')
    df = pandas.read_csv(url, delimiter='\t', dtype=str, header=None)
    # Columns 3 and 4 hold the namespace and ID of the mapped entry. The
    # pairs are collected once so that each entry below is a set lookup
    # instead of a scan over the whole table.
    already_mapped = set(zip(df[3], df[4]))
    namespaces = ['efo', 'hp', 'doid']
    xref_mappings = []
    for ns in namespaces:
        db = ns.upper()
        filename = os.path.join(path, '%s.json' % ns)
        with open(filename) as file:
            entries = json.load(file)
        for entry in entries:
            db_id, name = entry['id'], entry['name']
            if (db, db_id) in already_mapped:
                continue
            # Entries without any xrefs simply contribute no mapping
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            # Only MeSH IDs starting with D are mapped; a missing or empty
            # xref value is skipped as well
            if not mesh_id or not mesh_id.startswith('D'):
                continue
            mesh_name = mesh_client.get_mesh_name(mesh_id)
            if not mesh_name:
                continue
            xref_mappings.append(
                ('MESH', mesh_id, mesh_name, db, db_id, name))
    if xref_mappings:
        df_extend = pandas.DataFrame(xref_mappings, columns=None, dtype=str)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent row-wise extension
        df = pandas.concat([df, df_extend])
    df.sort_values(1, inplace=True)  # sort by MeSH ID
    fname = os.path.join(path, 'mesh_mappings.tsv')
    df.to_csv(fname, sep='\t', index=False, header=False)
def make_search_terms(search_strings, mesh_ids):
    """Construct EMMAA SearchTerms from search strings and MeSH IDs.

    Parameters
    ----------
    search_strings : list of str
        Free-text search strings, e.g., "diabetes", used to find papers in
        the literature.
    mesh_ids : list of str
        MeSH IDs used to search the literature as headings associated with
        papers.

    Returns
    -------
    list of emmaa.priors.SearchTerm
        The SearchTerms for the search strings, followed by the SearchTerms
        for the MeSH IDs.
    """
    # Plain text terms carry no grounding
    text_terms = [
        SearchTerm(type='other', name=text, db_refs={}, search_term=text)
        for text in search_strings
    ]

    mesh_terms = []
    for mesh_id in mesh_ids:
        mesh_name = mesh_client.get_mesh_name(mesh_id)
        # IDs starting with D are searched with the [mh] tag, all others
        # with the [nm] tag
        if mesh_id.startswith('D'):
            suffix = 'mh'
        else:
            suffix = 'nm'
        mesh_terms.append(
            SearchTerm(type='mesh', name=mesh_name,
                       db_refs={'MESH': mesh_id},
                       search_term=f'{mesh_name} [{suffix}]'))
    return text_terms + mesh_terms
def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
    """Return PMIDs that are annotated with a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, only papers for which the given MeSH ID is annotated as
        a major topic are returned. Otherwise all annotations are
        considered. Default: False
    **kwargs
        Any further PubMed search arguments that are passed to get_ids.

    Returns
    -------
    list
        The IDs returned by get_ids, or an empty list if no name could be
        found for the MeSH ID.
    """
    from indra.databases import mesh_client

    mesh_name = mesh_client.get_mesh_name(mesh_id)
    if not mesh_name:
        logger.error('Could not get MeSH name for ID %s' % mesh_id)
        return []

    if major_topic:
        suffix = 'majr'
    else:
        suffix = 'mh'
    heading_query = '%s [%s]' % (mesh_name, suffix)
    pmids = get_ids(heading_query, use_text_word=False, **kwargs)

    # The extra search below only applies to IDs starting with C when no
    # major-topic restriction was requested
    if major_topic or not mesh_id.startswith('C'):
        return pmids

    # Get pmids for supplementary concepts as well and merge both results
    supplementary_query = '%s [nm]' % mesh_name
    supplementary_pmids = get_ids(supplementary_query, use_text_word=False,
                                  **kwargs)
    return list(set(pmids) | set(supplementary_pmids))
def standardize_agent_name(agent, standardize_refs=True):
    """Standardize the name of an Agent based on grounding information.

    The namespace and ID returned by the Agent's get_grounding method
    decide how the name is standardized. For FPLX the FamPlex ID itself
    becomes the name. For HGNC, UP, CHEBI, MESH and GO the corresponding
    client is asked for a standard name, and the Agent's name is only
    replaced if that lookup yields one. For any other namespace, or if
    there is no grounding, the name is not changed.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information. The Agent is modified in place. If None
        is given, the function returns immediately.
    standardize_refs : Optional[bool]
        If True, the Agent's db_refs are first replaced by the output of
        GroundingMapper.standardize_db_refs. Default: True
    """
    # We return immediately for None Agents
    if agent is None:
        return

    if standardize_refs:
        agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

    # We next look for prioritized grounding, if missing, we return
    db_ns, db_id = agent.get_grounding()
    if not db_ns or not db_id:
        return

    # If there's a FamPlex ID, prefer that for the name
    if db_ns == 'FPLX':
        agent.name = agent.db_refs['FPLX']
    elif db_ns == 'HGNC':
        # NOTE(review): an earlier comment here said that get_grounding
        # returns an HGNC symbol, yet the value is passed to get_hgnc_name;
        # confirm which form db_id takes.
        hgnc_name = hgnc_client.get_hgnc_name(db_id)
        # Keep the existing name if the lookup fails, instead of
        # overwriting it with None
        if hgnc_name:
            agent.name = hgnc_name
    elif db_ns == 'UP':
        # Try for the gene name
        gene_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                 web_fallback=False)
        if gene_name:
            agent.name = gene_name
    elif db_ns == 'CHEBI':
        chebi_name = \
            chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
        if chebi_name:
            agent.name = chebi_name
    elif db_ns == 'MESH':
        # The second positional argument is presumably the offline flag
        # of get_mesh_name -- TODO confirm
        mesh_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
        if mesh_name:
            agent.name = mesh_name
    elif db_ns == 'GO':
        go_name = go_client.get_go_label(agent.db_refs['GO'])
        if go_name:
            agent.name = go_name
def generate_adeft_terms():
    """Generate Terms for the shortforms covered by Adeft disambiguators.

    For every available Adeft shortform, the disambiguator is loaded and
    one synonym Term is created per grounding it knows about. Groundings
    that are 'ungrounded', that contain no ':' separator, or whose
    namespace is not handled here are skipped.

    Returns
    -------
    list of Term
        The generated Terms in a deterministic order, sorted by normalized
        shortform, then shortform, namespace and ID.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator

    all_term_args = set()
    for shortform in available_shortforms:
        da = load_disambiguator(shortform)
        norm_shortform = normalize(shortform)
        for grounding in da.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            db_ns, db_id = grounding.split(':', maxsplit=1)
            if db_ns == 'HGNC':
                standard_name = hgnc_client.get_hgnc_name(db_id)
            elif db_ns == 'GO':
                standard_name = go_client.get_go_label(db_id)
            elif db_ns == 'MESH':
                standard_name = mesh_client.get_mesh_name(db_id)
            elif db_ns == 'CHEBI':
                standard_name = chebi_client.get_chebi_name_from_id(db_id)
            elif db_ns == 'FPLX':
                standard_name = db_id
            elif db_ns == 'UP':
                standard_name = uniprot_client.get_gene_name(db_id)
            else:
                logger.warning('Unknown grounding namespace from Adeft: %s'
                               % db_ns)
                continue
            all_term_args.add((norm_shortform, shortform, db_ns, db_id,
                               standard_name, 'synonym', 'adeft'))

    # The arguments are collected in a set, so sorting on the normalized
    # shortform alone would leave ties in hash order, which can differ
    # between runs. Including shortform, namespace and ID in the key makes
    # the output reproducible. The standard name is left out of the key
    # since a failed lookup may leave it as a non-string value.
    ordered_args = sorted(all_term_args, key=lambda args: args[:4])
    return [Term(*args) for args in ordered_args]
def proc_mesh(mesh_list, range=100):
    """Return names and counts for the most frequent MeSH IDs in a list.

    Parameters
    ----------
    mesh_list : list
        MeSH IDs, possibly with repetitions.
    range : int
        Upper bound used to slice the frequency-sorted IDs. Default: 100

    Returns
    -------
    list of tuple
        Tuples of (MeSH name, MeSH ID, count), ordered from the most to
        the least frequent ID.
    """
    # most_common() without an argument sorts all items by count in
    # descending order; the slice then keeps the leading entries
    ranked = Counter(mesh_list).most_common()[:range]
    print("Retrieving MESH names")
    return [(mesh_client.get_mesh_name(mesh_id), mesh_id, count)
            for mesh_id, count in ranked]
def generate_famplex_terms(ignore_mappings=False):
    """Generate Terms from the FamPlex grounding map.

    Each row of the grounding map CSV holds a text followed by alternating
    namespace/ID columns. The first namespace found in the order FPLX,
    HGNC, UP, CHEBI, GO, MESH decides how the Term is built; rows with
    none of these namespaces are skipped.

    Parameters
    ----------
    ignore_mappings : bool
        If True, MeSH groundings are kept as MeSH even if an entry exists
        for them in mesh_mappings. Default: False

    Returns
    -------
    list of Term
        One Term per usable row of the grounding map.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        text = row[0]
        norm_text = normalize(text)
        # Pair up namespace and ID columns, dropping incomplete pairs
        groundings = {ns: ns_id
                      for ns, ns_id in zip(row[1::2], row[2::2])
                      if (ns and ns_id)}
        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            term = Term(norm_text, text, 'FPLX', fplx_id, fplx_id,
                        'assertion', 'famplex')
        elif 'HGNC' in groundings:
            # The grounding map value is used as the entry name and is
            # converted to an ID via get_hgnc_id
            hgnc_symbol = groundings['HGNC']
            term = Term(norm_text, text, 'HGNC',
                        hgnc_client.get_hgnc_id(hgnc_symbol), hgnc_symbol,
                        'assertion', 'famplex', '9606')
        elif 'UP' in groundings:
            term_db = 'UP'
            term_id = groundings['UP']
            term_name = term_id
            organism = None
            if uniprot_client.is_human(term_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(term_id)
                if hgnc_id:
                    # Human proteins with an HGNC ID are represented as
                    # HGNC terms
                    term_name = hgnc_client.get_hgnc_name(hgnc_id)
                    term_db = 'HGNC'
                    term_id = hgnc_id
                else:
                    logger.warning('No gene name for %s' % term_id)
            # TODO: should we add organism info here?
            term = Term(norm_text, text, term_db, term_id, term_name,
                        'assertion', 'famplex', organism)
        elif 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The first six characters (presumably the "CHEBI:" prefix)
            # are dropped for the name lookup
            chebi_name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            term = Term(norm_text, text, 'CHEBI', chebi_id, chebi_name,
                        'assertion', 'famplex')
        elif 'GO' in groundings:
            go_id = groundings['GO']
            term = Term(norm_text, text, 'GO', go_id,
                        go_client.get_go_label(go_id),
                        'assertion', 'famplex')
        elif 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mesh_mapping = mesh_mappings.get(mesh_id)
            if mesh_mapping and not ignore_mappings:
                term_db, term_id, term_name = mesh_mapping
            else:
                term_db, term_id, term_name = \
                    ('MESH', mesh_id, mesh_client.get_mesh_name(mesh_id))
            term = Term(norm_text, text, term_db, term_id, term_name,
                        'assertion', 'famplex')
        else:
            # TODO: handle HMDB, PUBCHEM, CHEMBL
            continue
        terms.append(term)
    return terms
def dump_mappings(mappings, fname):
    """Write MeSH mappings to a tab-separated file.

    Parameters
    ----------
    mappings : dict
        A dict whose values are pairs of a MeSH entry and a target entry.
        Both entries are expected to have ``db`` and ``id`` attributes.
    fname : str
        Path of the file to write. One row is written per mapping, sorted
        by the ID of the MeSH entry, with the columns: MeSH namespace,
        MeSH ID, MeSH name, target namespace, target ID, target name.

    Raises
    ------
    ValueError
        If a target entry has a namespace other than HGNC or FPLX.
    """
    ordered = sorted(mappings.values(), key=lambda pair: pair[0].id)
    # All rows are assembled before the file is opened so that a failure
    # does not leave a truncated file behind.
    rows = []
    for me, te in ordered:
        if te.db == 'HGNC':
            tname = hgnc_client.get_hgnc_name(te.id)
        elif te.db == 'FPLX':
            tname = te.id
        else:
            # Without this branch the target name would be unbound, or
            # silently reused from the previous row
            raise ValueError('Unexpected target namespace: %s' % te.db)
        mesh_name = mesh_client.get_mesh_name(me.id)
        rows.append(
            '\t'.join([me.db, me.id, mesh_name, te.db, te.id, tname]) + '\n')
    with open(fname, 'w') as fh:
        fh.writelines(rows)
def make_search_terms(
    search_strings: List[str],
    mesh_ids: List[str],
) -> List[SearchTerm]:
    """Construct EMMAA SearchTerms from search strings and MeSH IDs.

    Parameters
    ----------
    search_strings :
        Free-text search strings, e.g., "diabetes", used to find papers in
        the literature.
    mesh_ids :
        MeSH IDs used to search the literature as headings associated with
        papers.

    Returns
    -------
    :
        The SearchTerms for the search strings, followed by the SearchTerms
        for the MeSH IDs.

    Raises
    ------
    ValueError
        If both search_strings and mesh_ids are empty.
    """
    if not (search_strings or mesh_ids):
        raise ValueError("Need at least one of search_strings or mesh_ids")

    # Plain text terms carry no grounding
    text_terms = [
        SearchTerm(type='other', name=text, db_refs={}, search_term=text)
        for text in search_strings
    ]

    mesh_terms = []
    for mesh_id in mesh_ids:
        mesh_name = mesh_client.get_mesh_name(mesh_id)
        # IDs starting with D are searched with the [mh] tag, all others
        # with the [nm] tag
        if mesh_id.startswith('D'):
            suffix = 'mh'
        else:
            suffix = 'nm'
        mesh_terms.append(
            SearchTerm(type='mesh', name=mesh_name,
                       db_refs={'MESH': mesh_id},
                       search_term=f'{mesh_name} [{suffix}]'))
    return text_terms + mesh_terms
def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
    """Return PMIDs that are annotated with a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, only papers for which the given MeSH ID is annotated as
        a major topic are returned. Otherwise all annotations are
        considered. Default: False
    **kwargs
        Any further PubMed search arguments that are passed to get_ids.

    Returns
    -------
    list
        The IDs returned by get_ids, or an empty list if no name could be
        found for the MeSH ID.
    """
    mesh_name = mesh_client.get_mesh_name(mesh_id)
    if mesh_name:
        if major_topic:
            suffix = 'majr'
        else:
            suffix = 'mh'
        query = '%s [%s]' % (mesh_name, suffix)
        return get_ids(query, use_text_word=False, **kwargs)

    # Without a name there is nothing to search for
    logger.error('Could not get MeSH name for ID %s' % mesh_id)
    return []
def test_mesh_id_local_missing():
    """An ID that is not available offline resolves to None."""
    # 'XXXX' is a dummy ID chosen to make sure we don't have it offline
    looked_up = mesh_client.get_mesh_name('XXXX', offline=True)
    assert looked_up is None
def test_mesh_id_fallback_to_rest():
    """With offline=False, D015242 resolves to the name 'Ofloxacin'."""
    looked_up = mesh_client.get_mesh_name('D015242', offline=False)
    assert looked_up == 'Ofloxacin'
def test_mesh_id_lookup_local():
    """With offline=True, D005963 resolves to 'Glucosylceramides'."""
    looked_up = mesh_client.get_mesh_name('D005963', offline=True)
    assert looked_up == 'Glucosylceramides'
def test_mesh_supplementary_id_lookup_local():
    """With offline=True, the ID C056331 resolves to 'carbazomycin G'."""
    looked_up = mesh_client.get_mesh_name('C056331', offline=True)
    assert looked_up == 'carbazomycin G'
def update_biomappings():
    """Update mappings from the BioMappings project.

    Curated and predicted BioMappings entries are collected into a
    bidirectional mapping between (namespace, ID, name) tuples. Mappings
    whose key is not in the MESH namespace are written to
    ``biomappings.tsv``. Mappings whose key is in the MESH namespace are
    extended with MeSH cross-references from the EFO, HP and DOID JSON
    resources and written to ``mesh_mappings.tsv``.
    """
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from biomappings.resources import load_mappings, load_predictions

    # We now construct a mapping dict of these mappings
    biomappings = defaultdict(list)
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    mapping_sources = ((load_mappings(), 'curated'),
                       (load_predictions(), 'predicted'))
    for mapping_list, mapping_type in mapping_sources:
        for mapping in mapping_list:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            source_prefix = mapping['source prefix']
            target_prefix = mapping['target prefix']
            # Skip excluded name spaces that aren't relevant here
            if source_prefix in exclude_ns or target_prefix in exclude_ns:
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and \
                    'ncit' in (source_prefix, target_prefix):
                continue
            source_ns, source_id = \
                get_ns_id_from_identifiers(source_prefix,
                                           mapping['source identifier'])
            target_ns, target_id = \
                get_ns_id_from_identifiers(target_prefix,
                                           mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            source_key = (source_ns, source_id, mapping['source name'])
            target_key = (target_ns, target_id, mapping['target name'])
            biomappings[source_key].append(target_key)
            biomappings[target_key].append(source_key)

    def _filter_ncit(values):
        # Drop NCIT targets when at least one other target is available
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        else:
            return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    # NOTE(review): k[1] is the ID of the key, so comparing it with 'MESH'
    # looks redundant; it is kept as is -- confirm the intent.
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH' and k[1] != 'MESH'}
    rows = [list(k + vv) for k, v in non_mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # The keys of biomappings are (namespace, ID, name) tuples, so a
    # (namespace, ID) pair can never be found among them directly. We
    # collect the pairs separately to be able to recognize entries that
    # BioMappings already covers.
    mapped_entries = {(key[0], key[1]) for key in biomappings}

    # We next look at mappings to MeSH from EFO/HP/DOID
    for ns in ['efo', 'hp', 'doid']:
        db = ns.upper()
        for entry in load_resource_json('%s.json' % ns):
            db_id, name = entry['id'], entry['name']
            if (db, db_id) in mapped_entries:
                continue
            # We first need to decide if we prioritize another name space
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            # Only MeSH IDs starting with D are mapped; a missing or empty
            # xref value is skipped as well
            if not mesh_id or not mesh_id.startswith('D'):
                continue
            mesh_name = mesh_client.get_mesh_name(mesh_id)
            if not mesh_name:
                continue
            key = ('MESH', mesh_id, mesh_name)
            if db_id.startswith('BFO'):
                # Entries whose ID starts with BFO are assigned to the BFO
                # namespace, with the first four characters of the ID
                # dropped
                db_to_use = 'BFO'
                db_id_to_use = db_id[4:]
            else:
                db_to_use = db
                db_id_to_use = db_id
            mesh_mappings.setdefault(key, []).append(
                (db_to_use, db_id_to_use, name))

    rows = [list(k + vv) for k, v in mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
def test_mesh_id_local_missing():
    """With offline=True, the lookup of D015242 is expected to give None."""
    looked_up = mesh_client.get_mesh_name('D015242', offline=True)
    assert looked_up is None
def _generate_obo_terms(prefix, ignore_mappings=False):
    """Generate Terms from the JSON resource file of an ontology.

    Parameters
    ----------
    prefix : str
        The prefix of the ontology. It determines the resource file name
        (``<prefix>.json``), the default namespace (the upper-cased prefix)
        and the source recorded on each Term.
    ignore_mappings : bool
        If True, entries are not re-grounded to MeSH via
        mesh_mappings_reverse. Default: False

    Returns
    -------
    list of Term
        For every entry, one Term for its name followed by one Term for
        each of its usable synonyms.
    """
    filename = os.path.join(indra_resources, '%s.json' % prefix)
    logger.info('Loading %s', filename)
    with open(filename) as file:
        entries = json.load(file)

    default_ns = prefix.upper()
    formerly_pattern = re.compile(r'(.+) \(formerly')
    unquoted_pattern = re.compile(r'([^"]+)')
    # Namespaces that a MeSH term is never mapped onward to
    obo_namespaces = {'EFO', 'HP', 'DOID'}

    terms = []
    for entry in entries:
        db, db_id, name = default_ns, entry['id'], entry['name']
        # We first need to decide if we prioritize another name space
        xref_dict = {xr['namespace']: xr['id'] for xr in entry['xrefs']}
        # Handle MeSH mappings first
        auto_mesh_mapping = mesh_mappings_reverse.get((db, db_id))
        if auto_mesh_mapping and not ignore_mappings:
            db = 'MESH'
            db_id = auto_mesh_mapping[0]
            name = auto_mesh_mapping[1]
        elif 'MESH' in xref_dict or 'MSH' in xref_dict:
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            # Since we currently only include regular MeSH terms (which
            # start with D), we only need to do the mapping if that's the
            # case. We don't map any supplementary terms that start with C.
            if mesh_id.startswith('D'):
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if mesh_name:
                    # Here we need to check if we further map the MeSH ID
                    # to another namespace
                    mesh_mapping = mesh_mappings.get(mesh_id)
                    if mesh_mapping and \
                            mesh_mapping[0] not in obo_namespaces:
                        db, db_id, name = mesh_mapping
                    else:
                        db, db_id, name = 'MESH', mesh_id, mesh_name
        # Next we look at mappings to DOID
        # TODO: are we sure that the DOIDs that we get here (from e.g.,
        # EFO) cannot be mapped further to MeSH per the DOID resource file?
        elif 'DOID' in xref_dict:
            doid = xref_dict['DOID']
            if not doid.startswith('DOID:'):
                doid = 'DOID:' + doid
            # Alternative IDs are replaced by the primary ID if one is found
            doid_prim_id = doid_client.get_doid_id_from_doid_alt_id(doid)
            if doid_prim_id:
                doid = doid_prim_id
            doid_name = doid_client.get_doid_name_from_doid_id(doid)
            # If we don't get a name here, it's likely because an entry is
            # obsolete so we don't do the mapping
            if doid_name:
                db, db_id, name = 'DOID', doid, doid_name

        # Add a term for the name first
        terms.append(Term(
            norm_text=normalize(name),
            text=name,
            db=db,
            id=db_id,
            entry_name=name,
            status='name',
            source=prefix,
        ))

        # Then add all the synonyms
        for synonym in set(entry['synonyms']):
            # Some synonyms are tagged as ambiguous, we remove these
            if 'ambiguous' in synonym.lower():
                continue
            # Some synonyms contain a "formerly" clause, we remove these
            formerly_match = formerly_pattern.match(synonym)
            if formerly_match:
                synonym = formerly_match.group(1)
            # Some synonyms contain additional annotations
            # e.g. Hyperplasia of facial adipose tissue" NARROW
            # [ORCID:0000-0001-5889-4463]
            # If this is the case, we strip these off
            unquoted_match = unquoted_pattern.match(synonym)
            if unquoted_match:
                synonym = unquoted_match.group(1)
            terms.append(Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=db,
                id=db_id,
                entry_name=name,
                status='synonym',
                source=prefix,
            ))
    logger.info('Loaded %d terms from %s', len(terms), prefix)
    return terms