def update_biomappings():
    """Update mappings from the BioMappings project.

    Loads curated and predicted exact-match mappings from BioMappings,
    builds a symmetric cross-reference dict keyed by (namespace, id, name)
    triples, and writes two resource files:

    - ``biomappings.tsv``: cross-references between non-MeSH namespaces
    - ``mesh_mappings.tsv``: cross-references involving MeSH, additionally
      augmented with MeSH xrefs found in the EFO/HP/DOID resource files
    """
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from biomappings.resources import load_mappings, load_predictions

    # We now construct a mapping dict of these mappings, symmetric in both
    # directions: key (ns, id, name) -> list of (ns, id, name) xrefs.
    biomappings = defaultdict(list)
    curated = load_mappings()
    predicted = load_predictions()
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    # NOTE: loop variable renamed from the original's "mappings", which
    # shadowed the curated-mappings list loaded above.
    for mapping_list, mapping_type in ((curated, 'curated'),
                                       (predicted, 'predicted')):
        for mapping in mapping_list:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            # Skip excluded name spaces that aren't relevant here
            if mapping['source prefix'] in exclude_ns or \
                    mapping['target prefix'] in exclude_ns:
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and \
                    (mapping['source prefix'] == 'ncit' or
                     mapping['target prefix'] == 'ncit'):
                continue
            source_ns, source_id = \
                get_ns_id_from_identifiers(mapping['source prefix'],
                                           mapping['source identifier'])
            target_ns, target_id = \
                get_ns_id_from_identifiers(mapping['target prefix'],
                                           mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            biomappings[(source_ns, source_id, mapping['source name'])].append(
                (target_ns, target_id, mapping['target name']))
            biomappings[(target_ns, target_id, mapping['target name'])].append(
                (source_ns, source_id, mapping['source name']))

    def _filter_ncit(values):
        # Prefer non-NCIT xrefs: drop NCIT entries whenever at least one
        # alternative namespace is available for the same key.
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    # NOTE: the original also required k[1] != 'MESH', but k[1] is an
    # identifier (never a namespace label), so that condition was always
    # true and has been dropped.
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH'}
    rows = []
    for k, v in non_mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # We next look at mappings to MeSH from EFO/HP/DOID. Precompute the
    # (ns, id) pairs already covered by BioMappings: the original tested
    # `(db, db_id) in biomappings`, which could never match because the
    # dict is keyed by 3-tuples, so the dedup was dead code. This restores
    # the intended "skip entries BioMappings already covers" behavior.
    biomapping_ns_ids = {(k[0], k[1]) for k in biomappings}
    for ns in ['efo', 'hp', 'doid']:
        for entry in load_resource_json('%s.json' % ns):
            db, db_id = ns.upper(), entry['id']
            if (db, db_id) in biomapping_ns_ids:
                continue
            # We first need to decide if we prioritize another name space
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            if 'MESH' in xref_dict or 'MSH' in xref_dict:
                mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
                # Only MeSH descriptors (D-prefixed IDs) are considered
                if not mesh_id.startswith('D'):
                    continue
                mesh_name = mesh_client.get_mesh_name(mesh_id)
                if not mesh_name:
                    continue
                key = ('MESH', mesh_id, mesh_name)
                # BFO terms are embedded in these ontologies with a
                # "BFO:"-style prefix; strip it and use BFO as the db.
                if db_id.startswith('BFO'):
                    db_to_use = 'BFO'
                    db_id_to_use = db_id[4:]
                else:
                    db_to_use = db
                    db_id_to_use = db_id
                mesh_mappings.setdefault(key, []).append(
                    (db_to_use, db_id_to_use, entry['name']))
    rows = []
    for k, v in mesh_mappings.items():
        for vv in v:
            rows.append(list(k + vv))
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
def test_map_ns_id():
    """Check mapping of identifiers.org prefixes to INDRA ns/id pairs."""
    cases = [
        (('uniprot', 'P12345'), ('UP', 'P12345')),
        (('go', 'GO:0005856'), ('GO', 'GO:0005856')),
    ]
    for (prefix, identifier), expected in cases:
        assert get_ns_id_from_identifiers(prefix, identifier) == expected