예제 #1
0
def update_biomappings():
    """Update mappings from the BioMappings project.

    Loads the curated and predicted mappings from the biomappings package,
    keeps the exact-match cross-references between different name spaces
    and writes two tab-separated resource files:

    - ``biomappings.tsv``: mappings keyed by non-MeSH entries, with MeSH
      targets removed.
    - ``mesh_mappings.tsv``: mappings keyed by MeSH entries, extended with
      MeSH xrefs found in the EFO, HP and DOID resource JSON files.

    Each row has the form
    (ns, id, name, mapped ns, mapped id, mapped name).
    """
    from indra.databases import mesh_client
    from indra.databases.identifiers import get_ns_id_from_identifiers
    from biomappings.resources import load_mappings, load_predictions

    # Maps each (ns, id, name) entry to the list of (ns, id, name) entries
    # it is mapped to; every mapping is recorded in both directions.
    biomappings = defaultdict(list)
    exclude_ns = {'kegg.pathway', 'depmap', 'ccle', 'reactome'}
    mapping_sets = ((load_mappings(), 'curated'),
                    (load_predictions(), 'predicted'))
    for mapping_set, mapping_type in mapping_sets:
        for mapping in mapping_set:
            # We skip anything that isn't an exact match
            if mapping['relation'] != 'skos:exactMatch':
                continue
            source_prefix = mapping['source prefix']
            target_prefix = mapping['target prefix']
            # Skip excluded name spaces that aren't relevant here
            if source_prefix in exclude_ns or target_prefix in exclude_ns:
                continue
            # We only accept curated mappings for NCIT
            if mapping_type == 'predicted' and \
                    'ncit' in (source_prefix, target_prefix):
                continue
            source_ns, source_id = \
                get_ns_id_from_identifiers(source_prefix,
                                           mapping['source identifier'])
            target_ns, target_id = \
                get_ns_id_from_identifiers(target_prefix,
                                           mapping['target identifier'])
            # We only take real xrefs, not refs within a given ontology
            if source_ns == target_ns:
                continue
            source = (source_ns, source_id, mapping['source name'])
            target = (target_ns, target_id, mapping['target name'])
            biomappings[source].append(target)
            biomappings[target].append(source)

    def _filter_ncit(values):
        """Drop all NCIT entries if there are several values incl. NCIT."""
        if len(values) > 1 and 'NCIT' in {v[0] for v in values}:
            return [v for v in values if v[0] != 'NCIT']
        return values

    mesh_mappings = {k: _filter_ncit(v) for k, v in biomappings.items()
                     if k[0] == 'MESH'}
    # NOTE(review): k[1] is the identifier, so comparing it to 'MESH' looks
    # redundant; it is kept as is to preserve the existing output - confirm.
    non_mesh_mappings = {k: [vv for vv in v if vv[0] != 'MESH']
                         for k, v in biomappings.items()
                         if k[0] != 'MESH' and k[1] != 'MESH'}
    rows = [list(k + vv) for k, v in non_mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: x[1])
    write_unicode_csv(get_resource_path('biomappings.tsv'), rows,
                      delimiter='\t')

    # The keys of biomappings are (ns, id, name) triples, so testing a
    # (ns, id) pair against the dict itself can never succeed. We therefore
    # collect the (ns, id) pairs up front and test against those.
    mapped_entries = {(key[0], key[1]) for key in biomappings}

    # We next look at mappings to MeSH from EFO/HP/DOID
    for ns in ['efo', 'hp', 'doid']:
        for entry in load_resource_json('%s.json' % ns):
            db, db_id, name = ns.upper(), entry['id'], entry['name']
            # Entries already covered by BioMappings are not mapped again
            if (db, db_id) in mapped_entries:
                continue
            xref_dict = {xr['namespace']: xr['id']
                         for xr in entry.get('xrefs', [])}
            # The MeSH xref can appear under either of two name spaces;
            # a missing or empty ID is skipped instead of raising.
            mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
            if not mesh_id or not mesh_id.startswith('D'):
                continue
            mesh_name = mesh_client.get_mesh_name(mesh_id)
            if not mesh_name:
                continue
            key = ('MESH', mesh_id, mesh_name)
            if db_id.startswith('BFO'):
                # Drop the first four characters of the ID (presumably the
                # 'BFO:' prefix) and file the entry under BFO instead.
                db_to_use, db_id_to_use = 'BFO', db_id[4:]
            else:
                db_to_use, db_id_to_use = db, db_id
            mesh_mappings.setdefault(key, []).append(
                (db_to_use, db_id_to_use, name))

    rows = [list(k + vv) for k, v in mesh_mappings.items() for vv in v]
    rows = sorted(rows, key=lambda x: (x[1], x[2], x[3]))
    write_unicode_csv(get_resource_path('mesh_mappings.tsv'), rows,
                      delimiter='\t')
예제 #2
0
def test_map_ns_id():
    """Check get_ns_id_from_identifiers on a UniProt and a GO identifier."""
    # Each case pairs the (prefix, identifier) arguments with the
    # (name space, ID) tuple the function is expected to return.
    cases = (
        (('uniprot', 'P12345'), ('UP', 'P12345')),
        (('go', 'GO:0005856'), ('GO', 'GO:0005856')),
    )
    for (prefix, identifier), expected in cases:
        assert get_ns_id_from_identifiers(prefix, identifier) == expected