def __new__(cls, *args, **kwargs):
    if not hasattr(cls, 'existing'):
        e_config = Config('cut-development')
        e_config.load_existing()
        # FIXME clear use case for the remaining bound to whatever query produced it rather
        # than the other way around ... how to support this use case ...
        cls.existing = {n.origLabel.toPython(): n for n in e_config.existing_pes}
        cls.existing.update({n.id_: n for n in e_config.existing_pes})
        cls.query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph),
                                instrumented=OntTerm)
        cls.sgv = Vocabulary()

    return super().__new__(cls)
def __new__(cls, *args, **kwargs):
    if not hasattr(cls, 'existing'):
        e_config = Config('common-usage-types')
        e_config.load_existing()
        # FIXME clear use case for the remaining bound to whatever query produced it rather
        # than the other way around ... how to support this use case ...
        cls.existing = {str(n.origLabel): n for n in e_config.neurons()}
        cls.query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph),
                                instrumented=OntTerm)
        cls.sgv = Vocabulary()

    return super().__new__(cls)
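# A minimal, self-contained sketch (not part of the original modules) of the
# class-level caching pattern the two __new__ methods above rely on: the expensive
# Config/OntQuery/Vocabulary setup runs once per class, and every later
# instantiation reuses it. 'ExpensiveLookup' and its _load helper are hypothetical
# stand-ins, not names from the original code.
class ExpensiveLookup:

    @staticmethod
    def _load():
        # stand-in for Config(...).load_existing() and the SciGraph/OntQuery setup
        return {'some label': 'some neuron'}

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, 'existing'):  # only the first instantiation pays the cost
            cls.existing = cls._load()

        return super().__new__(cls)

# Every instance shares the same cached mapping:
# assert ExpensiveLookup().existing is ExpensiveLookup().existing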
import os
from collections import defaultdict, namedtuple
import rdflib
from rdflib import URIRef, RDFS, RDF, OWL
from rdflib.namespace import SKOS
import requests
from pyontutils.scigraph import Vocabulary, Graph
from pyontutils.utils import TODAY, async_getter, TermColors as tc
from pyontutils.scig import scigPrint
from pyontutils.hierarchies import creatTree, flatten
from pyontutils.core import devconfig, OntMeta, makePrefixes, makeGraph
from pyontutils.core import NIFRID, oboInOwl
from IPython import embed

sgg = Graph(cache=True)
sgv = Vocabulary(cache=True)

Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])

CON = oboInOwl.consider
DBX = oboInOwl.hasDbXref  # FIXME also behaves as objectProperty :/
AID = oboInOwl.hasAlternativeId
IRBC = NIFRID.isReplacedByClass

PREFIXES = makePrefixes(
    'UBERON',
    'ro',
    'owl',
    'skos',
)
NIFPREFIXES = makePrefixes(
#!/usr/bin/env python3
import os
import csv
import json
from pathlib import Path
from datetime import date
import rdflib
from rdflib.extras import infixowl
from pyontutils.core import makeGraph
from pyontutils.config import devconfig
from pyontutils.scigraph import Vocabulary
from pyontutils.namespaces import makePrefixes
from IPython import embed

v = Vocabulary()

PREFIXES = makePrefixes('ilx', 'owl', 'skos', 'NIFSTD', 'NIFRID', 'SAO', 'NIFEXT', 'NLXCELL')
PREFIXES.update({
    'HBP_CELL': 'http://www.hbp.FIXME.org/hbp_cell_ontology/',
})


def expand(curie):
    prefix, suffix = curie.split(':')
    return rdflib.URIRef(PREFIXES[prefix] + suffix)


def ilx_get_start():
    with open(Path(devconfig.ontology_local_repo, 'interlex_reserved.txt').as_posix(), 'rt') as f:
        for line in f.readlines()[::-1]:  # go backward to find the first non empty
def sheet_to_neurons(values, notes_index, expect_pes):
    # TODO import existing ids to register by label
    sgv = Vocabulary()
    e_config = Config('common-usage-types')
    e_config.load_existing()
    query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph), instrumented=OntTerm)
    # FIXME clear use case for the remaining bound to whatever query produced it rather
    # than the other way around ... how to support this use case ...
    existing = {str(n.origLabel): n for n in e_config.neurons()}

    def convert_header(header):
        if header.startswith('has'):  # FIXME use a closed namespace
            return ilxtr[header]
        else:
            return None

    def convert_other(header):
        if header == 'label':
            return rdfs.label
        elif header == 'curie':
            return rdf.type
        elif header == 'definition':
            return definition
        else:
            header = header.replace(' ', '_')
            return TEMP[header]  # FIXME

    def mapCell(cell, syns=False):
        search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBITaxon', 'NCBIGene', 'ilxtr',
                           'NIFEXT', 'SAO', 'NLXMOL', 'BIRNLEX',)

        if ':' in cell and ' ' not in cell:
            log.debug(cell)
            if 'http' in cell:
                if cell.startswith('http'):
                    t = OntTerm(iri=cell)
                else:
                    return None, None  # garbage with http inline
            else:
                t = OntTerm(cell, exclude_prefix=('FMA',))  # FIXME need better error message in ontquery

            return t.u, t.label

        result = [r for r in sgv.findByTerm(cell, searchSynonyms=syns, prefix=search_prefixes)
                  if not r['deprecated']]
        #printD(cell, result)
        if not result:
            log.debug(f'{cell}')
            maybe = list(query(label=cell, exclude_prefix=('FMA',)))
            if maybe:
                qr = maybe[0]
                return qr.OntTerm.u, qr.label
            elif not syns:
                return mapCell(cell, syns=True)
            else:
                return None, None
        elif len(result) > 1:
            #printD('WARNING', result)
            result = select_by_curie_rank(result)
        else:
            result = result[0]

        return rdflib.URIRef(result['iri']), result['labels'][0]

    def lower_check(label, cell):
        return label not in cell and label.lower() not in cell.lower()  # have to handle comma sep case

    lnlu = {v: k for k, v in LogicalPhenotype.local_names.items()}

    def convert_cell(cell_or_comma_sep):
        #printD('CONVERTING', cell_or_comma_sep)
        for cell_w_junk in cell_or_comma_sep.split(','):  # XXX WARNING need a way to alert people to this
            cell = cell_w_junk.strip()
            if cell.startswith('(OR') or cell.startswith('(AND'):
                start, *middle, end = cell.split('" "')
                OPoperator, first = start.split(' "')
                operator = OPoperator[1:]
                operator = lnlu[operator]
                last, CP = end.rsplit('"')
                iris, labels = [], []
                for term in (first, *middle, last):
                    iri, label = mapCell(term)
                    if label is None:
                        label = cell_or_comma_sep
                    iris.append(iri)
                    labels.append(label)

                yield (operator, *iris), tuple(labels)

            else:
                iri, label = mapCell(cell)
                if label is None:
                    yield iri, cell_or_comma_sep  # FIXME need a way to handle this that doesn't break things?
                else:
                    yield iri, label

    config = Config('cut-roundtrip')
    skip = 'alignment label',
    headers, *rows = values
    errors = []
    new = []
    release = []
    for i, neuron_row in enumerate(rows):
        id = None
        label_neuron = None
        definition_neuron = None
        synonyms_neuron = None
        current_neuron = None
        phenotypes = []
        do_release = False
        predicate_notes = {}
        object_notes = {}
        other_notes = {}
        wat = {}
        for j, (header, cell) in enumerate(zip(headers, neuron_row)):
            notes = list(process_note(get_note(i + 1, j, notes_index)))  # + 1 since headers is removed
            if notes and not header.startswith('has'):
                _predicate = convert_other(header)
                if cell:
                    _object = rdflib.Literal(cell)  # FIXME curies etc.
                else:
                    _object = rdf.nil

                other_notes[_predicate, _object] = notes

            if header == 'curie':
                id = OntId(cell).u if cell else None
                continue
            elif header == 'label':
                label_neuron = cell
                if cell in existing:
                    current_neuron = existing[cell]
                elif cell:
                    # TODO
                    new.append(cell)
                else:
                    raise ValueError(cell)  # wat

                continue
            elif header == 'Status':
                # TODO
                if cell == 'Yes':
                    do_release = True
                elif cell == 'Maybe':
                    pass
                elif cell == 'Not yet':
                    pass
                elif cell == 'Delete':
                    pass
                else:
                    pass

                continue
            elif header == 'PMID':
                # TODO
                continue
            elif header == 'Other reference':
                # TODO
                continue
            elif header == 'Other label':
                # TODO
                continue
            elif header == 'definition':
                continue  # FIXME single space differences between the spreadsheet and the source
                if cell:
                    definition_neuron = rdflib.Literal(cell)

                continue
            elif header == 'synonyms':
                if cell:
                    synonyms_neuron = [rdflib.Literal(s.strip())  # FIXME bare comma is extremely dangerous
                                       for s in cell.split(',')]

                continue
            elif header in skip:
                continue

            objects = []
            if cell:
                predicate = convert_header(header)
                if predicate is None:
                    log.debug(f'{(header, cell, notes)}')

                for object, label in convert_cell(cell):
                    if isinstance(label, tuple):  # LogicalPhenotype case
                        _err = []
                        for l in label:
                            if lower_check(l, cell):
                                _err.append((cell, label))
                        if _err:
                            errors.extend(_err)
                        else:
                            objects.append(object)
                    elif lower_check(label, cell):
                        errors.append((cell, label))
                    elif str(id) == object:
                        errors.append((header, cell, object, label))
                        object = None
                    else:
                        objects.append(object)

                if notes:
                    # FIXME this is a hack to only attach to the last value
                    # since we can't distinguish at the moment
                    wat[predicate, object] = notes
                    if object is not None:
                        # object aka iri can be none if we don't find anything
                        object_notes[object] = notes
                    else:
                        predicate_notes[predicate] = notes
                        # FIXME it might also be simpler in some cases
                        # to have this be object_notes[object] = notes
                        # because we are much less likely to have the same
                        # phenotype appear attached to the different dimensions

                        # FIXME comma sep is weak here because the
                        # reference is technically ambiguous
                        # might be an argument for the denormalized form ...
                        # or perhaps having another sheet for cases like that
            else:
                continue

            if predicate and objects:
                for object in objects:  # FIXME has layer location phenotype
                    if isinstance(object, tuple):
                        op, *rest = object
                        pes = (Phenotype(r, predicate) for r in rest)  # FIXME nonhomogeneous phenotypes
                        phenotypes.append(LogicalPhenotype(op, *pes))
                    elif object:
                        phenotypes.append(Phenotype(object, predicate))
                    else:
                        errors.append((object, predicate, cell))
            elif objects:
                errors.append((header, objects))
            else:
                errors.append((header, cell))

        # translate header -> predicate
        # translate cell value to ontology id

        if current_neuron and phenotypes:
            # TODO merge current with changes
            # or maybe we just replace since all the phenotypes should be there?
            log.debug(phenotypes)
            if id is not None:
                log.debug(f'{(id, bool(id))}')
            elif label_neuron:
                id = make_cut_id(label_neuron)

            if id not in expect_pes:
                log.error(f'{id!r} not in cuts!?')
                continue

            if expect_pes[id] != len(phenotypes):
                log.error(f'{id!r} failed roundtrip {len(phenotypes)} != {expect_pes[id]}')
                continue

            neuron = NeuronCUT(*phenotypes, id_=id, label=label_neuron,
                               override=bool(id) or bool(label_neuron))
            neuron.adopt_meta(current_neuron)
            # FIXME occasionally this will error?!
        else:
            continue  # FIXME this pollutes everything ???
            fn = fixname(label_neuron)
            if not phenotypes and i:  # i skips header
                errors.append((i, neuron_row))  # TODO special review for phenos but not current
                phenotypes = Phenotype('TEMP:phenotype/' + fn),

            neuron = NeuronCUT(*phenotypes, id_=make_cut_id(label_neuron),
                               label=label_neuron, override=True)

        # update the meta if there were any changes
        if definition_neuron is not None:
            neuron.definition = definition_neuron

        if synonyms_neuron is not None:
            neuron.synonyms = synonyms_neuron

        try:
            neuron.batchAnnotateByObject(object_notes)
            neuron.batchAnnotate(other_notes)
        except AttributeError as e:
            #embed()
            log.exception(e)  #'something very strange has happened\n', e)
            pass  # FIXME FIXME FIXME

        #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
        # FIXME doesn't quite work in this context, but there are other
        # cases where annotations to the general modality are still desirable
        # FIXME there may be no predicate? if the object fails to match?

        if do_release:
            release.append(neuron)

    return config, errors, new, release
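# Hedged usage sketch (not from the original file): sheet_to_neurons returns a
# (config, errors, new, release) tuple, so a caller would typically write out the
# released neurons and report the rest. 'example_roundtrip' is hypothetical, and
# 'values', 'notes_index', and 'expect_pes' are assumed to come from whatever
# spreadsheet loader the real pipeline uses.
def example_roundtrip(values, notes_index, expect_pes):
    config, errors, new, release = sheet_to_neurons(values, notes_index, expect_pes)
    print(f'{len(release)} neurons marked for release, '
          f'{len(new)} new labels, {len(errors)} cells failed to map')
    return config, release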
def main():
    resources = auth.get_path('resources')
    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    with open((auth.get_path('git-local-base') /
               'entity_mapping/mappings/uberon-nervous').as_posix(), 'rt') as f:
        brain_only = set([l.strip() for l in f.readlines()])

    sgv = Vocabulary(cache=True)
    sgg = Graph(cache=True)

    g = rdflib.Graph()
    g.parse((auth.get_path('ontology-local-repo') /
             'ttl/generated/parcellation/cocomacslim.ttl').as_posix(),
            format='turtle')
    sos = [so for so in g.subject_objects(rdflib.RDFS.label)]

    map_ = []
    smap_ = []
    fma_lookup = {}
    for s, o in sos:
        cc_id = g.qname(s)
        cc_label = o.toPython()
        existing_id = None
        existing_label = None
        existing_fma = ''
        s_existing_id = None
        s_existing_label = None
        s_existing_fma = ''

        cands = sgv.findByTerm(o)
        if not cands:
            cands = []
            scands = sgv.searchByTerm(o)
            if not scands:
                scands = []
        else:
            scands = []

        for cand in cands:
            existing_fma = ''
            if 'curie' in cand:
                existing_id = cand['curie']
            elif 'cocomac' in cand['iri']:
                continue
            else:
                raise ValueError(f'What is this thing? {cand["iri"]}')

            existing_label = cand['labels'][0]
            if existing_id.startswith('UBERON'):
                if existing_id not in brain_only:
                    existing_id = None
                    existing_label = None
                    existing_fma = ''
                else:
                    if existing_id in fma_lookup:
                        existing_fma = fma_lookup[existing_id]
                    else:
                        meta = sgg.getNode(existing_id)['nodes'][0]['meta']
                        if dbx in meta:
                            xrefs = meta[dbx]
                            for ref in xrefs:
                                if ref.startswith('FMA:'):
                                    existing_fma += ref
                        fma_lookup[existing_id] = existing_fma
                    break
            #elif cand['curie'].startswith('NIFGA'):
            #elif cand['curie'].startswith('MBA'):

        if existing_id:
            map_.append((cc_label, cc_id, existing_label, existing_id, existing_fma))

        for scand in scands:
            if 'cocomac' in scand['iri']:
                continue
            elif not scand['curie']:
                continue  # good old remove the key instead of set it to None

            s_existing_fma = ''
            if scand['curie'].startswith('UBERON'):
                if scand['curie'] in brain_only:
                    s_existing_id = scand['curie']
                    s_existing_label = scand['labels'][0]
                    if not s_existing_id:
                        print(scand)
                        continue
                    asdf = sgg.getNode(s_existing_id)
                    #print(asdf, s_existing_id, s_existing_label)
                    if s_existing_id in fma_lookup:
                        s_existing_fma = fma_lookup[s_existing_id]
                    else:
                        meta = asdf['nodes'][0]['meta']
                        if dbx in meta:
                            xrefs = meta[dbx]
                            for ref in xrefs:
                                if ref.startswith('FMA:'):
                                    s_existing_fma += ref
                        fma_lookup[s_existing_id] = s_existing_fma

                    smap_.append((cc_label, cc_id, s_existing_label, s_existing_id, s_existing_fma))
                    #break  # FOW :/

    _ = [print(a) for a in sorted(smap_, key=lambda a: int(a[1].split(':')[1]))]

    with open('/tmp/coco_uber_match.csv', 'wt') as f:
        writer = csv.writer(f)
        writer.writerows(map_)

    with open('/tmp/coco_uber_search.csv', 'wt') as f:
        writer = csv.writer(f)
        writer.writerows(smap_)

    # cocomac -> integrated connectivity terminology mapping
    def lnc(string):
        return string.lower().replace(',', ' ')  # matches the conv in NIF_conn

    ccslim = rdflib.Graph().parse(
        (auth.get_path('ontology-local-repo') /
         'ttl/generated/parcellation/cocomacslim.ttl').as_posix(),
        format='turtle')
    coco_all = [l for l in ccslim.objects(None, rdflib.RDFS.label)]

    intcon = resources / 'NIF_conn_allcols_minimal_clean_filtered2.csv'
    with open(intcon.as_posix(), 'rt') as f:
        ber_rows = [r for r in csv.reader(f)]

    ber_set = set([c for c in zip(*[r for r in ber_rows if r[0] == 'CoCoMac'])][1])

    coco_match_lower_no_comma = set([lnc(t) for t in [c for c in zip(*map_)][0]])
    if smap_:
        coco_search_lower_no_comma = set([lnc(t) for t in [c for c in zip(*smap_)][0]])
    else:
        coco_search_lower_no_comma = set()
    coco_all_lower_no_comma = set([lnc(t) for t in coco_all])

    matched = ber_set.intersection(coco_match_lower_no_comma)
    searched = ber_set.intersection(coco_search_lower_no_comma)
    alled = ber_set.intersection(coco_all_lower_no_comma)

    unmapped = alled.difference(matched.union(searched))
    missing = ber_set.difference(alled)

    nmatch = len(matched)
    nsearch = len(searched)
    nall = len(alled)
    nunmapped = len(unmapped)
    nmissing = len(missing)

    print('# matched =', nmatch)
    print('# searched =', nsearch)
    print('# alled =', nall)
    print('# unmatched =', nunmapped)
    print('# missing =', nmissing)

    print('missing')
    for m in sorted(missing):
        print(m)

    print('unmapped')
    for m in sorted(unmapped):
        print(m)
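# Tiny illustrative sketch (assumed example data, not from the original script) of
# the lowercase/no-comma normalization and set accounting main() performs when
# comparing CoCoMac labels against the connectivity spreadsheet terms.
def _lnc(string):
    return string.lower().replace(',', ' ')

ber_terms = {'prefrontal cortex', 'area 46  dorsal part'}
coco_labels = ['Prefrontal Cortex', 'Area 46, dorsal part', 'Amygdala']
coco_normalized = set(_lnc(t) for t in coco_labels)
matched = ber_terms.intersection(coco_normalized)
missing = ber_terms.difference(coco_normalized)
# matched -> both spreadsheet terms; missing -> set()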
def main():
    DB_URI = 'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db}'
    if socket.gethostname() != 'orpheus':
        config = mysql_conn_helper('localhost', 'nif_eelg', 'nif_eelg_secure', 33060)  # see .ssh/config
    else:
        config = mysql_conn_helper('nif-mysql.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    engine = create_engine(DB_URI.format(**config), echo=True)
    config = None
    del(config)

    insp = inspect(engine)
    terms = [c['name'] for c in insp.get_columns('terms')]
    term_existing_ids = [c['name'] for c in insp.get_columns('term_existing_ids')]
    #breakpoint()
    #sys.exit()

    query = engine.execute('SELECT * FROM term_existing_ids as teid JOIN terms as t ON t.id = teid.tid WHERE t.type != "cde"')
    header = term_existing_ids + terms
    data = query.fetchall()
    cdata = list(zip(*data))

    def datal(head):
        return cdata[header.index(head)]

    ilx_labels = {ilxb[ilx_fragment]: label for ilx_fragment, label in zip(datal('ilx'), datal('label'))}

    mapping_no_sao = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0]]  # 9446
    mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0] or '/sao' in p[0]]  # 9883
    done = [ilx for iri, ilx in mapping]
    obo_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'obolibrary' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in obo_mapping]
    db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'drugbank' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in db_mapping]
    t3db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 't3db' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in t3db_mapping]

    wiki_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neurolex' in p[0] and p[1] not in done]

    sao_mapping = {o.toPython(): s for s, o in
                   Graph().parse((gitf / 'nlxeol/sao-nlxwiki-fixes.ttl').as_posix(),
                                 format='ttl').subject_objects(oboInOwl.hasAlternativeId)}

    scr = Graph().parse((gitf / 'NIF-Ontology/scicrunch-registry.ttl').as_posix(), format='turtle')
    moved_to_scr = {}
    #PROBLEM = set()
    for s, o in scr.subject_objects(oboInOwl.hasDbXref):
        if 'SCR_' in o:
            print(f'WARNING Registry identifier listed as alt id! {s} hasDbXref {o}')
            continue
        uri = NIFSTD[o]
        #try:
        assert uri not in moved_to_scr, f'utoh {uri} was mapped to more than one registry entry! {s} {moved_to_scr[uri]}'
        #except AssertionError:
            #PROBLEM.add(uri)

        moved_to_scr[uri] = s

    to_scr = [(k, v) for k, v in moved_to_scr.items()
              if noneMembers(k, 'SciEx_', 'OMICS_', 'rid_', 'SciRes_',
                             'biodbcore-', 'C0085410', 'doi.org', 'C43960',
                             'doi:10.', 'GAZ:',
                             # 'birnlex_', 'nlx_',
                             'nif-')]

    replacement_graph = createOntology(filename='NIFSTD-ILX-mapping',
                                       name='NLX* to ILX equivalents',
                                       prefixes=makePrefixes('ILX'),)
    scr_rep_graph = createOntology(filename='NIFSTD-SCR-mapping',
                                   name='NLX* to SCR equivalents',
                                   prefixes=makePrefixes('SCR'),)

    _existing = {}
    def dupes(this, other, set_, dupes_):
        if this not in set_:
            set_.add(this)
            _existing[this] = other
        elif _existing[this] != other:
            dupes_[this].add(_existing[this])
            dupes_[this].add(other)

    iri_done = set()
    ilx_done = set()
    iri_dupes = defaultdict(set)
    ilx_dupes = defaultdict(set)
    def check_dupes(iri, ilx):
        dupes(iri, ilx, iri_done, iri_dupes)
        dupes(ilx, iri, ilx_done, ilx_dupes)

    BIRNLEX = Namespace(uPREFIXES['BIRNLEX'])
    trouble = [  # some are _2 issues :/
        # in interlex -- YES WE KNOW THEY DONT MATCH SOME IDIOT DID THIS IN THE PAST
        BIRNLEX['1006'],  # this one appears to be entirely novel despite a note that it was created in 2006...
        BIRNLEX['1152'],  # this was used in uberon ;_;
        BIRNLEX['2476'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2477'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2478'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2479'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2480'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2533'],  # This is in interlex as a wiki id http://uri.interlex.org/base/ilx_0109349 since never used in the ontology, we could add it to the list of 'same as' for cosmetic purposes which will probably happen...
        BIRNLEX['3074'],  # -> CHEBI:26848  # add to slim and bridge...
        BIRNLEX['3076'],  # -> CHEBI:26195  # XXX when we go to load chebi make sure we don't dupe this...
    ]
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaa = [t + '_2' for t in trouble]  # _never_ do this

    # TODO check for cases where there is an ilx and scr for the same id >_<
    sao_help = set()
    for iri, ilx_fragment in chain(mapping, to_scr):  # XXX core loop
        if iri in sao_mapping:
            uri = sao_mapping[iri]
            sao_help.add(uri)
        else:
            uri = URIRef(iri)

        if uri in trouble:
            #print('TROUBLE', iri, ilxb[ilx_fragment])
            print('TROUBLE', ilxb[ilx_fragment])
        if uri in moved_to_scr:  # TODO I think we need to have _all_ the SCR redirects here...
            s, p, o = uri, ilxtr.hasScrId, moved_to_scr[uri]
            scr_rep_graph.g.add((s, p, o))
        else:
            s, p, o = uri, ilxtr.hasIlxId, ilxb[ilx_fragment]
            #s, p, o = o, ilxtr.ilxIdFor, s
            replacement_graph.g.add((s, p, o))

        check_dupes(s, o)

    dupes = {k: v for k, v in iri_dupes.items()}
    idupes = {k: v for k, v in ilx_dupes.items()}
    assert not dupes, f'there are duplicate mappings for an external id {dupes}'
    #print(ilx_dupes)  # there are none yet

    ng = cull_prefixes(replacement_graph.g, prefixes=uPREFIXES)
    ng.filename = replacement_graph.filename
    sng = cull_prefixes(scr_rep_graph.g, prefixes=uPREFIXES)
    sng.filename = scr_rep_graph.filename

    _ = [print(k.toPython(), ' '.join(sorted(ng.qname(_.toPython()) for _ in v)))
         for k, v in idupes.items()]

    # run `resolver_uris = sorted(set(e for t in graph for e in t if 'uri.neuinfo.org' in e))`
    # on a graph with everything loaded to get this file...
    resources = Path(__file__).resolve().absolute().parent / 'resources'
    with open((resources / 'all-uri.neuinfo.org-uris.pickle').as_posix(), 'rb') as f:
        all_uris = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old.pickle').as_posix(), 'rb') as f:
        all_uris_old = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old2.pickle').as_posix(), 'rb') as f:
        all_uris_old2 = pickle.load(f)  # come in as URIRefs...

    resolver_uris = set(e for t in chain(ng.g, sng.g) for e in t if 'uri.neuinfo.org' in e)
    ilx_only = resolver_uris - all_uris  # aka nlxonly
    resolver_not_ilx_only = resolver_uris - ilx_only
    problem_uris = all_uris - resolver_uris
    old_uris = all_uris_old - all_uris
    old_uris2 = all_uris_old2 - all_uris
    dold_uris = all_uris_old - all_uris_old2
    #idold_uris = all_uris_old2 - all_uris_old  # empty as expected
    #nxrefs = Graph().parse((gitf / 'NIF-Ontology/ttl/generated/nlx-xrefs.ttl').as_posix(), format='turtle')
    nxrefs = Graph().parse((gitf / 'nlxeol/nlx-xrefs.ttl').as_posix(), format='turtle')
    xrefs_uris = set(e for t in nxrefs for e in t if 'uri.neuinfo.org' in e)
    test_old_uris = old_uris2 - xrefs_uris

    diff_uris = test_old_uris - ilx_only
    #diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_149160'))  # ORNL was included in an old bad version of the xrefs file and was pulled in in the old all-uris
    # now dealt with by the scr mapping
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_40280,birnlex_1731'))  # one of the doubled neurolex ids
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd'))  # i have zero idea how this snuck in
    assert not diff_uris, 'old uris and problem uris should be identical'

    _ilx = set(e for t in ng.g for e in t)
    _scr = set(e for t in sng.g for e in t)
    for uri in ilx_only:
        if uri in _ilx and uri in _scr:
            raise BaseException('AAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
        elif uri in _ilx:
            g = ng.g
        elif uri in _scr:
            g = sng.g
        else:
            raise BaseException('????????????')
        g.add((uri, ilxtr.isDefinedBy, URIRef('http://neurolex.org')))

    # XXX write the graphs
    ng.write()
    sng.write()

    nsuris = set(uri for uri, ilx in mapping_no_sao)
    auris = set(_.toPython() for _ in all_uris)
    iuris = set(_.toPython() for _ in resolver_uris)
    #sao_missing = iuris - nsuris  # now fixed and cannot run due to addition of scr ids to resolver_uris
    #assert not sao_missing, f'whoops {sao_missing}'
    ilx_missing = auris - iuris
    all_missing = iuris - auris
    #assert not all_missing, f'all is not all! {all_missing}'
    # XXX have to deal with ilx_only separately as NLX-ILX or something

    # fixed
    #sao_add = {o.toPython():s.toPython() for s, p, o in ng.g if s.toPython() in sao_missing}
    #assert len(sao_add) == len(sao_missing), 'EEEEEEEEEEEEEEE'
    #with open('/tmp/please-add-these-sao-ids-as-existing-ids-to-the-listed-interlex-record.json', 'wt') as f:
        #json.dump(sao_add, f, indent=2)

    to_review = sorted(ilx_missing)

    # not relevant anymore
    #with open('thought-to-be-missing.json', 'rt') as f:
        #thought_to_be_missing = json.load(f)

    # from troy has issues
    #with open('nifext-duplicates-and-new.json', 'rt') as f:
        #nifext_data = json.load(f)

    #nifext_dupes = {v['current_nifext_id']:v['dropped_nifext_ids'][-1] if v['dropped_nifext_ids'] else None for v in nifext_data.values()}

    sgv = Vocabulary(cache=True)
    trts = [(v, (sgv.findById(v)['labels'][0]
                 if sgv.findById(v)['labels']
                 else '<--NO-LABEL-->')
             if sgv.findById(v)
             else '<------>')
            for v in to_review]

    sgg = sGraph(cache=True)
    SGG = Namespace(sgg._basePath.rstrip('/') + '/graph/')
    rg = Graph().parse((gitf / 'NIF-Ontology/ttl/unused/NIF-Retired.ttl').as_posix(), format='turtle')
    retired = set(e.toPython() for t in rg for e in t if 'uri.neuinfo.org' in e)
    retfile = '<ttl/unused/NIF-Retired.ttl>'
    help_graph = createOntology(filename='NIFSTD-BLACKHOLE-mapping',
                                name='HELPPPPPPPP!!!!',
                                prefixes=uPREFIXES,)

    def make_rt(to_review_tuples, retired=retired):
        def inner(u, l, retired=retired):
            ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
            if ne:
                curie = help_graph.qname(u)
                help_graph.g.add((URIRef(u), ilxtr.SciGraphLookup,
                                  URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
            if ne and ne['edges']:
                src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
            elif u in retired:
                src = retfile
            else:
                src = '<>'
            return f'{u:<70} {l:<50} {src}'

        out = Async(rate=3000)(deferred(inner)(u, l)
                               for u, l in sorted(to_review_tuples, key=lambda a: a[-1]))
        return '\n'.join(out)

    review_text = make_rt(trts)
    trts2 = [(u, l) for u, l in trts if 'nifext' not in u]
    not_nifext = make_rt(trts2)

    hng = cull_prefixes(help_graph.g, prefixes=uPREFIXES)
    hng.filename = help_graph.filename
    hng.write()

    ###
    #   Accounting of uri.neuinfo.org ids that do not resolve
    ###

    not_in_interlex = set(s for s, o in hng.g.subject_objects(ilxtr.SciGraphLookup))
    bh_deprecated = set(s for s in hng.g.subjects()
                        if sgv.findById(s) and sgv.findById(s)['deprecated'])
    bh_not_deprecated = set(s for s in hng.g.subjects()
                            if sgv.findById(s) and not sgv.findById(s)['deprecated'])
    bh_nifexts = set(s for s in bh_not_deprecated if 'nifext' in s)
    bh_readable = set(s for s in bh_not_deprecated if 'readable' in s)
    unaccounted = not_in_interlex - bh_readable - bh_nifexts - bh_deprecated
    namedinds = set(s for s in unaccounted
                    if sgv.findById(s)
                    and sgg.getNode(s)['nodes'][0]['meta']['types']
                    and sgg.getNode(s)['nodes'][0]['meta']['types'][0] == 'NamedIndividual')
    unaccounted = unaccounted - namedinds
    ual = sorted(o for s in unaccounted for o in hng.g.objects(s, ilxtr.SciGraphLookup))
    report = (
        f'Total {len(not_in_interlex)}\n'
        f'deprecated {len(bh_deprecated)}\n'
        f'nd nifext {len(bh_nifexts)}\n'
        f'nd readable {len(bh_readable)}\n'
        f'nd namedind {len(namedinds)}\n'
        f'unaccounted {len(unaccounted)}\n'
    )
    print(report)

    def reverse_report():
        ilx = Graph()
        ilx.parse('/tmp/interlex.ttl', format='turtle')
        not_in_ontology = set()
        annotations = set()
        relations = set()
        drugbank = set()
        t3db = set()
        for subject in ilx.subjects(rdf.type, owl.Class):
            ok = False
            for object in ilx.objects(subject, oboInOwl.hasDbXref):
                if anyMembers(object, 'uri.neuinfo.org', 'GO_', 'CHEBI_', 'PR_',
                              'PATO_', 'HP_', 'OBI_', 'DOID_', 'COGPO_', 'CAO_',
                              'UBERON_', 'NCBITaxon_', 'SO_', 'IAO_'):  # FIXME do we really import HP?
                    ok = True

                if (subject, rdf.type, owl.AnnotationProperty) in ilx:
                    # FIXME for troy these need to be cleared up
                    annotations.add(subject)
                elif (subject, rdf.type, owl.ObjectProperty) in ilx:
                    relations.add(subject)
                elif 'drugbank' in object:
                    drugbank.add(subject)
                elif 't3db.org' in object:
                    t3db.add(subject)

            if not ok:
                not_in_ontology.add(subject)

        drugbank = drugbank & not_in_ontology
        t3db = t3db & not_in_ontology
        annotations = annotations & not_in_ontology
        relations = relations & not_in_ontology
        unaccounted = not_in_ontology - drugbank - t3db - annotations - relations
        report = (
            f'Total {len(not_in_ontology)}\n'
            f'annotations {len(annotations)}\n'
            f'relations {len(relations)}\n'
            f'drugbank {len(drugbank)}\n'
            f't3db {len(t3db)}\n'
            f'unaccounted {len(unaccounted)}\n'
        )
        print(report)
        return (not_in_ontology, drugbank, unaccounted)

    _, _, un = reverse_report()

    h_uris = set(e for t in hng.g for e in t if 'uri.neuinfo.org' in e)
    real_problems = problem_uris - h_uris

    ###
    #   Missing neurons
    ###

    with open((gitf / 'nlxeol/neuron_data_curated.csv').as_posix()) as f:
        r = csv.reader(f)
        nheader = next(r)
        rows = list(r)

    ndata = list(zip(*rows))

    def datan(head):
        return ndata[nheader.index(head)]


if __name__ == '__main__':
    breakpoint()
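# Small self-contained sketch (with assumed example ids) of the dupes/check_dupes
# pattern used in main() above to catch an external iri mapped to more than one
# ilx id, and vice versa. The example ids are made up for illustration.
from collections import defaultdict

_existing = {}
def dupes(this, other, set_, dupes_):
    if this not in set_:
        set_.add(this)
        _existing[this] = other
    elif _existing[this] != other:
        dupes_[this].add(_existing[this])
        dupes_[this].add(other)

iri_done, ilx_done = set(), set()
iri_dupes, ilx_dupes = defaultdict(set), defaultdict(set)
def check_dupes(iri, ilx):
    dupes(iri, ilx, iri_done, iri_dupes)
    dupes(ilx, iri, ilx_done, ilx_dupes)

check_dupes('http://example.org/nlx_1', 'ilx_0101431')
check_dupes('http://example.org/nlx_1', 'ilx_0999999')  # same iri, second ilx -> flagged
# iri_dupes now records both ilx ids for the conflicting iri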