def repeat(dobig=dobig):  # we don't really know when to stop, so just adjust
    for s, o in graph.subject_objects(owl.imports):
        if os.path.basename(o) not in done and o not in done:
            #if (o, rdf.type, owl.Ontology) not in graph:
            print(o)
            done.append(o)
            ext = os.path.splitext(o)[1]
            fmt = 'turtle' if ext == '.ttl' else 'xml'
            if noneMembers(o, *bigleaves) or dobig:
                graph.parse(o, format=fmt)
def repeat(dobig=False):  # we don't really know when to stop, so just adjust
    for s, o in graph.subject_objects(rdflib.OWL.imports):
        if os.path.basename(o) not in done and o not in done:
            #if (o, rdflib.RDF.type, rdflib.OWL.Ontology) not in graph:
            print(o)
            done.append(o)
            ext = os.path.splitext(o)[1]
            fmt = 'turtle' if ext == '.ttl' else 'xml'
            if noneMembers(o, 'go.owl', 'uberon.owl', 'pr.owl', 'doid.owl', 'taxslim.owl') or dobig:
                graph.parse(o, format=fmt)
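# Both repeat variants (and most functions below) lean on membership helpers
# that are defined elsewhere; a minimal sketch with signatures inferred from
# the call sites, so treat it as illustrative rather than the real definitions.

def anyMembers(container, *candidates):
    # True if any candidate occurs in container (substring test for strings)
    return any(c in container for c in candidates)

def noneMembers(container, *candidates):
    # True if no candidate occurs in container
    return not anyMembers(container, *candidates)

# so noneMembers(o, 'go.owl', ...) reads as
# "skip the big leaf ontologies unless dobig is set"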
def graph_todo(graph, curie_prefixes, get_values):
    ug = makeGraph('big-graph', graph=graph)
    ug.add_known_namespaces('NIFRID')
    fragment_prefixes, ureps = get_values(ug)
    #all_uris = sorted(set(_ for t in graph for _ in t if type(_) == rdflib.URIRef))  # this snags a bunch of other URIs
    #all_uris = sorted(set(_ for _ in graph.subjects() if type(_) != rdflib.BNode))
    #all_uris = set(spo for t in graph.subject_predicates() for spo in t if isinstance(spo, rdflib.URIRef))
    all_uris = set(spo for t in graph for spo in t if isinstance(spo, rdflib.URIRef))
    prefs = set(_.rsplit('#', 1)[0] + '#' if '#' in _
                else (_.rsplit('_', 1)[0] + '_' if '_' in _
                      else _.rsplit('/', 1)[0] + '/')
                for _ in all_uris)
    nots = set(_ for _ in prefs if _ not in curie_prefixes)  # TODO
    sos = set(prefs) - set(nots)
    all_uris = [u if u not in ureps else ureps[u] for u in all_uris]
    #to_rep = set(_.rsplit('#', 1)[-1].split('_', 1)[0] for _ in all_uris if 'ontology.neuinfo.org' in _)
    #to_rep = set(_.rsplit('#', 1)[-1] for _ in all_uris if 'ontology.neuinfo.org' in _)

    ignore = (
        # deprecated and only in as annotations
        'NIFGA:birnAnatomy_011',
        'NIFGA:birnAnatomy_249',
        'NIFORG:birnOrganismTaxon_19',
        'NIFORG:birnOrganismTaxon_20',
        'NIFORG:birnOrganismTaxon_21',
        'NIFORG:birnOrganismTaxon_390',
        'NIFORG:birnOrganismTaxon_391',
        'NIFORG:birnOrganismTaxon_56',
        'NIFORG:birnOrganismTaxon_68',
        'NIFINV:birnlexInvestigation_174',
        'NIFINV:birnlexInvestigation_199',
        'NIFINV:birnlexInvestigation_202',
        'NIFINV:birnlexInvestigation_204',
    )
    ignore = tuple(ug.expand(i) for i in ignore)

    non_normal_identifiers = sorted(u for u in all_uris
                                    if 'ontology.neuinfo.org' in u
                                    and noneMembers(u, *fragment_prefixes)
                                    and not u.endswith('.ttl')
                                    and not u.endswith('.owl')
                                    and u not in ignore)
    print(len(prefs))
    embed()
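# The prefs expression above packs the namespace heuristic into one line:
# split on '#' first, then '_', then '/', keeping the separator. A standalone
# restatement (the uri_prefix name is hypothetical) makes the precedence visible:

def uri_prefix(uri):
    if '#' in uri:
        return uri.rsplit('#', 1)[0] + '#'
    elif '_' in uri:
        return uri.rsplit('_', 1)[0] + '_'
    else:
        return uri.rsplit('/', 1)[0] + '/'

assert uri_prefix('http://example.org/onto#Thing') == 'http://example.org/onto#'
assert uri_prefix('http://purl.obolibrary.org/obo/UBERON_0000955') == 'http://purl.obolibrary.org/obo/UBERON_'
assert uri_prefix('http://example.org/terms/brain') == 'http://example.org/terms/'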
def uri_normalization(uri):
    """ NOTE: this does NOT produce uris """
    try:
        # strip hypothesis extension prefix
        if uri.startswith('chrome-extension://bjfhmglciegochdpefhhlphglcehbmek/content/web/viewer.html?file='):
            junk, uri = uri.split('=', 1)

        # universal fixes
        no_fragment, *_frag = uri.rsplit('#', 1)
        no_trailing_slash = no_fragment.rstrip('/')  # annoying
        _scheme, no_scheme = no_trailing_slash.split('://', 1)

        # special cases
        if 'frontiersin.org' in no_scheme:
            # og:url on frontiers is incorrect
            no_scheme = no_scheme.replace('article/', 'articles/')
        elif 'fasebj.org' in no_scheme:  # FIXME this one has _all_ the variants :/
            no_scheme = (no_scheme
                         .replace('.abstract', '')
                         .replace('.full', '')
                         .replace('.pdf', ''))
        elif no_scheme.endswith('?needAccess=true'):
            no_scheme = no_scheme[:-len('?needAccess=true')]
        elif '?systemMessage' in no_scheme:
            no_scheme, junk = no_scheme.rsplit('?systemMessage', 1)

        # specific fixes
        if anyMembers(no_scheme,
                      'acs.org',
                      'ahajournals.org',
                      'biologicalpsychiatryjournal.com',
                      'ebiomedicine.com',
                      'fasebj.org',
                      'frontiersin.org',
                      'future-science.com',
                      'hindawi.com',
                      'ieee.org',
                      'jclinepi.com',
                      'jpeds.com',
                      'liebertpub.com',
                      'mitpressjournals.org',
                      'molbiolcell.org',
                      'molmetab.com',
                      'neurobiologyofaging.org',
                      'physiology.org',
                      'sagepub.com',
                      'sciencedirect.com',
                      'tandfonline.com',
                      'theriojournal.com',
                      'wiley.com',):
            # NOTE not all the above hit all of these
            # almost all still resolve
            normalized = (no_scheme
                          .replace('/abstract', '')
                          .replace('/abs', '')
                          .replace('/fulltext', '')
                          .replace('/full', '')
                          .replace('/pdf', ''))
        #elif ('sciencedirect.com' in no_scheme):
            #normalized = (no_scheme
                          #.replace('/abs', ''))
        elif ('cell.com' in no_scheme):
            normalized = (no_scheme  # FIXME looks like cell uses /abstract in og:url
                          .replace('/abstract', '/XXX')
                          .replace('/fulltext', '/XXX'))
        elif 'jneurosci.org' in no_scheme:
            # TODO content/early -> resolution_chain(doi)
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', '')
                          .replace('.pdf', ''))  # note .full.pdf is a thing
        elif 'pnas.org' in no_scheme:
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', ''))
        elif 'mdpi.com' in no_scheme:
            normalized = (no_scheme
                          .replace('/htm', ''))
        elif 'f1000research.com' in no_scheme:
            # you should be ashamed of yourselves for being in here for this reason
            normalized, *maybe_version = no_scheme.rsplit('/v', 1)
        elif 'academic.oup.com' in no_scheme:
            normalized, *maybesr = no_scheme.rsplit('?searchresult=', 1)
            _normalized, maybe_junk = normalized.rsplit('/', 1)
            numbers = '0123456789'
            if (maybe_junk[0] not in numbers or  # various ways to detect the human readable junk after the id
                maybe_junk[-1] not in numbers or
                '-' in maybe_junk or
                len(maybe_junk) > 20):
                normalized = _normalized
        elif anyMembers(no_scheme,
                        'jci.org',
                        'nature.com'):
            # cases where safe to remove query fragment
            normalized, *_query = no_scheme.rsplit('?', 1)
            normalized, *table_number = normalized.rsplit('/tables/', 1)
        elif 'pubmed/?term=' in no_scheme and noneMembers(no_scheme, ' ', '+'):
            normalized = no_scheme.replace('?term=', '')
        elif 'nih.gov/pubmed/?' in no_scheme:
            # FIXME scibot vs client norm?
            normalized = no_scheme.replace(' ', '+')
        elif 'govhttp' in no_scheme:  # lol oh dear
            hrm, oops = no_scheme.split('govhttp')
            ded, wat = oops.split('//', 1)
            blargh, suffix = wat.split('/', 1)
            normalized = hrm + 'gov/pmc/' + suffix
        elif 'table/undtbl' in no_scheme:
            normalized, table_number = no_scheme.rsplit('table/undtbl')
        elif anyMembers(no_scheme,
                        'index.php?',):
            # cases where we just use hypothes.is normalization
            _scheme, normalized = uri_normalize(uri).split('://')  # FIXME h dependency
        else:
            normalized = no_scheme

        # examples of the variants that motivated the rules above
        'onlinelibrary.wiley.com/doi/10.1002/cne.23727?wol1URL=/doi/10.1002/cne.23727&regionCode=US-CA&identityKey=e2523300-b934-48c9-b08e-940de05d7335'
        'www.jove.com/video/55441/?language=Japanese'
        'www.nature.com/neuro/journal/v19/n5/full/nn.4282.html'
        'www.nature.com/cr/journal/vaop/ncurrent/full/cr201669a.html'
        'https://www.nature.com/articles/cr201669'
        #{'www.ingentaconnect.com/content/umrsmas/bullmar/2017/00000093/00000002/art00006':
        #[OntId('DOI:10.5343/bms.2016.1044'), OntId('DOI:info:doi/10.5343/bms.2016.1044')]}
        # pmid extract from pmc
        #<meta name="citation_pmid" content="28955177">

        return normalized

    except ValueError as e:  # split fail
        pdf_prefix = 'urn:x-pdf:'
        if uri.startswith(pdf_prefix):
            return uri
        elif uri in bad_uris:
            print('AAAAAAAAAAAAAAAAAAAAAAAAAAA', uri)
            return 'THIS URI IS GARBAGE AND THIS IS ITS NORMALIZED FORM'
        else:
            raise TypeError(uri) from e
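# A rough usage sketch for uri_normalization; the inputs are hypothetical and
# the outputs are traced by hand through the rules above, assuming the
# anyMembers/noneMembers semantics sketched earlier.

uri_normalization('https://www.frontiersin.org/article/10.3389/fnins.2018.00012/full')
# -> 'www.frontiersin.org/articles/10.3389/fnins.2018.00012'

uri_normalization('https://www.ncbi.nlm.nih.gov/pubmed/?term=12345678')
# -> 'www.ncbi.nlm.nih.gov/pubmed/12345678'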
def main():
    branch = auth.get('neurons-branch')
    remote = OntId('NIFTTL:') if branch == 'master' else OntId(f'NIFRAW:{branch}/')

    ont_config = ontneurons(remote)
    ont_neurons = ont_config.neurons()

    bn_config = Config('basic-neurons',
                       # FIXME this should probably be pulled in automatically
                       # from the import statements, and it doesn't work even as is
                       # also a chicken and an egg problem here
                       imports=[remote.iri + 'ttl/generated/swanson.ttl'])

    #RDFL = oq.plugin.get('rdflib')  # FIXME ick
    #rdfl = RDFL(bn_config.core_graph, OntId)
    #OntTerm.query.ladd(rdfl)  # FIXME ick
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()
    #OntTerm.query._services = OntTerm.query._services[:-1]  # FIXME ick

    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()  # FIXME this is extremely slow
    ndl_neurons = sorted(ndl_config.neurons())

    resources = auth.get_path('resources')
    cutcsv = resources / 'cut-development.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)

    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    skipped = []
    bamscok = (NIFSTD.BAMSC1125,)
    for n in (ont_neurons + ndl_neurons):
        if n.id_ and 'BAMSC' in n.id_:
            if n.id_ not in bamscok:
                skipped.append(n)
                continue

        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

            if l in labels:
                n._origLabel = l
                ns.append(n)

    ns = sorted(ns)
    sns = set(n.origLabel for n in ns)

    labels_set1 = labels_set0 - sns

    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    _bl = []  # XXX NOTE THE CONTINUE BELOW
    for n in bn_neurons:
        continue
        # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ', '').replace('Intrinsic', 'intrinsic').replace('Projection', 'projection')

        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)
        else:
            missed.add(l)

        _bl.append(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans

    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        sourceFile = 'nifstd/resources/cut-development.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    SWAN = interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/')
    SWAA = interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/')
    config = Config('cut-development-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)

    with NeuronCUT(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    smatch, rem = get_smatch(labels_set2)

    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3

    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
    # do this before creating the new config
    # even though we are in theory tripling number of neurons in the current config graph
    # it won't show up in the next config (and this is why we need to reengineer)
    raw_neurons_ind_undep = [n.asUndeprecated().asIndicator() for n in raw_neurons]
    config = Config('cut-development', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    # FIXME the call to asUndeprecated currently triggers addition
    # to the current config and output graph as a side effect (ick!)
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = (len(labels_set0), len(sns), len(sans), len(smatch),
                len(labels_set1), len(labels_set2), len(labels_set3))
    prog_report = ('\nProgress:\n'
                   f'total: {progress[0]}\n'
                   f'from nlx: {progress[1]}\n'
                   f'from basic: {progress[2]}\n'
                   f'from match: {progress[3]}\n'
                   f'TODO after nlx: {progress[4]}\n'
                   f'TODO after basic: {progress[5]}\n'
                   f'TODO after match: {progress[6]}\n')
    print(prog_report)
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'

    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]

    partial = {k: v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
        #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)
    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]

    no_location = [n for n in Neuron.neurons()
                   if noneMembers((ilxtr.hasSomaLocatedIn, ilxtr.hasSomaLocatedInLayer),
                                  *n.unique_predicates)]
    if __name__ == '__main__':
        review_rows = export_for_review(config, unmapped, partial, nlx_missing)
        breakpoint()

    return config, unmapped, partial, nlx_missing
def get_smatch(labels_set2):
    contains_rules = make_contains_rules()
    skip = set()
    smatch = set()
    rem = {}
    for l in labels_set2:
        pes = tuple()
        l_rem = l
        for match, pheno in sorted(contains_rules.items(), key=lambda ab: -len(ab[0])):
            if not l_rem:
                break

            if len(match) > len(l_rem):
                continue

            t = None
            if match not in skip and pheno == OntTerm:
                try:
                    t = OntTerm(term=match)
                    log.debug(f'WTF {match} {t}')
                    if t.validated:
                        pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)
                    else:
                        pheno = None
                except oq.exceptions.NotFoundError:
                    skip.add(match)
                    pheno = None
            if match in skip and pheno == OntTerm:
                pheno = None

            if match in l_rem and pheno:
                l_rem = l_rem.replace(match, '').strip()
                pes += (pheno if isinstance(pheno, tuple) else (pheno,))

        if l_rem in exact_rules:
            pes += (exact_rules[l_rem],)
            l_rem = ''

        if l_rem == ' neuron':
            l_rem = ''
        elif l_rem.endswith(' cell'):
            l_rem = l_rem[:-len(' cell')]
            #print('l_rem no cell:', l_rem)
        elif l_rem.endswith(' neuron'):
            l_rem = l_rem[:-len(' neuron')]
            #print('l_rem no neuron:', l_rem)

        hrm = [pe for pe in pes if pe.e == ilxtr.hasSomaLocatedIn]
        if ' ' in l_rem:
            #print('l_rem:', l_rem)
            #breakpoint()
            maybe_region, rest = l_rem.split(' ', 1)
        elif noneMembers(l_rem, *terminals) and not hrm:
            maybe_region, rest = l_rem, ''
            #print('MR:', maybe_region)
        else:
            #print(hrm)
            maybe_region = None

        if maybe_region:
            prefix_rank = ('UBERON', 'SWAN', 'BIRNLEX', 'SAO', 'NLXANAT', 'NLX')
            def key(ot):
                ranked = ot.prefix in prefix_rank
                qargs = ot._query_result._QueryResult__query_args
                if 'term' in qargs and qargs['term'] is not None:
                    arg = qargs['term'].lower()
                else:
                    arg = None

                return (not ranked,
                        prefix_rank.index(ot.prefix) if ranked else 0,
                        not (arg == ot.label.lower()))

            #ots = sorted((term for term in OntTerm.query(term=maybe_region,
                                                          #exclude_prefix=('FMA', 'NLX'))), key=key)
            #if not ots:
            ots = sorted((term for term in OntTerm.query(term=maybe_region,
                                                         exclude_prefix=('FMA',))), key=key)
            if not ots:
                log.error(f'No match for {maybe_region!r}')
            else:
                t = ots[0]
                if 'oboInOwl:id' in t.predicates:  # uberon replacement
                    t = OntTerm(t.predicates['oboInOwl:id'])

                t.set_next_repr('curie', 'label')
                log.info(f'Match for {maybe_region!r} was {t!r}')
                if t.validated:
                    l_rem = rest
                    pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)  # FIXME
                    pes += (pheno,)

        if pes:
            smatch.add(l)

            if not l_rem or l_rem in ('neuron', 'neurons', 'cell', 'Cell', 'positive cell'):
                with NeuronCUT(CUT.Mammalia):
                    NeuronCUT(*zap(pes), id_=make_cut_id(l), label=l, override=True)
            else:
                rem[l] = l_rem

    return smatch, rem
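# The sort on -len(ab[0]) in get_smatch applies contains rules longest-first,
# which matters whenever one rule's trigger is a substring of another's.
# A toy illustration with hypothetical rules (the real ones come from
# make_contains_rules()):

rules = {'cell': 'GenericCellPhenotype', 'basket cell': 'BasketCellPhenotype'}
l_rem = 'cerebellum basket cell'
for match, pheno in sorted(rules.items(), key=lambda ab: -len(ab[0])):
    if match in l_rem:
        l_rem = l_rem.replace(match, '').strip()
        print(f'{match!r} -> {pheno}')

# only 'basket cell' fires, leaving l_rem == 'cerebellum' for the region
# matching step; shortest-first would consume 'cell' and strand
# 'cerebellum basket' with the generic phenotype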
def default(self):
    out_path = self.options.out_path
    BUILD = self.options.BUILD

    glb = Path(auth.get_path('git-local-base'))
    theme_repo = glb / 'org-html-themes'
    theme = theme_repo / 'setup/theme-readtheorg-local.setup'
    prepare_paths(BUILD, out_path, theme_repo, theme)

    doc_config = self._doc_config
    names = tuple(doc_config['repos']) + tuple(self.options.repo)  # TODO fetch if missing ?
    repo_paths = [(glb / name).resolve() for name in names]
    repos = [p.repo for p in repo_paths]
    skip_folders = doc_config.get('skip-folders', tuple())
    rskip = doc_config.get('skip', {})

    # TODO move this into run_all
    docstring_kwargs = makeDocstrings(BUILD, repo_paths, skip_folders, rskip)
    wd_docs_kwargs = [docstring_kwargs]
    if self.options.docstring_only:
        [kwargs.update({'theme': theme}) for _, _, kwargs in wd_docs_kwargs]
        outname, rendered = render_docs(wd_docs_kwargs, out_path,
                                        titles=None, n_jobs=1,
                                        debug=self.options.debug)[0]
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)

        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

        return

    et = tuple()
    wd_docs_kwargs += [(rp, rp / f, makeKwargs(rp, f))
                       for rp in repo_paths
                       for f in rp.repo.git.ls_files().split('\n')
                       if Path(f).suffix in suffixFuncs
                       and only(rp, f)
                       and noneMembers(f, *skip_folders)
                       and f not in rskip.get(rp.name, et)]

    [kwargs.update({'theme': theme}) for _, _, kwargs in wd_docs_kwargs]

    if self.options.spell:
        spell((f.as_posix() for _, f, _ in wd_docs_kwargs))
        return

    titles = doc_config['titles']

    outname_rendered = render_docs(wd_docs_kwargs, out_path, titles,
                                   self.options.jobs, debug=self.options.debug)

    index = [f'<b class="{heading}">{heading}</b>'
             for heading in doc_config['index']]

    _NOTITLE = object()
    for outname, rendered in outname_rendered:
        apath = outname.relative_to(self.options.out_path)
        title = titles.get(apath.as_posix(), _NOTITLE)  # TODO parse out/add titles
        if title is not None:
            value = (hfn.atag(apath)
                     if title is _NOTITLE else
                     hfn.atag(apath, title))
            index.append(value)

        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)

        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

    lt = list(titles)
    def title_key(a):
        title = a.split('"')[1]
        if title not in lt:
            msg = f'{title} missing from {self.options.config}'
            raise ValueError(msg)

        return lt.index(title)

    index_body = '<br>\n'.join(['<h1>Documentation Index</h1>'] +
                               sorted(index, key=title_key))
    with open((out_path / 'index.html').as_posix(), 'wt') as f:
        f.write(hfn.htmldoc(index_body, title=doc_config['title']))
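# title_key sorts index entries by their position in the titles dict; the
# lookup key is the first double-quoted attribute value of each entry, which
# is the heading name for <b> tags and the output path for <a> tags, both of
# which are keys in titles:

entry = '<b class="Developer docs">Developer docs</b>'
assert entry.split('"')[1] == 'Developer docs'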
def inner(local_filepath, remote=False):
    if noneMembers(local_filepath, *bigleaves) or dobig:
        ext = os.path.splitext(local_filepath)[-1]
        if ext == '.ttl':
            infmt = 'turtle'
        else:
            log.info((ext, local_filepath))
            infmt = None

        if remote:
            resp = requests.get(local_filepath)  # TODO nonblocking pull these out, fetch, run inner again until done
            raw = resp.text.encode()
        else:
            try:
                with open(local_filepath, 'rb') as f:
                    raw = f.read()
            except FileNotFoundError as e:
                if local_filepath.startswith('file://'):
                    log.info(f'local_imports has already been run, skipping {local_filepath}')
                    return
                    #raise ValueError('local_imports has already been run') from e
                else:
                    log.exception(e)  # TODO raise a warning if the file cannot be matched
                    # seems like good practice to have any imported ontology under
                    # version control so all imports are guaranteed to have good
                    # provenance and not split the prior information between the
                    # scigraph config and the repository, the repository remains
                    # the source of truth, load.yaml files can then pick a subset
                    # of the properly tracked files to load as they see fit, but
                    # not add to them (at least in pyontutils land)
                    raw = b''

        if oo in raw:  # we only care if there are imports or an ontology iri
            scratch = OntGraph()
            if infmt == 'turtle':
                data, rest = raw.split(b'###', 1)
            elif infmt == None:  # assume xml
                xml_tree = etree.parse(BytesIO(raw))
                xml_root = xml_tree.getroot()
                xml_ontology = xml_tree.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
                xml_root.clear()
                xml_root.append(xml_ontology[0])
                data = etree.tostring(xml_root)

            scratch.parse(data=data, format=infmt)
            for s in scratch.subjects(rdf.type, owl.Ontology):
                triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                # somehow this breaks computing the chain
                #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #for o in scratch[s:p]:
                        #triples.add((s, p, o))

            for s, o in sorted(scratch.subject_objects(p)):
                if revert:
                    raise NotImplementedError('TODO')

                nlfp = o.replace(remote_base, local_base)
                triples.add((s, p, o))
                if 'http://' in local_filepath or 'external' in local_filepath:
                    # FIXME what to do about https used inconsistently :/
                    if 'external' in local_filepath:
                        imported_iri = rdflib.URIRef(local_filepath.replace(local_base, remote_base))  # inefficient
                    else:
                        imported_iri = rdflib.URIRef(local_filepath)

                    if s != imported_iri:
                        imported_iri_vs_ontology_iri[imported_iri] = s  # kept for the record
                        triples.add((imported_iri, p, s))  # bridge imported != ontology iri

                if local_base in nlfp and 'file://' not in o:
                    # FIXME file:// should not be slipping through here...
                    scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                    scratch.remove((s, p, o))

                if nlfp not in done:
                    done.append(nlfp)
                    if local_base in nlfp and 'external' not in nlfp:
                        # skip externals TODO
                        inner(nlfp)
                    elif readonly:  # read external imports
                        if 'external' in nlfp:
                            inner(nlfp)
                        else:
                            inner(nlfp, remote=True)

            if not readonly:
                _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                CustomTurtleSerializer.roundtrip_prefixes = True
                ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                CustomTurtleSerializer.roundtrip_prefixes = _orp
                ndata, comment = ttl.split(b'###', 1)
                out = ndata + b'###' + rest
                with open(local_filepath, 'wb') as f:
                    f.write(out)
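# inner() closes over a fair amount of enclosing state. A guess at that scope,
# reconstructed purely from the free variables used above; every value below is
# a hypothetical placeholder, only the names come from the code.

p = owl.imports            # the predicate whose objects get rewritten
local_base = '/repos/NIF-Ontology/'               # local checkout root (guess)
remote_base = 'http://ontology.neuinfo.org/NIF/'  # iri base mapped to local paths (guess)
bigleaves = ('go.owl', 'uberon.owl', 'pr.owl', 'doid.owl', 'taxslim.owl')
dobig = False              # whether to descend into the big leaf ontologies
readonly = False           # when True, read the chain but never rewrite files
revert = False             # reverse operation, unimplemented (see the raise above)
done = []                  # filepaths already visited, shared across the recursion
triples = set()            # provenance triples accumulated for the import chain
imported_iri_vs_ontology_iri = {}  # records imports whose iri != ontology iri
oo = b'owl:Ontology'       # byte marker checked before parsing (assumption)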
def main():
    from docopt import docopt
    args = docopt(__doc__)

    patch_theme_setup(theme)

    BUILD = working_dir / 'doc_build'
    if not BUILD.exists():
        BUILD.mkdir()

    docs_dir = BUILD / 'docs'
    if not docs_dir.exists():
        docs_dir.mkdir()

    theme_styles_dir = theme_repo / 'styles'
    doc_styles_dir = docs_dir / 'styles'
    if doc_styles_dir.exists():
        shutil.rmtree(doc_styles_dir)

    shutil.copytree(theme_styles_dir, doc_styles_dir)

    docstring_kwargs = docstrings()
    wd_docs_kwargs = [docstring_kwargs]
    if args['--docstring-only']:
        outname, rendered = render_docs(wd_docs_kwargs, BUILD, 1)[0]
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)
        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)
        return

    repos = (Repo(Path(devconfig.ontology_local_repo).resolve().as_posix()),
             Repo(working_dir.as_posix()),
             *(Repo(Path(devconfig.git_local_base, repo_name).as_posix())
               for repo_name in ('ontquery', 'sparc-curation')))

    skip_folders = 'notebook-testing', 'complete', 'ilxutils', 'librdflib'
    rskip = {'pyontutils': ('docs/NeuronLangExample.ipynb',  # exact skip due to moving file
                            'ilxutils/ilx-playground.ipynb'),
             'sparc-curation': ('README.md',),}

    et = tuple()
    # TODO move this into run_all
    #wd_docs_kwargs = [(Path(repo.working_dir).resolve(),
    wd_docs_kwargs += [(Path(repo.working_dir).resolve(),
                        Path(repo.working_dir, f).resolve(),
                        makeKwargs(repo, f))
                       for repo in repos
                       for f in repo.git.ls_files().split('\n')
                       if Path(f).suffix in suffixFuncs
                       #and Path(repo.working_dir).name == 'NIF-Ontology' and f == 'README.md'  # DEBUG
                       #and Path(repo.working_dir).name == 'pyontutils' and f == 'README.md'  # DEBUG
                       #and Path(repo.working_dir).name == 'sparc-curation' and f == 'docs/setup.org'  # DEBUG
                       and noneMembers(f, *skip_folders)
                       and f not in rskip.get(Path(repo.working_dir).name, et)]

    # doesn't work because read-from-minibuffer cannot block
    #compile_org_forever = ['emacs', '-q', '-l',
                           #Path(devconfig.git_local_base,
                                #'orgstrap/init.el').resolve().as_posix(),
                           #'--batch', '-f', 'compile-org-forever']
    #org_compile_process = subprocess.Popen(compile_org_forever,
                                           #stdin=subprocess.PIPE,
                                           #stdout=subprocess.PIPE,
                                           #stderr=subprocess.PIPE)

    if args['--spell']:
        spell((f.as_posix() for _, f, _ in wd_docs_kwargs))
        return

    outname_rendered = render_docs(wd_docs_kwargs, BUILD, int(args['--jobs']))

    titles = {
        'Components': 'Components',
        'NIF-Ontology/README.html': 'Introduction to the NIF Ontology',
        # 'ontquery/README.html': 'Introduction to ontquery',
        'pyontutils/README.html': 'Introduction to pyontutils',
        'pyontutils/nifstd/README.html': 'Introduction to nifstd-tools',
        'pyontutils/neurondm/README.html': 'Introduction to neurondm',
        'pyontutils/ilxutils/README.html': 'Introduction to ilxutils',

        'Developer docs': 'Developer docs',
        'NIF-Ontology/docs/processes.html': 'Ontology development processes (START HERE!)',  # HOWTO
        'NIF-Ontology/docs/development-setup.html': 'Ontology development setup',  # HOWTO
        'sparc-curation/docs/setup.html': 'Developer and curator setup (broader scope but extremely detailed)',
        'NIF-Ontology/docs/import-chain.html': 'Ontology import chain',  # Documentation
        'pyontutils/nifstd/resolver/README.html': 'Ontology resolver setup',
        'pyontutils/nifstd/scigraph/README.html': 'Ontology SciGraph setup',
        'sparc-curation/resources/scigraph/README.html': 'SPARC SciGraph setup',
        'pyontutils/docstrings.html': 'Command line programs',
        'NIF-Ontology/docs/external-sources.html': 'External sources for the ontology',  # Other
        'ontquery/docs/interlex-client.html': 'InterLex client library documentation',

        'Contributing': 'Contributing',
        'pyontutils/nifstd/development/README.html': 'Contributing to the ontology',
        'pyontutils/nifstd/development/community/README.html': 'Contributing term lists to the ontology',
        'pyontutils/neurondm/neurondm/models/README.html': 'Contributing neuron terminology to the ontology',

        'Ontology content': 'Ontology content',
        'NIF-Ontology/docs/brain-regions.html': 'Parcellation schemes',  # Ontology content
        'pyontutils/nifstd/development/methods/README.html': 'Methods and techniques',  # Ontology content
        'NIF-Ontology/docs/Neurons.html': 'Neuron Lang overview',
        'pyontutils/neurondm/docs/NeuronLangExample.html': 'Neuron Lang examples',
        'pyontutils/neurondm/docs/neurons_notebook.html': 'Neuron Lang setup',

        'Specifications': 'Specifications',
        'NIF-Ontology/docs/interlex-spec.html': 'InterLex specification',  # Documentation
        'pyontutils/ttlser/docs/ttlser.html': 'Deterministic turtle specification',

        'Other': 'Other',
        'pyontutils/htmlfn/README.html': 'htmlfn readme',
        'pyontutils/ttlser/README.html': 'ttlser readme',
        'sparc-curation/docs/background.html': '',  # present but not visibly listed
    }

    titles_sparc = {  # TODO abstract this out ...
        'Background': 'Background',
        'sparc-curation/docs/background.html': 'SPARC curation background',
        'Other': 'Other',
        'sparc-curation/README.html': 'sparc-curation readme',
    }

    index = ['<b class="Components">Components</b>',
             '<b class="Developer docs">Developer docs</b>',
             '<b class="Contributing">Contributing</b>',
             '<b class="Ontology content">Ontology content</b>',
             '<b class="Specifications">Specifications</b>',
             '<b class="Other">Other</b>',]
    for outname, rendered in outname_rendered:
        apath = outname.relative_to(BUILD / 'docs')
        title = titles.get(apath.as_posix(), None)  # TODO parse out/add titles
        value = atag(apath) if title is None else atag(apath, title)
        index.append(value)
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)
        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

    lt = list(titles)
    def title_key(a):
        return lt.index(a.split('"')[1])

    index_body = '<br>\n'.join(['<h1>Documentation Index</h1>'] +
                               sorted(index, key=title_key))
    with open((BUILD / 'docs/index.html').as_posix(), 'wt') as f:
        f.write(htmldoc(index_body, title='NIF Ontology documentation index'))
def main():
    DB_URI = 'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db}'
    if socket.gethostname() != 'orpheus':
        config = mysql_conn_helper('localhost', 'nif_eelg', 'nif_eelg_secure', 33060)  # see .ssh/config
    else:
        config = mysql_conn_helper('nif-mysql.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    engine = create_engine(DB_URI.format(**config), echo=True)
    config = None
    del(config)

    insp = inspect(engine)
    terms = [c['name'] for c in insp.get_columns('terms')]
    term_existing_ids = [c['name'] for c in insp.get_columns('term_existing_ids')]
    #breakpoint()
    #sys.exit()

    query = engine.execute('SELECT * FROM term_existing_ids as teid JOIN terms as t ON t.id = teid.tid WHERE t.type != "cde"')
    header = term_existing_ids + terms
    data = query.fetchall()
    cdata = list(zip(*data))

    def datal(head):
        return cdata[header.index(head)]

    ilx_labels = {ilxb[ilx_fragment]: label
                  for ilx_fragment, label in zip(datal('ilx'), datal('label'))}

    mapping_no_sao = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0]]  # 9446
    mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0] or '/sao' in p[0]]  # 9883
    done = [ilx for iri, ilx in mapping]
    obo_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'obolibrary' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in obo_mapping]
    db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'drugbank' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in db_mapping]
    t3db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 't3db' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in t3db_mapping]

    wiki_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neurolex' in p[0] and p[1] not in done]

    sao_mapping = {o.toPython(): s for s, o in
                   Graph().parse((gitf / 'nlxeol/sao-nlxwiki-fixes.ttl').as_posix(),
                                 format='ttl').subject_objects(oboInOwl.hasAlternativeId)}

    scr = Graph().parse((gitf / 'NIF-Ontology/scicrunch-registry.ttl').as_posix(), format='turtle')
    moved_to_scr = {}
    #PROBLEM = set()
    for s, o in scr.subject_objects(oboInOwl.hasDbXref):
        if 'SCR_' in o:
            print(f'WARNING Registry identifier listed as alt id! {s} hasDbXref {o}')
            continue
        uri = NIFSTD[o]
        #try:
        assert uri not in moved_to_scr, f'utoh {uri} was mapped to more than one registry entry! {s} {moved_to_scr[uri]}'
        #except AssertionError:
            #PROBLEM.add(uri)

        moved_to_scr[uri] = s

    to_scr = [(k, v) for k, v in moved_to_scr.items()
              if noneMembers(k, 'SciEx_', 'OMICS_', 'rid_', 'SciRes_',
                             'biodbcore-', 'C0085410', 'doi.org', 'C43960',
                             'doi:10.', 'GAZ:',
                             # 'birnlex_', 'nlx_', 'nif-'
                             )]

    replacement_graph = createOntology(filename='NIFSTD-ILX-mapping',
                                       name='NLX* to ILX equivalents',
                                       prefixes=makePrefixes('ILX'),)
    scr_rep_graph = createOntology(filename='NIFSTD-SCR-mapping',
                                   name='NLX* to SCR equivalents',
                                   prefixes=makePrefixes('SCR'),)

    _existing = {}
    def dupes(this, other, set_, dupes_):
        if this not in set_:
            set_.add(this)
            _existing[this] = other
        elif _existing[this] != other:
            dupes_[this].add(_existing[this])
            dupes_[this].add(other)

    iri_done = set()
    ilx_done = set()
    iri_dupes = defaultdict(set)
    ilx_dupes = defaultdict(set)
    def check_dupes(iri, ilx):
        dupes(iri, ilx, iri_done, iri_dupes)
        dupes(ilx, iri, ilx_done, ilx_dupes)

    BIRNLEX = Namespace(uPREFIXES['BIRNLEX'])
    trouble = [  # some are _2 issues :/
        # in interlex -- YES WE KNOW THEY DONT MATCH SOME IDIOT DID THIS IN THE PAST
        BIRNLEX['1006'],  # this one appears to be entirely novel despite a note that it was created in 2006...
        BIRNLEX['1152'],  # this was used in uberon ;_;
        BIRNLEX['2476'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2477'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2478'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2479'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2480'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2533'],  # This is in interlex as a wiki id http://uri.interlex.org/base/ilx_0109349 since never used in the ontology, we could add it to the list of 'same as' for cosmetic purposes which will probably happen...
        BIRNLEX['3074'],  # -> CHEBI:26848  # add to slim and bridge...
        BIRNLEX['3076'],  # -> CHEBI:26195  # XXX when we go to load chebi make sure we don't dupe this...
    ]
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaa = [t + '_2' for t in trouble]  # _never_ do this

    # TODO check for cases where there is an ilx and scr for the same id >_<
    sao_help = set()
    for iri, ilx_fragment in chain(mapping, to_scr):  # XXX core loop
        if iri in sao_mapping:
            uri = sao_mapping[iri]
            sao_help.add(uri)
        else:
            uri = URIRef(iri)

        if uri in trouble:
            #print('TROUBLE', iri, ilxb[ilx_fragment])
            print('TROUBLE', ilxb[ilx_fragment])

        if uri in moved_to_scr:  # TODO I think we need to have _all_ the SCR redirects here...
            s, p, o = uri, ilxtr.hasScrId, moved_to_scr[uri]
            scr_rep_graph.g.add((s, p, o))
        else:
            s, p, o = uri, ilxtr.hasIlxId, ilxb[ilx_fragment]
            #s, p, o = o, ilxtr.ilxIdFor, s
            replacement_graph.g.add((s, p, o))

        check_dupes(s, o)

    dupes = {k: v for k, v in iri_dupes.items()}
    idupes = {k: v for k, v in ilx_dupes.items()}
    assert not dupes, f'there are duplicate mappings for an external id {dupes}'
    #print(ilx_dupes)  # there are none yet

    ng = cull_prefixes(replacement_graph.g, prefixes=uPREFIXES)
    ng.filename = replacement_graph.filename
    sng = cull_prefixes(scr_rep_graph.g, prefixes=uPREFIXES)
    sng.filename = scr_rep_graph.filename

    _ = [print(k.toPython(), ' '.join(sorted(ng.qname(_.toPython()) for _ in v)))
         for k, v in idupes.items()]

    # run `resolver_uris = sorted(set(e for t in graph for e in t if 'uri.neuinfo.org' in e))`
    # on a graph with everything loaded to get this file...
    resources = Path(__file__).resolve().absolute().parent / 'resources'
    with open((resources / 'all-uri.neuinfo.org-uris.pickle').as_posix(), 'rb') as f:
        all_uris = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old.pickle').as_posix(), 'rb') as f:
        all_uris_old = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old2.pickle').as_posix(), 'rb') as f:
        all_uris_old2 = pickle.load(f)  # come in as URIRefs...

    resolver_uris = set(e for t in chain(ng.g, sng.g) for e in t if 'uri.neuinfo.org' in e)
    ilx_only = resolver_uris - all_uris  # aka nlxonly
    resolver_not_ilx_only = resolver_uris - ilx_only
    problem_uris = all_uris - resolver_uris
    old_uris = all_uris_old - all_uris
    old_uris2 = all_uris_old2 - all_uris
    dold_uris = all_uris_old - all_uris_old2
    #idold_uris = all_uris_old2 - all_uris_old  # empty as expected
    #nxrefs = Graph().parse((gitf / 'NIF-Ontology/ttl/generated/nlx-xrefs.ttl').as_posix(), format='turtle')
    nxrefs = Graph().parse((gitf / 'nlxeol/nlx-xrefs.ttl').as_posix(), format='turtle')
    xrefs_uris = set(e for t in nxrefs for e in t if 'uri.neuinfo.org' in e)
    test_old_uris = old_uris2 - xrefs_uris
    diff_uris = test_old_uris - ilx_only
    #diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_149160'))
    # ORNL was included in an old bad version of the xrefs file and was pulled in in the old all-uris
    # now dealt with by the scr mapping
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_40280,birnlex_1731'))  # one of the doubled neurolex ids
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd'))  # i have zero idea how this snuck in
    assert not diff_uris, 'old uris and problem uris should be identical'

    _ilx = set(e for t in ng.g for e in t)
    _scr = set(e for t in sng.g for e in t)
    for uri in ilx_only:
        if uri in _ilx and uri in _scr:
            raise BaseException('AAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
        elif uri in _ilx:
            g = ng.g
        elif uri in _scr:
            g = sng.g
        else:
            raise BaseException('????????????')
        g.add((uri, ilxtr.isDefinedBy, URIRef('http://neurolex.org')))

    # XXX write the graphs
    ng.write()
    sng.write()

    nsuris = set(uri for uri, ilx in mapping_no_sao)
    auris = set(_.toPython() for _ in all_uris)
    iuris = set(_.toPython() for _ in resolver_uris)
    #sao_missing = iuris - nsuris  # now fixed and cannot run due to addition of scr ids to resolver_uris
    #assert not sao_missing, f'whoops {sao_missing}'
    ilx_missing = auris - iuris
    all_missing = iuris - auris
    #assert not all_missing, f'all is not all! {all_missing}'
    # XXX have to deal with ilx_only separately as NLX-ILX or something

    # fixed
    #sao_add = {o.toPython():s.toPython() for s, p, o in ng.g if s.toPython() in sao_missing}
    #assert len(sao_add) == len(sao_missing), 'EEEEEEEEEEEEEEE'
    #with open('/tmp/please-add-these-sao-ids-as-existing-ids-to-the-listed-interlex-record.json', 'wt') as f:
        #json.dump(sao_add, f, indent=2)

    to_review = sorted(ilx_missing)

    # not relevant anymore
    #with open('thought-to-be-missing.json', 'rt') as f:
        #thought_to_be_missing = json.load(f)

    # from troy has issues
    #with open('nifext-duplicates-and-new.json', 'rt') as f:
        #nifext_data = json.load(f)
    #nifext_dupes = {v['current_nifext_id']:v['dropped_nifext_ids'][-1]
                    #if v['dropped_nifext_ids'] else None
                    #for v in nifext_data.values()}

    sgv = Vocabulary(cache=True)
    trts = [(v, (sgv.findById(v)['labels'][0]
                 if sgv.findById(v)['labels']
                 else '<--NO-LABEL-->')
             if sgv.findById(v)
             else '<------>') for v in to_review]

    sgg = sGraph(cache=True)
    SGG = Namespace(sgg._basePath.rstrip('/') + '/graph/')
    rg = Graph().parse((gitf / 'NIF-Ontology/ttl/unused/NIF-Retired.ttl').as_posix(), format='turtle')
    retired = set(e.toPython() for t in rg for e in t if 'uri.neuinfo.org' in e)
    retfile = '<ttl/unused/NIF-Retired.ttl>'
    help_graph = createOntology(filename='NIFSTD-BLACKHOLE-mapping',
                                name='HELPPPPPPPP!!!!',
                                prefixes=uPREFIXES,)

    def make_rt(to_review_tuples, retired=retired):
        def inner(u, l, retired=retired):
            ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
            if ne:
                curie = help_graph.qname(u)
                help_graph.g.add((URIRef(u), ilxtr.SciGraphLookup,
                                  URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
            if ne and ne['edges']:
                src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
            elif u in retired:
                src = retfile
            else:
                src = '<>'
            return f'{u:<70} {l:<50} {src}'
        out = Async(rate=3000)(deferred(inner)(u, l)
                               for u, l in sorted(to_review_tuples, key=lambda a: a[-1]))
        return '\n'.join(out)

    review_text = make_rt(trts)
    trts2 = [(u, l) for u, l in trts if 'nifext' not in u]
    not_nifext = make_rt(trts2)

    hng = cull_prefixes(help_graph.g, prefixes=uPREFIXES)
    hng.filename = help_graph.filename
    hng.write()

    ###
    # Accounting of uri.neuinfo.org ids that do not resolve
    ###

    not_in_interlex = set(s for s, o in hng.g.subject_objects(ilxtr.SciGraphLookup))
    bh_deprecated = set(s for s in hng.g.subjects()
                        if sgv.findById(s) and sgv.findById(s)['deprecated'])
    bh_not_deprecated = set(s for s in hng.g.subjects()
                            if sgv.findById(s) and not sgv.findById(s)['deprecated'])
    bh_nifexts = set(s for s in bh_not_deprecated if 'nifext' in s)
    bh_readable = set(s for s in bh_not_deprecated if 'readable' in s)
    unaccounted = not_in_interlex - bh_readable - bh_nifexts - bh_deprecated
    namedinds = set(s for s in unaccounted
                    if sgv.findById(s)
                    and sgg.getNode(s)['nodes'][0]['meta']['types']
                    and sgg.getNode(s)['nodes'][0]['meta']['types'][0] == 'NamedIndividual')
    unaccounted = unaccounted - namedinds
    ual = sorted(o for s in unaccounted for o in hng.g.objects(s, ilxtr.SciGraphLookup))
    report = (
        f'Total {len(not_in_interlex)}\n'
        f'deprecated {len(bh_deprecated)}\n'
        f'nd nifext {len(bh_nifexts)}\n'
        f'nd readable {len(bh_readable)}\n'
        f'nd namedind {len(namedinds)}\n'
        f'unaccounted {len(unaccounted)}\n'
    )
    print(report)

    def reverse_report():
        ilx = Graph()
        ilx.parse('/tmp/interlex.ttl', format='turtle')
        not_in_ontology = set()
        annotations = set()
        relations = set()
        drugbank = set()
        t3db = set()
        for subject in ilx.subjects(rdf.type, owl.Class):
            ok = False
            for object in ilx.objects(subject, oboInOwl.hasDbXref):
                if anyMembers(object, 'uri.neuinfo.org', 'GO_', 'CHEBI_', 'PR_',
                              'PATO_', 'HP_', 'OBI_', 'DOID_', 'COGPO_', 'CAO_',
                              'UBERON_', 'NCBITaxon_', 'SO_', 'IAO_'):
                    # FIXME do we really import HP?
                    ok = True

                if (subject, rdf.type, owl.AnnotationProperty) in ilx:
                    # FIXME for troy these need to be cleared up
                    annotations.add(subject)
                elif (subject, rdf.type, owl.ObjectProperty) in ilx:
                    relations.add(subject)
                elif 'drugbank' in object:
                    drugbank.add(subject)
                elif 't3db.org' in object:
                    t3db.add(subject)

            if not ok:
                not_in_ontology.add(subject)

        drugbank = drugbank & not_in_ontology
        t3db = t3db & not_in_ontology
        annotations = annotations & not_in_ontology
        relations = relations & not_in_ontology
        unaccounted = not_in_ontology - drugbank - t3db - annotations - relations
        report = (
            f'Total {len(not_in_ontology)}\n'
            f'annotations {len(annotations)}\n'
            f'relations {len(relations)}\n'
            f'drugbank {len(drugbank)}\n'
            f't3db {len(t3db)}\n'
            f'unaccounted {len(unaccounted)}\n'
        )
        print(report)
        return (not_in_ontology, drugbank, unaccounted)

    _, _, un = reverse_report()

    h_uris = set(e for t in hng.g for e in t if 'uri.neuinfo.org' in e)
    real_problems = problem_uris - h_uris

    ###
    # Missing neurons
    ###

    with open((gitf / 'nlxeol/neuron_data_curated.csv').as_posix()) as f:
        r = csv.reader(f)
        nheader = next(r)
        rows = list(r)

    ndata = list(zip(*rows))

    def datan(head):
        return ndata[nheader.index(head)]


if __name__ == '__main__':
    breakpoint()
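# datal and datan above implement the same pattern: transpose a row-major
# result set, then index columns by header name. A self-contained equivalent:

header = ['id', 'label']
rows = [(1, 'brain'), (2, 'neuron')]
cdata = list(zip(*rows))  # transpose rows to columns

def datal(head):
    return cdata[header.index(head)]

assert datal('label') == ('brain', 'neuron')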
def main():
    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()
    ndl_neurons = ndl_config.neurons()
    bn_config = Config('basic-neurons')
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()
    resources = Path(devconfig.resources)
    cutcsv = resources / 'common-usage-types.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)
    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    for n in ndl_neurons:
        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

            if l in labels:
                n._origLabel = l
                ns.append(n)

    sns = set(n.origLabel for n in ns)
    labels_set1 = labels_set0 - sns
    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    for n in bn_neurons:
        continue
        # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ', '').replace('Intrinsic', 'intrinsic').replace('Projection', 'projection')
        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)
        else:
            missed.add(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans

    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        sourceFile = 'nifstd/resources/common-usage-types.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    config = Config('common-usage-types-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA': interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)

    def zap(pes):
        for pe in pes:
            if pe not in (Phenotype('BIRNLEX:212', ilxtr.hasTaxonRank),
                          Phenotype('NCBITaxon:7742', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:252', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:516', ilxtr.hasTaxonRank),):
                yield pe

    with Neuron(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    contains_rules = make_contains_rules()
    skip = set()
    smatch = set()
    rem = {}
    for l in labels_set2:
        pes = tuple()
        l_rem = l
        for match, pheno in contains_rules.items():
            t = None
            if match not in skip and pheno == OntTerm:
                try:
                    t = OntTerm(term=match)
                    print('WTF', match, t)
                    if t.validated:
                        pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)
                    else:
                        pheno = None
                except oq.exceptions.NotFoundError:
                    skip.add(match)
                    pheno = None
            if match in skip and pheno == OntTerm:
                pheno = None

            if match in l_rem and pheno:
                l_rem = l_rem.replace(match, '').strip()
                pes += (pheno,)

        if l_rem in exact_rules:
            pes += (exact_rules[l_rem],)
            l_rem = ''

        if l_rem == ' neuron':
            l_rem = ''
        elif l_rem.endswith(' cell'):
            l_rem = l_rem[:-len(' cell')]
            #print('l_rem no cell:', l_rem)
        elif l_rem.endswith(' neuron'):
            l_rem = l_rem[:-len(' neuron')]
            #print('l_rem no neuron:', l_rem)

        hrm = [pe for pe in pes if pe.e == ilxtr.hasSomaLocatedIn]
        if ' ' in l_rem:
            #print('l_rem:', l_rem)
            #embed()
            maybe_region, rest = l_rem.split(' ', 1)
        elif noneMembers(l_rem, *terminals) and not hrm:
            maybe_region, rest = l_rem, ''
            #print('MR:', maybe_region)
        else:
            #print(hrm)
            maybe_region = None

        if maybe_region:
            prefix_rank = ('UBERON', 'SWAN', 'BIRNLEX', 'SAO', 'NLXANAT')
            def key(ot):
                ranked = ot.prefix in prefix_rank
                arg = ot._query_result._QueryResult__query_args['term'].lower()
                return (not ranked,
                        prefix_rank.index(ot.prefix) if ranked else 0,
                        not (arg == ot.label.lower()))

            #t = OntTerm(term=maybe_region)
            # using query avoids the NoExplicitIdError
            ots = sorted((qr.OntTerm for qr in OntTerm.query(term=maybe_region,
                                                             exclude_prefix=('FMA',))), key=key)
            if not ots:
                log.error(f'No match for {maybe_region!r}')
            else:
                t = ots[0]
                if 'oboInOwl:id' in t.predicates:  # uberon replacement
                    t = OntTerm(t.predicates['oboInOwl:id'])

                t.set_next_repr('curie', 'label')
                log.info(f'Match for {maybe_region!r} was {t!r}')
                if t.validated:
                    l_rem = rest
                    pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)  # FIXME
                    pes += (pheno,)

        if pes:
            smatch.add(l)
            rem[l] = l_rem

            with Neuron(CUT.Mammalia):
                NeuronCUT(*zap(pes), id_=make_cut_id(l), label=l, override=True)

    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3

    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
    config = Config('common-usage-types', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA': interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = (len(labels_set0), len(sns), len(sans), len(smatch),
                len(labels_set1), len(labels_set2), len(labels_set3))
    print('\nProgress:\n'
          f'total: {progress[0]}\n'
          f'from nlx: {progress[1]}\n'
          f'from basic: {progress[2]}\n'
          f'from match: {progress[3]}\n'
          f'TODO after nlx: {progress[4]}\n'
          f'TODO after basic: {progress[5]}\n'
          f'TODO after match: {progress[6]}\n')
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'

    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]

    partial = {k: v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
        #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)
    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]

    if __name__ == '__main__':
        rows = export_for_review(config, unmapped, partial, nlx_missing)
        embed()

    return config, unmapped, partial, nlx_missing
def inner(local_filepath, remote=False):
    if noneMembers(local_filepath, *bigleaves) or dobig:
        ext = os.path.splitext(local_filepath)[-1]
        if ext == '.ttl':
            infmt = 'turtle'
        else:
            print(ext, local_filepath)
            infmt = None

        if remote:
            resp = requests.get(local_filepath)  # TODO nonblocking pull these out, fetch, run inner again until done
            raw = resp.text.encode()
        else:
            try:
                with open(local_filepath, 'rb') as f:
                    raw = f.read()
            except FileNotFoundError as e:
                if local_filepath.startswith('file://'):
                    print('local_imports has already been run, skipping', local_filepath)
                    return
                    #raise ValueError('local_imports has already been run') from e
                else:
                    print(e)
                    raw = b''

        if oo in raw:  # we only care if there are imports or an ontology iri
            scratch = rdflib.Graph()
            if infmt == 'turtle':
                data, rest = raw.split(b'###', 1)
            elif infmt == None:  # assume xml
                xml_tree = etree.parse(BytesIO(raw))
                xml_root = xml_tree.getroot()
                xml_ontology = xml_tree.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
                xml_root.clear()
                xml_root.append(xml_ontology[0])
                data = etree.tostring(xml_root)

            scratch.parse(data=data, format=infmt)
            for s in scratch.subjects(rdf.type, owl.Ontology):
                triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))

            for s, o in sorted(scratch.subject_objects(p)):
                nlfp = o.replace(remote_base, local_base)
                triples.add((s, p, o))
                if 'http://' in local_filepath or 'external' in local_filepath:
                    # FIXME what to do about https used inconsistently :/
                    if 'external' in local_filepath:
                        imported_iri = rdflib.URIRef(local_filepath.replace(local_base, remote_base))  # inefficient
                    else:
                        imported_iri = rdflib.URIRef(local_filepath)

                    if s != imported_iri:
                        imported_iri_vs_ontology_iri[imported_iri] = s  # kept for the record
                        triples.add((imported_iri, p, s))  # bridge imported != ontology iri

                if local_base in nlfp and 'file://' not in o:
                    # FIXME file:// should not be slipping through here...
                    scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                    scratch.remove((s, p, o))

                if nlfp not in done:
                    done.append(nlfp)
                    if local_base in nlfp and 'external' not in nlfp:
                        # skip externals TODO
                        inner(nlfp)
                    elif readonly:  # read external imports
                        if 'external' in nlfp:
                            inner(nlfp)
                        else:
                            inner(nlfp, remote=True)

            if not readonly:
                ttl = scratch.serialize(format='nifttl')
                ndata, comment = ttl.split(b'###', 1)
                out = ndata + b'###' + rest
                with open(local_filepath, 'wb') as f:
                    f.write(out)