def load_header(filepath, remote=False):
    """Load only the header (the ``owl:Ontology`` section) of an ontology file.

    :param filepath: filesystem path or URL of the ontology serialization
    :param remote: when True, fetch ``filepath`` over HTTP instead of reading
                   from disk
    :returns: an ``OntGraph`` holding just the header, or ``None`` (implicit)
              when the raw bytes contain no ``owl:Ontology`` marker at all
    """
    oo = b'owl:Ontology'
    path = Path(filepath)
    if path.suffix == '.ttl':
        infmt = 'turtle'
    else:
        infmt = 'xml'  # FIXME assumption

    if remote:
        resp = requests.get(
            filepath
        )  # TODO nonblocking pull these out, fetch, run inner again until done
        raw = resp.text.encode()
    else:
        with open(filepath, 'rb') as f:  # do not catch FileNotFoundErrors
            raw = f.read()

    if oo in raw:  # we only care if there are imports or an ontology iri
        scratch = OntGraph()
        if infmt == 'turtle':
            # keep only the text before the first ### section divider;
            # partition (unlike split) degrades gracefully to the whole
            # file when no divider is present instead of raising
            data, _sep, _rest = raw.partition(b'###')
        else:  # xml
            # BUG FIX: this branch used to be ``elif infmt == None`` which
            # could never match (infmt is always 'turtle' or 'xml' above),
            # so xml inputs crashed with NameError on ``data`` below
            xml_tree = etree.parse(BytesIO(raw))
            xml_root = xml_tree.getroot()
            xml_ontology = xml_tree.xpath(
                "/*[local-name()='RDF']/*[local-name()='Ontology']")
            # strip everything except the Ontology element itself
            xml_root.clear()
            xml_root.append(xml_ontology[0])
            data = etree.tostring(xml_root)
        scratch.parse(data=data, format=infmt)
        return scratch
def npokb():
    """Interactive scaffold: map temporary neuron ids into the npokb index.

    Parses the npokb index graph plus four generated neuron ttl files,
    runs ``mapTempToIndex`` on each, writes the updated index back, and
    drops into ``breakpoint()`` for manual review of the results.
    """
    index_graph = OntGraph(path=auth.get_path('ontology-local-repo') /
                           'ttl/generated/neurons/npokb-index.ttl')

    if index_graph.path.exists():
        index_graph.parse()

    # NOTE(review): ``npokb`` here resolves to this very function if both are
    # defined at module level — presumably a ``npokb`` Namespace is intended;
    # TODO confirm against the module's imports.
    # testing
    index_graph.bind('npokb', npokb)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]

    ios = []
    for eff in ('common-usage-types', 'huang-2017', 'markram-2015',
                'allen-cell-types'):
        path = auth.get_path(
            'ontology-local-repo') / f'ttl/generated/neurons/{eff}.ttl'
        input_graph = OntGraph(path=path)
        input_graph.parse()
        # rewrite TEMP ids in input_graph to stable ids from the index
        output_graph = input_graph.mapTempToIndex(index_graph, npokb, TEMP)
        ios.append((input_graph, output_graph))

    # spot-check the first pair: added/removed/changed subjects
    input_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(input_graph)
    index_graph.write()  # persist the updated index; outputs stay unwritten
    # [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()  # deliberate: drop into the debugger to inspect a, r, c
def main():
    """Entry point: currently just runs ``test()``.

    Everything after the bare ``return`` is unreachable work-in-progress
    scaffolding kept as notes; it references names (``index_graph``, ``ILX``,
    ``ilxtr``) that are not defined in this scope and would fail if ever
    re-enabled without being fixed up first.
    """
    #InterLexSneechenator()
    test()
    return
    # --- dead code below: retained intentionally as a WIP sketch ---
    # testing
    index_graph.bind('ILX', ILX)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]
    ios = []
    for eff in ('phenotype-core.ttl', 'phenotypes.ttl'):
        path = auth.get_path('ontology-local-repo') / eff
        input_graph = OntGraph(path=path)
        input_graph.parse()
        output_graph = input_graph.mapTempToIndex(index_graph, ILX, ilxtr)
        ios.append((input_graph, output_graph))

    input_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(input_graph)
    index_graph.write()
    # [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()
def triples(self):
    """Yield RDF triples describing each identifier-metadata blob.

    For Crossref-sourced records, the DOI's own ttl record is fetched and
    merged in, with an ``owl:sameAs`` bridge from our subject to theirs.
    Basic publisher/title/date assertions are yielded for every Crossref
    record at the end of each iteration.

    NOTE(review): assumes each blob is a dict with at least 'id' and, for
    the happy path, 'source', 'type', 'publisher', 'title' keys — TODO
    confirm against whatever populates ``self.data['identifier_metadata']``.
    """
    crossref_doi_pred = rdflib.term.URIRef(
        'http://prismstandard.org/namespaces/basic/2.1/doi')
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)
        if not hasattr(id, 'asUri'):
            breakpoint()  # debugging hook: identifier lacks asUri, inspect it
        s = id.asUri(rdflib.URIRef)
        if 'source' in blob:
            source = blob['source']  # FIXME we need to wrap this in our normalized representation
            if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
                pos = (
                    (rdf.type, owl.NamedIndividual),
                    (rdf.type, TEMP[blob['type']]),
                    (dc.publisher, blob['publisher']),
                    #(dc.type, blob['type']),  # FIXME semantify
                    (dc.title, blob['title']),
                    (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                )
                g = OntGraph()
                doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                data = doi.ttl()
                if data is None:  # blackfynn has some bad settings on their doi records ...
                    return
                try:
                    g.parse(data=data, format='ttl')  # FIXME network bad
                except BaseException as e:
                    # deliberate best-effort: log and continue with whatever parsed
                    loge.exception(e)
                # comprehension has its own scope, so this s does not clobber
                # the outer subject s
                _tr = [s for s, p, o in g if p == crossref_doi_pred]
                if _tr:
                    _their_record_s = _tr[0]
                    yield s, owl.sameAs, _their_record_s
                    yield from g
                else:
                    g.debug()
                    log.critical('No crossref doi section in graph!')
            else:
                msg = f'dont know what to do with {source}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return
        else:
            msg = f'dont know what to do with {blob} for {id.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        # pos is always bound here: the non-Crossref and missing-source
        # branches both return before reaching this loop
        for p, oraw in pos:
            if oraw is not None:
                o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                yield s, p, o
def triples(self):
    """Yield RDF triples for each identifier-metadata blob.

    Crossref-sourced records get their DOI ttl record merged in with an
    ``owl:sameAs`` bridge; publisher/title/date assertions follow.
    Records without a recognized source end the generator after logging.
    """
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)

        s = id.asType(rdflib.URIRef)

        # guard: nothing we can do without a source annotation
        if 'source' not in blob:
            msg = f'dont know what to do with {blob} for {id.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        source = blob['source']  # FIXME we need to wrap this in our normalized representation
        # guard: only Crossref records are handled at the moment
        if source != 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
            msg = f'dont know what to do with {source}'
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        po_pairs = (
            (rdf.type, owl.NamedIndividual),
            (rdf.type, TEMP[blob['type']]),
            (dc.publisher, blob['publisher']),
            #(dc.type, blob['type']),  # FIXME semantify
            (dc.title, blob['title']),
            (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
        )

        graph = OntGraph()
        if isinstance(id, idlib.Doi):
            doi = id
        else:
            # FIXME idlib streams need to recognize their own type in __new__
            doi = idlib.Doi(id)
        graph.parse(data=doi.ttl(), format='ttl')  # FIXME network bad

        doi_predicate = rdflib.term.URIRef(
            'http://prismstandard.org/namespaces/basic/2.1/doi')
        # comprehension scope keeps the outer s intact
        their_record_s = [s for s, p, o in graph if p == doi_predicate][0]
        yield s, owl.sameAs, their_record_s
        yield from graph

        for p, oraw in po_pairs:
            if oraw is None:
                continue
            if isinstance(oraw, rdflib.URIRef):
                o = oraw
            else:
                o = rdflib.Literal(oraw)
            yield s, p, o
def loadall(git_local, repo_name, local=False, dobig=False):
    """Parse every ttl file under ``<git_local>/<repo_name>/ttl`` into a
    single graph, then (unless ``local``) repeatedly chase ``owl:imports``
    to pull in anything referenced that has not been loaded yet.

    :returns: the populated OntGraph
    """
    local_base = jpth(git_local, repo_name)
    lb_ttl = os.path.realpath(jpth(local_base, 'ttl'))
    #match = (rdflib.term.URIRef('http://purl.org/dc/elements/1.1/member'),  # iao.owl
             #rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
             #rdflib.term.URIRef('http://www.w3.org/2002/07/owl#AnnotationProperty'))

    done = []
    graph = OntGraph()

    # gather ttl files up to three directory levels below the ttl root
    patterns = ('*', '*/*', '*/*/*')
    for filename in (f for pattern in patterns
                     for f in glob(lb_ttl + '/' + pattern + '.ttl')):
        print(filename)
        done.append(os.path.basename(filename))
        graph.parse(filename, format='turtle')
        #if match in graph:
            #raise BaseException('Evil file found %s' % filename)

    def repeat(dobig=dobig):  # we don't really know when to stop, so just adjust
        # one pass over the current owl:imports closure, loading anything new
        for s, o in graph.subject_objects(owl.imports):
            if os.path.basename(o) in done or o in done:
                continue
            #if (o, rdf.type, owl.Ontology) not in graph:
            print(o)
            done.append(o)
            fmt = 'turtle' if os.path.splitext(o)[1] == '.ttl' else 'xml'
            if noneMembers(o, *bigleaves) or dobig:
                graph.parse(o, format=fmt)
                #if match in graph:
                    #raise BaseException('Evil file found %s' % o)

    #if local:
        #repeat(False)
    #else:
    if not local:
        # ten passes is a heuristic upper bound on import-chain depth
        for _ in range(10):
            repeat(True)

    return graph
def npokb_mapping():
    """Interactive scaffold: map temp neuron ids to stable npokb index ids.

    Unlike the simpler variant, this first maps identifiers against the
    previous git HEAD version of each file (``mapStableIdentifiers``) so
    already-published ids stay stable, then maps remaining TEMP ids via the
    index.  Writes the index AND the per-file outputs, then drops into
    ``breakpoint()`` for manual review.
    """
    index_graph = OntGraph(path=auth.get_path('ontology-local-repo') /
                           'ttl/generated/neurons/npokb-index.ttl')

    if index_graph.path.exists():
        index_graph.parse()

    # NOTE(review): ``npokb`` is presumably a Namespace imported elsewhere —
    # TODO confirm it is not shadowed by a function of the same name.
    # testing
    index_graph.bind('npokb', npokb)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]

    ios = []
    for eff in (
            'common-usage-types',
            'huang-2017',
            'markram-2015',
            'allen-cell-types',
    ):  # FIXME if the index id is already being used it is still added as a temp id incorrectly
        path = auth.get_path(
            'ontology-local-repo') / f'ttl/generated/neurons/{eff}.ttl'
        org = OntResGit(
            path, ref='HEAD'
        )  # HEAD is default but just for clarity set it explicitly here
        prev_graph = org.graph
        input_graph = OntGraph(path=path)
        input_graph.parse()
        # keep ids that already exist in the committed version stable
        mapped_graph = input_graph.mapStableIdentifiers(
            prev_graph, ilxtr.origLabel)
        # then assign index ids to whatever TEMP ids remain
        output_graph = mapped_graph.mapTempToIndex(index_graph, npokb, TEMP)
        ios.append((mapped_graph, output_graph))

    # spot-check the first pair: added/removed/changed subjects
    mapped_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(mapped_graph)
    index_graph.write()
    [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()  # deliberate: drop into the debugger to inspect a, r, c
def inner(local_filepath, remote=False):
    """Recursively rewrite remote import iris in an ontology file to local
    ``file://`` paths, then follow each import and do the same.

    NOTE(review): this is a closure — ``dobig``, ``bigleaves``, ``oo``,
    ``triples``, ``done``, ``p`` (presumably ``owl.imports``), ``revert``,
    ``readonly``, ``remote_base``, ``local_base`` and
    ``imported_iri_vs_ontology_iri`` are all free variables from the
    enclosing scope — TODO confirm against the enclosing function.

    :param local_filepath: path (or URL when ``remote``) of the file to process
    :param remote: fetch over HTTP instead of reading from disk
    """
    if noneMembers(local_filepath, *bigleaves) or dobig:
        ext = os.path.splitext(local_filepath)[-1]
        if ext == '.ttl':
            infmt = 'turtle'
        else:
            log.info((ext, local_filepath))
            infmt = None  # let the parser guess; treated as xml below
        if remote:
            resp = requests.get(
                local_filepath
            )  # TODO nonblocking pull these out, fetch, run inner again until done
            raw = resp.text.encode()
        else:
            try:
                with open(local_filepath, 'rb') as f:
                    raw = f.read()
            except FileNotFoundError as e:
                if local_filepath.startswith('file://'):
                    log.info(
                        f'local_imports has already been run, skipping {local_filepath}'
                    )
                    return
                    #raise ValueError('local_imports has already been run') from e
                else:
                    log.exception(
                        e
                    )  # TODO raise a warning if the file cannot be matched
                    # seems like good practice to have any imported ontology under
                    # version control so all imports are guaranteed to have good
                    # provenance and not split the prior informaiton between the
                    # scigraph config and the repository, the repository remains
                    # the source of truth, load.yaml files can then pick a subset
                    # of the properly tracked files to load as they see fit, but
                    # not add to them (at least in pyontutils land)
                    raw = b''
        if oo in raw:  # we only care if there are imports or an ontology iri
            scratch = OntGraph()
            if infmt == 'turtle':
                # header is everything before the first ### divider
                data, rest = raw.split(b'###', 1)
            elif infmt == None:  # assume xml
                # keep only the owl:Ontology element of the RDF/XML tree
                xml_tree = etree.parse(BytesIO(raw))
                xml_root = xml_tree.getroot()
                xml_ontology = xml_tree.xpath(
                    "/*[local-name()='RDF']/*[local-name()='Ontology']")
                xml_root.clear()
                xml_root.append(xml_ontology[0])
                data = etree.tostring(xml_root)
            scratch.parse(data=data, format=infmt)
            # record which ontology iri this file claims to be
            for s in scratch.subjects(rdf.type, owl.Ontology):
                triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
            # somehow this breaks computing the chain
            #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                #for o in scratch[s:p]:
                    #triples.add((s, p, o))
            for s, o in sorted(scratch.subject_objects(p)):
                if revert:
                    raise NotImplementedError('TODO')
                nlfp = o.replace(remote_base, local_base)
                triples.add((s, p, o))
                if 'http://' in local_filepath or 'external' in local_filepath:
                    # FIXME what to do about https used inconsistently :/
                    if 'external' in local_filepath:
                        imported_iri = rdflib.URIRef(
                            local_filepath.replace(local_base,
                                                   remote_base))  # inefficient
                    else:
                        imported_iri = rdflib.URIRef(local_filepath)
                    if s != imported_iri:
                        imported_iri_vs_ontology_iri[
                            imported_iri] = s  # kept for the record
                        triples.add((imported_iri, p,
                                     s))  # bridge imported != ontology iri
                if local_base in nlfp and 'file://' not in o:
                    # FIXME file:// should not be slipping through here...
                    scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                    scratch.remove((s, p, o))
                if nlfp not in done:
                    done.append(nlfp)
                    if local_base in nlfp and 'external' not in nlfp:
                        # skip externals TODO
                        inner(nlfp)
                    elif readonly:  # read external imports
                        if 'external' in nlfp:
                            inner(nlfp)
                        else:
                            inner(nlfp, remote=True)
            if not readonly:
                # NOTE(review): ``rest`` is only bound in the turtle branch
                # above; writing an xml file here would NameError — presumably
                # only turtle files are ever writable — TODO confirm
                _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                CustomTurtleSerializer.roundtrip_prefixes = True
                ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                CustomTurtleSerializer.roundtrip_prefixes = _orp
                # splice the rewritten header back onto the untouched body
                ndata, comment = ttl.split(b'###', 1)
                out = ndata + b'###' + rest
                with open(local_filepath, 'wb') as f:
                    f.write(out)
def processData(cls):
    """Extract the wanted subset of an RDF/XML tree and parse it into a graph.

    Selects the Ontology element, all ObjectProperty elements, every Class
    whose id is in the wanted id set (plus classes that replaced wanted ids
    via hasAlternativeId), then recursively pulls in superclasses.  The
    pruned tree is re-serialized and parsed into an OntGraph.

    NOTE(review): takes ``cls`` — presumably decorated as a classmethod at
    the definition site; ``cls.raw`` is assumed to be an lxml ElementTree —
    TODO confirm.

    :returns: (extra superclass elements, their ids, the parsed OntGraph)
    """
    ids_raw, ids = cls._id_src()
    tree = cls.raw
    r = tree.getroot()
    cs = r.getchildren()
    # direct owl:Class children whose first attribute value is a wanted id
    classes = [
        _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
        and _.values()[0] in ids
    ]
    ontology = tree.xpath(
        "/*[local-name()='RDF']/*[local-name()='Ontology']")
    ops = tree.xpath(
        "/*[local-name()='RDF']/*[local-name()='ObjectProperty']")  # TODO
    wanted = [etree.ElementTree(_) for _ in classes]
    rpl_check = tree.xpath(
        "/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']"
    )
    rpl_dict = {
        _.text: _.getparent()
        for _ in rpl_check if _.text in ids_raw
    }  # we also need to have any new classes that have replaced old ids
    also_classes = list(rpl_dict.values())
    a = ontology + ops + classes + also_classes

    def rec(start_set, done):
        # recursively collect superclass elements (direct subClassOf plus
        # subClassOf -> Restriction -> someValuesFrom targets)
        ids_ = set()
        for c in start_set:
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']"
                ) if _.items()
            ])
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']/*[local-name()='Restriction']/*[local-name()='someValuesFrom']"
                ) if _.items()
            ])
        supers = [
            _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
            and _.values()[0] in ids_ and _ not in done
        ]
        if supers:
            msup, more_ids = rec(supers, done + supers)
            supers += msup
            ids_.update(more_ids)
        return supers, ids_

    more, more_ids = rec(a, a)
    all_nodes = a
    if cls.more:
        all_nodes = a + more
    all_ = set(all_nodes)
    r.clear()  # wipe all the stuff we don't need
    for c in all_:
        r.append(c)

    data = etree.tostring(r)
    g = OntGraph()
    g.parse(
        data=data
    )  # now _this_ is stupidly slow (like 20 minutes of slow) might make more sense to do the xml directly?
    # pull the ontology's versionIRI out of the freshly parsed graph
    cls.iri = list(
        g.query(
            'SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }'
        ))[0][0]
    return more, more_ids, g
def main():
    """Build the chebi-bridge ontology.

    Loads chebislim, chebi-dead, NIF-Chemical and NIF-Molecule, normalizes
    dead/alternate CHEBI ids across them, diffs the NIF annotations against
    upstream CHEBI, applies hand-curated exclusions, prints the remaining
    subClassOf statements for human review, and writes the bridge file plus
    the cleaned NIF-Chemical/NIF-Molecule graphs.
    """
    olr = auth.get_path('ontology-local-repo')
    resources = auth.get_path('resources')
    if not olr.exists():
        raise FileNotFoundError(f'{olr} does not exist cannot continue')
    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl',
                            'CHEBI', 'owl', 'skos', 'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    file = resources / 'chebi-subset-ids.txt'
    with open(file.as_posix(), 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        # per-id triple counts, used to detect which ids changed between loads
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom ion
        # the bare tuples below are deliberate no-ops kept as documentation
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    # g accumulates everything; the others keep the individual sources
    g = OntGraph()
    cg = OntGraph()
    cd = OntGraph()
    chemg = OntGraph()
    molg = OntGraph()

    cg.parse(olr / 'ttl/generated/chebislim.ttl', format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    cd.parse(olr / 'ttl/generated/chebi-dead.ttl', format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    chemg.parse(olr / 'ttl/NIF-Chemical.ttl', format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    molg.parse(olr / 'ttl/NIF-Molecule.ttl', format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        # rewrite dead ids to their replacements, recording the old id
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_trip(r, 'oboInOwl:hasAlternateId',
                        rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove(
                (r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('',
                       graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'NIFRID'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId',
                          'oboInOwl:hasAlternativeId')
        # ng.replace_uriref('NIFRID:ChEBIid', 'oboInOwl:id')  # :id does not exist, do we need an alternative?

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        # alternate ids must be string literals (curies), not URIRefs
        hai = ug.expand('oboInOwl:hasAlternativeId')
        # i = ug.expand('oboInOwl:id')  # :id does not exist
        makeGraph('', graph=g, prefixes=makePrefixes(
            'CHEBI'))  # amazlingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED',
                          str(o), qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        #for s, o in g.subject_objects(i):  # :id does not exist
            #inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    # which ids gained triples at each load stage
    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    # NOTE: the comprehension variable ``changed`` shadows the list above
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches)
              if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [
        set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))])
        for u, _ in review
    ]
    wat_a = [
        set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
        for u, _ in review
    ]
    wat_c_ = [
        set(cg.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [
        set(g.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'NIFRID'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
                 #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
        imports=(
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith(
                        'ub'
                ):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(
            sub)  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        # True when o is reachable from s via chebislim subClassOf chains
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(
            tuple(
                ug.expand(_) if type(_) is not rdflib.Literal else _
                for _ in t))
        cb.del_trip(*t)

    curateOut(
        'CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367'
    )  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut(
        'CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870'
    )  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    curateOut('CHEBI:50906', 'rdfs:label',
              rdflib.Literal('Chemical role', datatype=rdflib.XSD.string)
              )  # chebi already has a chemical role...
    curateOut(
        'CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432'
    )  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property

    curateOut('CHEBI:22720', 'rdfs:subClassOf',
              'CHEBI:27171')  # not all children are bicyclic
    curateOut(
        'CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188'
    )  # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut(
        'CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171'
    )  # not all children are bicyclic, some may be poly, therefore removing
    curateOut(
        'CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232'
    )  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf',
              'CHEBI:35338')  # removing since chebi models this with has part
    curateOut(
        'CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720'
    )  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(
                o
        ) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class' or str(
                o
        ) == 'http://ontology.neuinfo.org/nif/nifstd/readable/birnlexRetiredClass':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(
            tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    # print the surviving subClassOf statements for human review
    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write(
    )  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(
        molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        # drop datatypes so literal comparison is purely on string value
        return set((s, str(o) if type(o) is rdflib.Literal else o)
                   for s, p, o in graph)

    cmc = getChebis(((((nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) -
                     nodt(intc)) - nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(((((nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) -
                     nodt(intc)) - nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()


if __name__ == '__main__':
    # NOTE(review): drops into the debugger instead of calling main() —
    # presumably intentional for interactive use; TODO confirm
    breakpoint()