def test_makeGraph():
    """Smoke test for utils.makeGraph.

    Checks that the rendered graph comes back as raw image bytes, and that
    the new-record flag responds to an extreme high / low current_temp.
    """
    img_data, title, new_record = utils.makeGraph("Berlin", "DE")
    # idiom fix: isinstance instead of type(...) == bytes
    assert isinstance(img_data, bytes)
    # 20 degrees is expected to beat the stored record for Berlin
    img_data, title, new_record = utils.makeGraph(
        "Berlin", "DE", date=dt.datetime.now(), current_temp=20)
    assert new_record
    # -10 degrees must not register as a new record
    img_data, title, new_record = utils.makeGraph(
        "Berlin", "DE", date=dt.datetime.now(), current_temp=-10)
    assert not new_record
def make_neurolex_graph():
    """Load a local neurolex dump, build two partOf trees, and collect edges.

    Parses /tmp/neurolex_basic.ttl, builds INCOMING ilx:partOf trees rooted at
    NIFGA:birnlex_796 and NIFGA:nlx_412, and returns the subset of tree edges
    whose subject or object appears in the replaced-by mappings.
    Depends on module-level NIFPREFIXES, u_replaced_by, new_replaced_by,
    creatTree, Query, flatten.
    """
    # neurolex test stuff
    nlxpref = {'ilx':'http://uri.interlex.org/base/'}
    nlxpref.update(NIFPREFIXES)
    neurolex = makeGraph('neurolex-temp', nlxpref)
    neurolex.g.parse('/tmp/neurolex_basic.ttl', format='turtle')

    ILXPO = 'ilx:partOf'
    nj = neurolex.make_scigraph_json(ILXPO)

    # two hierarchies: brain (birnlex_796) and white matter (nlx_412)
    g_, h = creatTree(*Query('NIFGA:birnlex_796', ILXPO, 'INCOMING', 10), json=nj)
    i_, j_ = creatTree(*Query('NIFGA:nlx_412', ILXPO, 'INCOMING', 10), json=nj)

    brht = sorted(set(flatten(h[0],[])))
    wmht = sorted(set(flatten(j_[0],[])))

    # re-prefix the replaced-by keys into the NIFGA namespace
    ufixedrb = {'NIFGA:' + k.split(':')[1]:v for k, v in u_replaced_by.items()}
    b_nlx_replaced_by = new_replaced_by(brht, ufixedrb)
    w_nlx_replaced_by = new_replaced_by(wmht, ufixedrb)
    additional_edges = defaultdict(list)  # TODO this could be fun for the future but is a nightmare atm
    for edge in h[-1]['edges'] + j_[-1]['edges']:
        # NOTE(review): this assigns (not appends), so only the LAST edge seen
        # per id is kept despite the defaultdict(list) — confirm intended
        additional_edges[edge['sub']] = edge
        additional_edges[edge['obj']] = edge

    #filter out bad edges becase we are lazy
    additional_edges = {k:v for k, v in additional_edges.items()
                        if k in b_nlx_replaced_by or k in w_nlx_replaced_by}

    print('neurolex tree')  # computed above
    print(g_)
    print(i_)

    return additional_edges
def fixHasAltId(g):
    """Retarget legacy alternative-id predicates in *g* to oboInOwl.

    Wraps the raw rdflib graph so replace_uriref can resolve the curies.
    """
    wrapped = makeGraph('', graph=g,
                        prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN'))
    # legacy curie -> canonical oboInOwl curie
    retarget = (('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId'),
                ('BIRNANN:ChEBIid', 'oboInOwl:id'))
    for old, new in retarget:
        wrapped.replace_uriref(old, new)
def __new__(cls, validate=False):
    """Build, serialize, and optionally validate this parcellation-scheme ontology.

    Subclasses supply class attributes ont (OntMeta), concept (PScheme),
    atlas (PSArtifact), PREFIXES, and the datagetter/datamunge/dataproc hooks.
    Returns (ontid, cls.atlas) instead of an instance — the class is used as
    a callable build step, not as a normal constructor.
    """
    error = 'Expected %s got %s'
    # exact-type checks (not isinstance) on the class-level configuration
    if type(cls.ont) != OntMeta:
        raise TypeError(error % (OntMeta, type(cls.ont)))
    elif type(cls.concept) != PScheme:
        raise TypeError(error % (PScheme, type(cls.concept)))
    elif type(cls.atlas) != PSArtifact:
        raise TypeError(error % (PSArtifact, type(cls.atlas)))
    ontid = cls.ont.path + cls.ont.filename + '.ttl'
    # copy so the subclass's PREFIXES dict is not mutated by the update below
    PREFIXES = {k: v for k, v in cls.PREFIXES.items()}
    PREFIXES.update(genericPScheme.PREFIXES)
    #if '' in cls.PREFIXES:  # NOT ALLOWED!
        #if PREFIXES[''] is None:
            #PREFIXES[''] = ontid + '/'
    graph = makeGraph(cls.ont.filename, PREFIXES, writeloc=WRITELOC)
    graph.add_ont(ontid, *cls.ont[2:])
    make_scheme(graph, cls.concept, cls.atlas.curie)
    # subclass hooks: fetch, normalize, then emit triples into the graph
    data = cls.datagetter()
    cls.datamunge(data)
    cls.dataproc(graph, data)
    add_ops(graph)
    graph.write()
    if validate or getattr(cls, 'VALIDATE', False):
        cls.validate(graph)
    return ontid, cls.atlas
def __new__(cls, validate=False):
    """Build and serialize this parcellation-scheme ontology (older variant).

    NOTE(review): near-duplicate of the other __new__ in this file; this one
    writes to the hard-coded '/tmp/parc/', allows a '' prefix default, and
    calls graph.write(convert=False). Likely superseded — confirm which
    definition is live before removing either.
    """
    error = 'Expected %s got %s'
    # exact-type checks on the class-level configuration
    if type(cls.ont) != OntMeta:
        raise TypeError(error % (OntMeta, type(cls.ont)))
    elif type(cls.concept) != PScheme:
        raise TypeError(error % (PScheme, type(cls.concept)))
    elif type(cls.atlas) != PSArtifact:
        raise TypeError(error % (PSArtifact, type(cls.atlas)))
    ontid = cls.ont.path + cls.ont.filename + '.ttl'
    # copy so the subclass's PREFIXES dict is not mutated by the update below
    PREFIXES = {k:v for k, v in cls.PREFIXES.items()}
    PREFIXES.update(genericPScheme.PREFIXES)
    # default the empty prefix to the ontology iri when left unset
    if '' in cls.PREFIXES:
        if PREFIXES[''] is None:
            PREFIXES[''] = ontid + '/'
    graph = makeGraph(cls.ont.filename, PREFIXES, writeloc='/tmp/parc/')
    graph.add_ont(ontid, *cls.ont[2:])
    make_scheme(graph, cls.concept, cls.atlas.curie)
    # subclass hooks: fetch, normalize, then emit triples into the graph
    data = cls.datagetter()
    cls.datamunge(data)
    cls.dataproc(graph, data)
    add_ops(graph)
    graph.write(convert=False)
    if validate or getattr(cls, 'VALIDATE', False):
        cls.validate(graph)
    return ontid, cls.atlas
def switch_dead(g):
    """Swap dead ids in *g* for their replacements from module-level ``deads``.

    For each (dead, replacement) pair: rewrite all uses of the dead uri,
    record the dead id on the replacement as a string literal, and drop any
    self-referential replacedBy triple the rewrite may have produced.
    """
    ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
    for f, r in deads.items():
        ng.replace_uriref(f, r)
        # NOTE(review): predicate here is 'hasAlternateId' while other
        # functions in this file use 'hasAlternativeId' — confirm the
        # spelling difference is intentional before unifying
        ng.add_node(r, 'oboInOwl:hasAlternateId',
                    rdflib.Literal(f, datatype=rdflib.XSD.string))
        g.remove((r, replacedBy, r))  # in case the replaced by was already in
def fixAltIdIsURIRef(g):
    """Convert URIRef objects of alt-id/id predicates in *g* to qname literals.

    Some sources emit the alternative id as a full URIRef; the expected form
    is a curie string literal. Rewrites objects of oboInOwl:hasAlternativeId
    and oboInOwl:id in place.
    """
    hai = ug.expand('oboInOwl:hasAlternativeId')
    i = ug.expand('oboInOwl:id')
    # called for its side effect of binding the CHEBI prefix on g;
    # amazingly sometimes this is missing...
    makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))

    def inner(s, p, o):
        # only URIRef objects need rewriting; literals are already correct
        if type(o) == rdflib.URIRef:
            qn = g.namespace_manager.qname(o)
            g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
            # rdflib invents 'ns<N>' prefixes for unknown namespaces
            if 'ns' in qn:
                print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
            g.remove((s, p, o))

    for s, o in g.subject_objects(hai):
        inner(s, hai, o)
    for s, o in g.subject_objects(i):
        inner(s, i, o)
def fixIons(g):
    """Point atom curies at their charged (ion) forms in *g*.

    Biologists referring to e.g. 'iron' usually mean the ion, so the neutral
    atom ids are rewritten to the ion ids. Calcium (CHEBI:29108) and zinc
    (CHEBI:29105) already resolve correctly and are left untouched.
    """
    wrapped = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
    # neutral atom curie -> charged ion curie
    atom_to_ion = {
        'CHEBI:30145': 'CHEBI:49713',  # lithium
        'CHEBI:18248': 'CHEBI:29033',  # iron
        'CHEBI:26216': 'CHEBI:29103',  # potassium
        'CHEBI:26708': 'CHEBI:29101',  # sodium
    }
    for atom, ion in atom_to_ion.items():
        wrapped.replace_uriref(atom, ion)
def ncbigene_make():
    """Generate the ncbigeneslim ontology from a fixed list of NCBI gene ids.

    Reads curies from gene-subset-ids.txt, fetches esummary records from NCBI
    eutils in batches of 100, merges the responses, and writes the resulting
    graph via makeGraph. Network- and file-I/O heavy; no return value.
    """
    IDS_FILE = 'gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        # BUG FIX: the original line ended with a stray trailing comma, which
        # made data['id'] a 1-tuple; it only worked because requests unpacks
        # sequence values in form data. Send the plain joined string.
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    # merge all batch results under the first response's 'result' dict
    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    prefixes = {
        'ilx': 'http://uri.interlex.org/base/',
        'OBOANN': 'http://ontology.neuinfo.org/NIF/Backend/OBO_annotation_properties.owl#',  # FIXME needs to die a swift death
        'NCBIGene': 'http://www.ncbi.nlm.nih.gov/gene/',
        'NCBITaxon': 'http://purl.obolibrary.org/obo/NCBITaxon_',
    }
    ng = makeGraph('ncbigeneslim', prefixes)
    # emit one class per gene record
    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ontid = 'http://ontology.neuinfo.org/NIF/ttl/generated/ncbigeneslim.ttl'
    ng.add_node(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    ng.add_node(ontid, rdflib.RDFS.label, 'NIF NCBI Gene subset')
    ng.add_node(ontid, rdflib.RDFS.comment, 'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE)
    ng.add_node(ontid, rdflib.OWL.versionInfo, date.isoformat(date.today()))
    ng.write()
def parcellation_schemes(ontids_atlases):
    """Write the collected parcellation-schemes ontology.

    ontids_atlases is an iterable of (import_id, atlas) pairs; each import is
    added to the generated ontology along with the atlas triples, then the
    atlas/parcellation superclasses are declared and the graph is written.
    """
    meta = OntMeta(GENERATED,
                   'parcellation',
                   'NIF collected parcellation schemes ontology',
                   'NIF Parcellations',
                   'Brain parcellation schemes as represented by root concepts.',
                   TODAY)
    ontid = meta.path + meta.filename + '.ttl'
    prefix_map = makePrefixes('', 'ilx', 'owl', 'skos', 'NIFRID', 'ILXREPLACE')
    graph = makeGraph(meta.filename, prefix_map, writeloc=WRITELOC)
    graph.add_ont(ontid, *meta[2:])
    # deterministic import order keeps the serialized output stable
    for import_id, atlas in sorted(ontids_atlases):
        graph.add_trip(ontid, owl.imports, import_id)
        add_triples(graph, atlas, make_atlas)
    graph.add_class(ATLAS_SUPER, label=atname)
    graph.add_class(PARC_SUPER, label=psname)
    graph.write()
def parcellation_schemes(ontids_atlases):
    """Write the collected parcellation-schemes ontology (older variant).

    NOTE(review): duplicate of the other parcellation_schemes in this file;
    this one hard-codes the generated path and /tmp/parc/, uses OBOANN, the
    add_node/rdflib.OWL spellings, and write(convert=False). Confirm which
    definition is live before removing either.
    """
    ont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/generated/',
                  'parcellation',
                  'NIF collected parcellation schemes ontology',
                  'NIF Parcellations',
                  'Brain parcellation schemes as represented by root concepts.',
                  TODAY)
    ontid = ont.path + ont.filename + '.ttl'
    PREFIXES = makePrefixes('ilx', 'owl', 'skos', 'OBOANN')
    graph = makeGraph(ont.filename, PREFIXES, writeloc = '/tmp/parc/')
    graph.add_ont(ontid, *ont[2:])
    # deterministic import order keeps the serialized output stable
    for import_id, atlas in sorted(ontids_atlases):
        graph.add_node(ontid, rdflib.OWL.imports, import_id)
        add_triples(graph, atlas, make_atlas)
    graph.add_class(PARC_SUPER[0], label=PARC_SUPER[1])
    graph.write(convert=False)
def make_neurolex_graph():
    """Load a local neurolex dump, build two partOf trees, and collect edges.

    NOTE(review): duplicate of the other make_neurolex_graph in this file
    (only formatting differs) — confirm which copy is live.
    Parses /tmp/neurolex_basic.ttl, builds INCOMING ilx:partOf trees rooted
    at NIFGA:birnlex_796 and NIFGA:nlx_412, and returns the subset of tree
    edges whose subject or object appears in the replaced-by mappings.
    """
    # neurolex test stuff
    nlxpref = {'ilx': 'http://uri.interlex.org/base/'}
    nlxpref.update(NIFPREFIXES)
    neurolex = makeGraph('neurolex-temp', nlxpref)
    neurolex.g.parse('/tmp/neurolex_basic.ttl', format='turtle')

    ILXPO = 'ilx:partOf'
    nj = neurolex.make_scigraph_json(ILXPO)

    # two hierarchies: brain (birnlex_796) and white matter (nlx_412)
    g_, h = creatTree(*Query('NIFGA:birnlex_796', ILXPO, 'INCOMING', 10), json=nj)
    i_, j_ = creatTree(*Query('NIFGA:nlx_412', ILXPO, 'INCOMING', 10), json=nj)

    brht = sorted(set(flatten(h[0], [])))
    wmht = sorted(set(flatten(j_[0], [])))

    # re-prefix the replaced-by keys into the NIFGA namespace
    ufixedrb = {
        'NIFGA:' + k.split(':')[1]: v
        for k, v in u_replaced_by.items()
    }
    b_nlx_replaced_by = new_replaced_by(brht, ufixedrb)
    w_nlx_replaced_by = new_replaced_by(wmht, ufixedrb)
    additional_edges = defaultdict(
        list)  # TODO this could be fun for the future but is a nightmare atm
    for edge in h[-1]['edges'] + j_[-1]['edges']:
        # NOTE(review): this assigns (not appends), so only the LAST edge seen
        # per id survives despite the defaultdict(list) — confirm intended
        additional_edges[edge['sub']] = edge
        additional_edges[edge['obj']] = edge

    #filter out bad edges becase we are lazy
    additional_edges = {
        k: v
        for k, v in additional_edges.items()
        if k in b_nlx_replaced_by or k in w_nlx_replaced_by
    }

    print('neurolex tree')  # computed above
    print(g_)
    print(i_)

    return additional_edges
'uberon_id': uid, 'uberon_label': uberon_labs[uid], 'aba_id': aid, 'aba_label': abalabs[aid], 'aba_syns': '\n'.join(sorted(abasyns[aid] + abaacro[aid])), 'uberon_syns': '\n'.join(insert_uberon) } return to_format.format(**kwargs) text = '\n\n'.join( [make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid]) with open('aba_uberon_syn_review.txt', 'wt') as f: f.write(text) print('total uberon terms checked:', len(uberon_labs)) print('total aba terms: ', len(abalabs)) print('total uberon with aba xref:', len([a for a in u_a_map.values() if a])) ubridge = makeGraph('uberon-parcellation-mappings', prefixes=makePrefixes('ilx', 'UBERON', 'MBA')) for u, arefs in u_a_map.items(): if arefs: # TODO check for bad assumptions here ubridge.add_node(u, 'ilx:delineatedBy', arefs[0]) ubridge.add_node(arefs[0], 'ilx:delineates', u) ubridge.write() embed()
def swanson():
    """ not really a parcellation scheme

    Parse resources/swanson_aligned.txt (the Swanson 2014 partonomy text)
    into per-appendix part-of hierarchies and write them as an ontology.
    Returns (ontid, None). Relies on module-level makePrefixes, makeGraph,
    async_getter, sgv, rowParse, HASPART, PARTOF, TODAY, creatTree.
    """
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = makePrefixes('ilx', 'owl', 'OBOANN', 'UBERON')
    PREFIXES.update({
        '':ontid + '/',  # looking for better options
        'SWAN':'http://swanson.org/node/',
        'SWAA':'http://swanson.org/appendix/',
    })
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/parc/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from....',
                      TODAY)

    with open('resources/swanson_aligned.txt', 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    #fix for capitalization since this header is reused
    fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')]) for _ in lines[635].lower().split(' or ')]).replace('human','HUMAN')
    lines[635] = fixed

    # parse each line into (level, area_name, citation, next-syn marker);
    # nesting depth is encoded as runs of 5 dots
    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.'*5)
            l = l.strip('.')
            if ' (' in l:
                # ') or' marks a synonym pair: emit the first name flagged
                # NEXT SYN, then fall through to emit the second name
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))
                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None
            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)
    # look up candidate UBERON curies for every area name in parallel
    results = async_getter(sgv.findByTerm, [(d[1],) for d in data])
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'UBERON' in r['curie']] if _ else [] for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
           [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    class SP(rowParse):
        # Row-driven state machine: rowParse calls one method per column
        # (Depth/Name/Citation/NextSyn/Uberon), then _row_post per row and
        # _end when finished. Order of method calls matters.
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            # Depth is the first column: latch whether the previous row
            # flagged this one as its synonym
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            # remember this row's index so the following row is recorded
            # as its synonym
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    # close out the previous appendix before starting the next
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)
                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return
                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name':apname.capitalize(),
                        'type':self.citation.capitalize() if self.citation else None}
                    return
                else:
                    # depth-0 non-Appendix row names the appendix's taxon
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()
            # nodes
            if self.synonym:
                # this row is a synonym of an earlier row; attach and stop
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    # disambiguate citation-less names with appendix + parent label
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                #print(level, self.name)
                # can't return here because they are their own level
            # replace with actually doing something...
            self.nodes[self._rowind]['label'] = self.name
            self.nodes[self._rowind]['citation'] = self.citation
            self.nodes[self._rowind]['uberon'] = self.uberon
            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    embed()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            # collapse duplicate (name + citation) rows onto the lowest rowind
            replace = {}
            for asdf in [sorted(n) for k,n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            # rewrite parent/child links to point at the surviving rowinds
            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())
    nbase = 'http://swanson.org/node/%s'
    json_ = {'nodes':[],'edges':[]}
    # emit one class per parsed node
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, 'ilx:swansonBrainRegionConcept', label=anns['label'])
        new_graph.add_node(nid, 'OBOANN:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl':anns['label'],'id':'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_node(nid, rdflib.OWL.equivalentClass, anns['uberon'])
        # issues arrise here...

    # one hasPart/partOf property pair per appendix, plus the hierarchy edges
    for appendix, data in sp.appendicies.items():
        aid = 'http://swanson.org/appendix/%s' % appendix
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_node(aid, 'ilx:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = HASPART + str(appendix)
        apo = PARTOF + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_hierarchy(cid, ahp, pid)  # note hierarhcy inverts direction
                new_graph.add_hierarchy(pid, apo, cid)
                json_['edges'].append({'sub':'SWA:' + str(child),'pred':apo,'obj':'SWA:' + str(parent)})

    new_graph.write(convert=False)
    # disabled sanity check: rebuild trees from the json and print them
    if False:
        Query = namedtuple('Query', ['root','relationshipType','direction','depth'])
        mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?
        for i, n in enumerate(mapping):
            a, b = creatTree(*Query('SWA:' + str(n), 'ilx:partOf' + str(i + 1), 'INCOMING', 10), json=json_)
            print(a)
    return ontid, None
def make_neurons(syn_mappings, pedges, ilx_start_, defined_graph):
    """Build NIF-Neuron and its defined-class companion from phenotype edges.

    For every triple whose predicate is in *pedges*, try to resolve the
    literal object to a real class id (via syn_mappings, then SciGraph) and
    lift it into an OWL restriction; each newly resolved object also gets a
    defined-class neuron in *defined_graph*. Mutates syn_mappings in place
    and returns the updated ilx identifier counter.
    Relies on module-level PREFIXES, NIFCELL_NEURON, sgv, ilx_base,
    defined_class_parent, infixowl, embed.
    """
    ilx_start = ilx_start_
    # literal spellings that SciGraph mis-resolves; None means skip lookup
    cheating = {'vasoactive intestinal peptide':'VIP',
                'star':None,  # is a morphological phen that is missing but hits scigraph
               }
    ng = makeGraph('NIF-Neuron', prefixes=PREFIXES)

    #""" It seemed like a good idea at the time...
    nif_cell = '~/git/NIF-Ontology/ttl/NIF-Cell.ttl'  # need to be on neurons branch
    cg = rdflib.Graph()
    cg.parse(os.path.expanduser(nif_cell), format='turtle')
    # copy over only these classes rather than importing all of NIF-Cell
    missing = ('NIFCELL:nifext_55',
               'NIFCELL:nifext_56',
               'NIFCELL:nifext_57',
               'NIFCELL:nifext_59',
               'NIFCELL:nifext_81',
               'NIFCELL:nlx_cell_091205',
               NIFCELL_NEURON,
               'NIFCELL:sao2128417084',
               'NIFCELL:sao862606388',  # secondary, not explicitly in the hbp import
              )
    for m in missing:
        m = ng.expand(m)
        for s, p, o in cg.triples((m, None, None)):
            ng.add_trip(s, p, o)
    #cg.remove((None, rdflib.OWL.imports, None))  # DONOTWANT NIF-Cell imports
    #for t in cg.triples((None, None, None)):
        #ng.add_trip(*t)  # only way to clean prefixes :/
    #cg = None
    #"""

    hbp_cell = '~/git/NIF-Ontology/ttl/generated/NIF-Neuron-HBP-cell-import.ttl'  # need to be on neurons branch
    _temp = rdflib.Graph()  # use a temp to strip nasty namespaces
    _temp.parse(os.path.expanduser(hbp_cell), format='turtle')
    # drop the source file's own ontology header while copying its triples
    for s, p, o in _temp.triples((None, None, None)):
        if s != rdflib.URIRef('http://ontology.neuinfo.org/NIF/ttl/generated/NIF-Neuron-HBP-cell-import.ttl'):
            ng.g.add((s, p, o))

    base = 'http://ontology.neuinfo.org/NIF/ttl/'
    ontid = base + ng.name + '.ttl'
    ng.add_trip(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    ng.add_trip(ontid, rdflib.OWL.imports, base + 'NIF-Neuron-Phenotype.ttl')
    ng.add_trip(ontid, rdflib.OWL.imports, base + 'NIF-Neuron-Defined.ttl')
    ng.add_trip(ontid, rdflib.OWL.imports, base + 'hbp-special.ttl')
    #ng.add_trip(ontid, rdflib.OWL.imports, base + 'NIF-Cell.ttl')  # NO!
    #ng.add_trip(ontid, rdflib.OWL.imports, base + 'external/uberon.owl')
    #ng.add_trip(ontid, rdflib.OWL.imports, base + 'external/pr.owl')

    ng.replace_uriref('ilx:hasMolecularPhenotype', 'ilx:hasExpressionPhenotype')

    #defined_graph = makeGraph('NIF-Neuron-Defined', prefixes=PREFIXES, graph=_g)
    defined_graph.add_trip(base + defined_graph.name + '.ttl', rdflib.RDF.type, rdflib.OWL.Ontology)
    defined_graph.add_trip(base + defined_graph.name + '.ttl', rdflib.OWL.imports, base + 'NIF-Neuron-Phenotype.ttl')

    done = True  #False
    done_ = set()
    for pedge in pedges:
        for s, p, o_lit in ng.g.triples((None, pedge, None)):
            o = o_lit.toPython()
            success = False
            true_o = None
            true_id = None
            if o in syn_mappings:
                # known synonym: substitute directly
                id_ = syn_mappings[o]
                ng.add_hierarchy(id_, p, s)
                ng.g.remove((s, p, o_lit))
                #print('SUCCESS, substituting', o, 'for', id_)
                success = True
                true_o = o_lit
                true_id = id_
            elif 'Location' in p.toPython() or 'LocatedIn' in p.toPython():  # lift location to restrictions
                if o.startswith('http://'):
                    ng.add_hierarchy(o_lit, p, s)
                    ng.g.remove((s, p, o_lit))
                    data = sgv.findById(o)
                    label = data['labels'][0]
                    ng.add_trip(o, rdflib.RDF.type, rdflib.OWL.Class)
                    ng.add_trip(o, rdflib.RDFS.label, label)
                    success = True
                    true_o = label
                    true_id = o_lit
            else:
                # fall back to a SciGraph term search; prefer PR: then NIFMOL:
                if o in cheating:
                    o = cheating[o]
                data = sgv.findByTerm(o)
                if data:
                    print('SCIGRAPH', [(d['curie'], d['labels']) for d in data])
                    for d in data:
                        if 'PR:' in d['curie']:
                            sgt = ng.expand(d['curie'])
                            ng.add_hierarchy(sgt, p, s)
                            ng.g.remove((s, p, o_lit))
                            label = d['labels'][0]
                            ng.add_trip(sgt, rdflib.RDF.type, rdflib.OWL.Class)
                            ng.add_trip(sgt, rdflib.RDFS.label, label)
                            success = True
                            true_o = label
                            true_id = sgt
                            break
                    if not success:
                        for d in data:
                            if 'NIFMOL:' in d['curie']:
                                sgt = ng.expand(d['curie'])
                                ng.add_hierarchy(sgt, p, s)
                                ng.g.remove((s, p, o_lit))
                                label = d['labels'][0]
                                ng.add_trip(sgt, rdflib.RDF.type, rdflib.OWL.Class)
                                ng.add_trip(sgt, rdflib.RDFS.label, label)
                                success = True
                                true_o = label
                                true_id = sgt
                                break
            if o not in done_ and success:
                # first time this object resolved: mint a defined class for it
                done_.add(o)
                t = tuple(defined_graph.g.triples((None, rdflib.OWL.someValuesFrom, true_id)))
                if t:
                    print('ALREADY IN', t)
                else:
                    ilx_start += 1
                    id_ = ng.expand(ilx_base.format(ilx_start))
                    defined_graph.add_trip(id_, rdflib.RDF.type, rdflib.OWL.Class)
                    restriction = infixowl.Restriction(p, graph=defined_graph.g, someValuesFrom=true_id)
                    intersection = infixowl.BooleanClass(members=(defined_graph.expand(NIFCELL_NEURON), restriction), graph=defined_graph.g)
                    this = infixowl.Class(id_, graph=defined_graph.g)
                    this.equivalentClass = [intersection]
                    this.subClassOf = [defined_graph.expand(defined_class_parent)]
                    this.label = rdflib.Literal(true_o + ' neuron')
                    print('make_neurons ilx_start', ilx_start, list(this.label)[0])
                    if not done:
                        embed()
                        done = True
    defined_graph.add_class(defined_class_parent, NIFCELL_NEURON, label='defined class neuron')
    defined_graph.add_trip(defined_class_parent, rdflib.namespace.SKOS.definition, 'Parent class For all defined class neurons')
    defined_graph.write()
    ng.write()

    # fold all synonyms and labels back into syn_mappings (mutated in place)
    for sub, syn in [_ for _ in ng.g.subject_objects(ng.expand('NIFRID:synonym'))] + [_ for _ in ng.g.subject_objects(rdflib.RDFS.label)]:
        syn = syn.toPython()
        if syn in syn_mappings:
            print('ERROR duplicate synonym!', syn, sub)
        syn_mappings[syn] = sub

    return ilx_start
def chebi_make():
    """Generate the chebislim and chebi-dead ontologies from the ChEBI nightly.

    Downloads the full gzipped ChEBI OWL/XML, prunes it to the ids listed in
    resources/chebi-subset-ids.txt plus their recursive superclasses, then
    splits live terms into chebislim and deprecated/replaced terms into
    chebi-dead, and writes both. Network-heavy and slow (the rdflib parse of
    the pruned XML alone can take ~20 minutes).
    Relies on module-level makePrefixes, makeGraph, createOntology, requests,
    gzip, etree (lxml), BytesIO, rdflib, embed.
    """
    PREFIXES = makePrefixes('definition',
                            'hasRole',
                            'BFO',
                            'CHEBI',
                            'owl',
                            'skos',
                            'oboInOwl')
    dPREFIXES = makePrefixes('CHEBI','replacedBy','owl','skos')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)

    IDS_FILE = 'resources/chebi-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))  # curies as written
        ids = set((ug.expand(_.strip()).toPython() for _ in ids_raw))  # full iris

    #gzed = requests.get('http://localhost:8000/chebi.owl')
    #raw = BytesIO(gzed.content)
    gzed = requests.get('http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz')
    raw = BytesIO(gzip.decompress(gzed.content))
    t = etree.parse(raw)
    r = t.getroot()
    cs = r.getchildren()
    # owl:Class elements whose rdf:about is in our id list
    classes = [_ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids]
    ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
    ops = t.xpath("/*[local-name()='RDF']/*[local-name()='ObjectProperty']")  # TODO
    wanted = [etree.ElementTree(_) for _ in classes]
    rpl_check = t.xpath("/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']")
    rpl_dict = {_.text:_.getparent() for _ in rpl_check if _.text in ids_raw}
    # we also need to have any new classes that have replaced old ids
    also_classes = list(rpl_dict.values())

    def rec(start_set, done):
        # recursively collect superclasses (direct and via someValuesFrom
        # restrictions) of every class in start_set
        ids_ = set()
        for c in start_set:
            ids_.update([_.items()[0][1] for _ in etree.ElementTree(c).xpath("/*[local-name()='Class']/*[local-name()='subClassOf']") if _.items()])
            ids_.update([_.items()[0][1] for _ in etree.ElementTree(c).xpath("/*[local-name()='Class']/*[local-name()='subClassOf']/*[local-name()='Restriction']/*[local-name()='someValuesFrom']") if _.items()])
        supers = [_ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids_ and _ not in done]
        if supers:
            msup, mids = rec(supers, done + supers)
            supers += msup
            ids_.update(mids)
        return supers, ids_

    a = ontology + ops + classes + also_classes
    more, mids = rec(a, a)
    all_ = set(a + more)
    r.clear()  # wipe all the stuff we don't need
    for c in all_:
        r.append(c)

    data = etree.tostring(r)
    g = rdflib.Graph()
    g.parse(data=data)  # now _this_ is stupidly slow (like 20 minutes of slow) might make more sense to do the xml directly?

    src_version = list(g.query('SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }'))[0][0]

    new_graph = createOntology('chebislim',
                               'NIF ChEBI slim',
                               PREFIXES,
                               'chebislim',
                               'This file is generated by pyontutils/slimgen from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE),
                               remote_base='http://ontology.neuinfo.org/NIF/')
    chebi_dead = createOntology('chebi-dead',
                                'NIF ChEBI deprecated',
                                dPREFIXES,
                                'chebidead',
                                'This file is generated by pyontutils/slimgen to make deprecated classes resolvablefrom the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE),
                                remote_base='http://ontology.neuinfo.org/NIF/')

    # deprecated-without-replacement ids: expected to have no triples at all
    depwor = {'CHEBI:33243':'natural product',  # FIXME remove these?
              'CHEBI:36809':'tricyclic antidepressant',
             }

    for id_ in sorted(set(ids_raw) | set((ug.g.namespace_manager.qname(_) for _ in mids))):
        eid = ug.expand(id_)
        trips = list(g.triples((eid, None, None)))
        if not trips:
            #looks for the id_ as a literal
            alts = list(g.triples((None,
                                   rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'),
                                   rdflib.Literal(id_, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))))
            if alts:
                replaced_by, _, __ = alts[0]
                if replaced_by.toPython() not in ids:  # we need to add any replacment classes to the bridge
                    print('REPLACED BY NEW CLASS', id_)
                    for t in g.triples((replaced_by, None, None)):
                        new_graph.add_recursive(t, g)
                chebi_dead.add_class(id_)
                chebi_dead.add_node(id_, 'replacedBy:', replaced_by)
                chebi_dead.add_node(id_, rdflib.OWL.deprecated, True)
            else:
                if id_ not in depwor:
                    raise BaseException('wtf error', id_)
        else:
            for trip in trips:
                new_graph.add_recursive(trip, g)

    # strip bogus '0' related-synonym literals
    # https://github.com/ebi-chebi/ChEBI/issues/3294
    madness = new_graph.expand('oboInOwl:hasRelatedSynonym'), rdflib.Literal('0', datatype=rdflib.namespace.XSD.string)
    for a in new_graph.g.subjects(*madness):
        new_graph.g.remove((a,) + madness)

    new_graph.write()
    chebi_dead.write()
    embed()
kwargs = { 'uberon_id':uid, 'uberon_label':uberon_labs[uid], 'aba_id':aid, 'aba_label':abalabs[aid], 'aba_syns':'\n'.join(sorted(abasyns[aid] + abaacro[aid])), 'uberon_syns':'\n'.join(insert_uberon) } return to_format.format(**kwargs) text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid]) with open('aba_uberon_syn_review.txt', 'wt') as f: f.write(text) print('total uberon terms checked:', len(uberon_labs)) print('total aba terms: ', len(abalabs)) print('total uberon with aba xref:', len([a for a in u_a_map.values() if a])) ubridge = makeGraph('uberon-parcellation-mappings',prefixes=makePrefixes('ilx', 'UBERON', 'MBA')) for u, arefs in u_a_map.items(): if arefs: # TODO check for bad assumptions here ubridge.add_node(u, 'ilx:delineatedBy', arefs[0]) ubridge.add_node(arefs[0], 'ilx:delineates', u) ubridge.write() embed()
def _rest_make_phenotypes():
    """Build phenotype terms by merging neuroNER OBO sources with NIF-Quality.

    Parses three neuroNER OBO files plus NIF-Quality.ttl into one rdflib graph,
    cross-matches labels/synonyms between the two namespaces, mints new ILX ids
    for matched (and selected unmatched) terms, and writes out xref and term
    graphs.  Returns (syn_mappings, pedges, ilx_start).

    NOTE(review): relies on many module-level names not visible here
    (OboFile, sgg, makeGraph, PREFIXES, ilx_base, g, add_phenotypes,
    ephys_phenotype, morpho_phenotype).  `pedges` in the return statement is
    never assigned in this function — presumably a module global, otherwise
    this raises NameError; confirm.
    """
    # phenotype sources
    neuroner = '~/git/neuroNER/resources/bluima/neuroner/hbp_morphology_ontology.obo'
    neuroner1 = '~/git/neuroNER/resources/bluima/neuroner/hbp_electrophysiology_ontology.obo'
    neuroner2 = '~/git/neuroNER/resources/bluima/neuroner/hbp_electrophysiology-triggers_ontology.obo'
    nif_qual = '~/git/NIF-Ontology/ttl/NIF-Quality.ttl'

    mo = OboFile(os.path.expanduser(neuroner))
    mo1 = OboFile(os.path.expanduser(neuroner1))
    mo2 = OboFile(os.path.expanduser(neuroner2))
    mo_ttl = mo.__ttl__() + mo1.__ttl__() + mo2.__ttl__()

    # prepend the prefix declarations the OBO->ttl conversion does not emit
    mo_ttl = """\
@prefix : <http://FIXME.org/> .
@prefix nsu: <http://www.FIXME.org/nsupper#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
""" + mo_ttl

    #sio = io.StringIO()
    #sio.write(mo_ttl)

    ng = rdflib.Graph()
    ng.parse(data=mo_ttl, format='turtle')
    ng.parse(os.path.expanduser(nif_qual), format='turtle')
    #ng.namespace_manager.bind('default1', None, override=False, replace=True)
    # drop owl:imports so downstream tooling does not chase them
    ng.remove((None, rdflib.OWL.imports, None))

    # known-bad cross-namespace label matches to exclude
    bad_match = {
        'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#nlx_qual_20090505',
        'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao1693353776',
        'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao1288413465',
        'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao4459136323',
        'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#nlx_qual_20090507',
    }

    exact = []    # (subject, other, label, lowered-label) exact label matches
    similar = []  # word-overlap matches across namespaces
    quals = []    # terms whose label contains 'quality'
    s2 = {}       # subject -> {label, o, xrefs, syns} merge record
    for subject, label in sorted(ng.subject_objects(rdflib.RDFS.label)):
        syns = set([a for a in ng.objects(
            subject, rdflib.URIRef('http://www.FIXME.org/nsupper#synonym'))])
        syns.update(set([a for a in ng.objects(
            subject, rdflib.URIRef(
                'http://ontology.neuinfo.org/NIF/Backend/OBO_annotation_properties.owl#synonym'))]))
        #if syns:
            #print(syns)
        #print(subject)
        #print(label.lower())
        if 'quality' in label.lower():
            quals.append((subject, label))
        subpre = ng.namespace_manager.compute_qname(subject)[1]
        llower = rdflib.Literal(label.lower(), lang='en')
        for s in ng.subjects(rdflib.RDFS.label, llower):
            if s != subject:
                exact.append((subject, s, label, llower))
        # word-level similarity: label appears as a whole word in another
        # namespace's label
        for s, p, o in sorted(ng.triples((None, rdflib.RDFS.label, None))):
            spre = ng.namespace_manager.compute_qname(s)[1]
            if subject != s and label.lower() in o.lower().split(' ') and spre != subpre:
                if s.toPython() in bad_match or subject.toPython() in bad_match:
                    continue
                #print()
                #print(spre, subpre)
                similar.append((subject, s, label, o))
                if subpre.toPython() == 'http://FIXME.org/':
                    # prefer the non-FIXME namespace as the canonical subject
                    print('YAY')
                    print(label, ',', o)
                    print(subject, s)
                    subject, s = s, subject
                    label, o = o, label

                if subject in s2:
                    #print('YES IT EXISTS')
                    #print(syns, label, [subject, s])
                    s2[subject]['syns'].update(syns)
                    s2[subject]['syns'].add(label)
                    s2[subject]['xrefs'] += [subject, s]
                else:
                    s2[subject] = {'label': label.toPython(),
                                   'o': o.toPython(),
                                   'xrefs': [subject, s],
                                   'syns': syns}  # FIXME overwrites

    pprint(quals)
    """ print stuff
    print('matches')
    pprint(exact)
    pprint(similar)
    #print('EXACT', exact)

    print()
    for k, v in s2.items():
        print(k)
        for k, v2 in sorted(v.items()):
            print(' ', k, ':', v2)
    #"""

    desired_nif_terms = set()  #{
    #'NIFQUAL:sao1959705051',  # dendrite
    #'NIFQUAL:sao2088691397',  # axon
    #'NIFQUAL:sao1057800815',  # morphological
    #'NIFQUAL:sao-1126011106',  # soma
    #'NIFQUAL:',
    #'NIFQUAL:',
    #}
    starts = [
        #"NIFQUAL:sao2088691397",
        #"NIFQUAL:sao1278200674",
        #"NIFQUAL:sao2088691397",
        #"NIFQUAL:sao-1126011106",
        # FIXME WTF IS THIS NONSENSE  (scigraph bug?)
        quote("http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao1959705051").replace('/', '%2F'),
        quote("http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao2088691397").replace('/', '%2F'),
        quote("http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao1278200674").replace('/', '%2F'),
        quote("http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao2088691397").replace('/', '%2F'),
        quote("http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#sao-1126011106").replace('/', '%2F'),
    ]

    # pull the subClassOf closure of each start term from SciGraph
    for id_ in starts:
        want = sgg.getNeighbors(id_,
                                relationshipType='subClassOf',
                                direction='INCOMING',
                                depth=5)
        #print(id_, want)
        desired_nif_terms.update([n['id'] for n in want['nodes']])

    print(desired_nif_terms)

    ilx_start = 50114
    print(ilx_base.format(ilx_start))
    new_terms = {}
    dg = makeGraph('uwotm8', prefixes=PREFIXES)
    xr = makeGraph('xrefs', prefixes=PREFIXES)
    for s, o in sorted(ng.subject_objects(rdflib.RDFS.label))[::-1]:
        spre = ng.namespace_manager.compute_qname(s)[1]
        #if spre.toPython() == g.namespaces['NIFQUAL']:
            #print('skipping', s)
            #continue  # TODO
        if s in new_terms:
            print(s, 'already in as xref probably')
            continue
        #elif spre.toPython() != 'http://uri.interlex.org/base/ilx_' or spre.toPython() != 'http://FIXME.org/' and s.toPython() not in desired_nif_terms:
        #elif spre.toPython() != 'http://FIXME.org/' and s.toPython() not in desired_nif_terms:
            #print('DO NOT WANT', s, spre)
            #continue

        # NOTE(review): the comprehension variable shadows the loop variable
        # `s`; harmless here because the generator is consumed immediately,
        # but fragile.
        syns = set([s for s in ng.objects(s, dg.namespaces['nsu']['synonym'])])
        #data['syns'] += syns

        data = {}
        id_ = ilx_base.format(ilx_start)
        ilx_start += 1
        if s in s2:
            # matched pair: record replacedBy for both xrefs and carry labels
            d = s2[s]
            syns.update(d['syns'])
            new_terms[d['xrefs'][0]] = {'replaced_by': id_}
            xr.add_trip(d['xrefs'][0], 'oboInOwl:replacedBy', id_)
            #dg.add_trip(d['xrefs'][0], 'oboInOwl:replacedBy', id_)
            new_terms[d['xrefs'][1]] = {'replaced_by': id_}
            xr.add_trip(d['xrefs'][1], 'oboInOwl:replacedBy', id_)
            #dg.add_trip(d['xrefs'][1], 'oboInOwl:replacedBy', id_)
            data['labels'] = [d['label'], d['o']]
            #dg.add_trip(id_, rdflib.RDFS.label, d['label'])
            dg.add_trip(id_, rdflib.RDFS.label, d['o'])
            data['xrefs'] = d['xrefs']
            for x in d['xrefs']:  # FIXME... expecting order of evaluation errors here...
                dg.add_trip(id_, 'oboInOwl:hasDbXref', x)  # xr
                xr.add_trip(id_, 'oboInOwl:hasDbXref', x)  # x
        elif spre.toPython() != 'http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-Quality.owl#' \
                or ng.namespace_manager.qname(s).replace('default1', 'NIFQUAL') in desired_nif_terms:
            # skip non-xref quals
            #print(ng.namespace_manager.qname(s).replace('default1','NIFQUAL'))
            new_terms[s] = {'replaced_by': id_}
            xr.add_trip(s, 'oboInOwl:replacedBy', id_)
            data['labels'] = [o.toPython()]
            dg.add_trip(id_, rdflib.RDFS.label, o.toPython())
            data['xrefs'] = [s]
            dg.add_trip(id_, 'oboInOwl:hasDbXref', s)  # xr
            xr.add_trip(id_, 'oboInOwl:hasDbXref', s)  # xr
        else:
            # term not wanted: give back the id we reserved
            ilx_start -= 1
            continue

        new_terms[id_] = data
        dg.add_trip(id_, rdflib.RDF.type, rdflib.OWL.Class)
        xr.add_trip(id_, rdflib.RDF.type, rdflib.OWL.Class)
        for syn in syns:
            if syn.toPython() not in data['labels']:
                if len(syn) > 3:
                    dg.add_trip(id_, 'NIFRID:synonym', syn)
                elif syn:
                    dg.add_trip(id_, 'NIFRID:abbrev', syn)
        # classify under the ephys/morpho parent by id substring
        if 'EPHYS' in s or any(['EPHYS' in x for x in data['xrefs']]):
            dg.add_trip(id_, rdflib.RDFS.subClassOf, ephys_phenotype)
        elif 'MORPHOLOGY' in s or any(['MORPHOLOGY' in x for x in data['xrefs']]):
            dg.add_trip(id_, rdflib.RDFS.subClassOf, morpho_phenotype)

    #dg.write(convert=False)
    xr.write(convert=False)

    #skip this for now, we can use DG to do lookups later
    #for t in dg.g.triples((None, None, None)):
        #g.add_trip(*t)  # only way to clean prefixes :/

    add_phenotypes(g)
    g.write(convert=False)

    g2 = makeGraph('pheno-comp', PREFIXES)
    for t in ng.triples((None, None, None)):
        g2.add_trip(*t)  # only way to clean prefixes :/
    g2.write(convert=False)

    # synonym/label -> subject reverse index; later entries win on collision
    syn_mappings = {}
    for sub, syn in [_ for _ in g.g.subject_objects(g.expand('NIFRID:synonym'))] \
            + [_ for _ in g.g.subject_objects(rdflib.RDFS.label)]:
        syn = syn.toPython()
        if syn in syn_mappings:
            print('ERROR duplicate synonym!', syn, sub)
        syn_mappings[syn] = sub

    #embed()
    return syn_mappings, pedges, ilx_start
# Script body: build the ksdesc-defs ontology by scraping Description/Definition
# sections out of the markdown files under ~/git/ksdesc/.
PREFIXES = makePrefixes(
    "SCR", "MBA", "NIFMOL", "NIFNEURON", "NIFCELL", "NIFGA", "UBERON", "PR",
    "NIFNEURMOR", "skos", "owl"
)

ont = OntMeta(
    "http://ontology.neuinfo.org/NIF/ttl/generated/",
    "ksdesc-defs",
    "Knolwedge Space Defs",
    "KSDEFS",
    "Definitions from knowledge space descriptions. Generated by pyontutils/ksdesc_bridge.py",
    TODAY,
)
ontid = ont.path + ont.filename + ".ttl"

g = makeGraph(ont.filename, prefixes=PREFIXES)
g.add_ont(ontid, *ont[2:])

# one directory per prefix, one markdown file per term
top_level = glob(os.path.expanduser("~/git/ksdesc/") + "*")

for putative_dir in top_level:
    if os.path.isdir(putative_dir):
        for putative_md in glob(putative_dir + "/*.md"):
            # curie is <dirname>:<basename-without-extension>
            ident = os.path.split(putative_dir)[-1] + ":" + \
                os.path.splitext(os.path.split(putative_md)[-1])[0]
            print(ident)

            with open(putative_md, "rt") as f:
                def_ = f.read()

            # strip everything up to the Description/Definition heading
            for test in ("Description", "Definition"):
                if test in def_:
                    def_ = def_.split(test, 1)[-1].strip().strip("=").strip()
                    # NOTE(review): def_ is not used after this point in the
                    # visible chunk — the add-to-graph step presumably follows
                    # in the original file; confirm nothing was lost.
def chebi_imp():
    """Detect CHEBI ids whose triples change when NIF graphs are layered in.

    Loads chebislim, then incrementally layers chebi-dead, NIF-Chemical and
    NIF-Molecule onto the same graph, counting triples per id after each
    parse.  Ids whose counts differ between layers are 'review' candidates;
    the triples present in the merged graph but not in plain chebislim are
    written to a chebi-bridge graph.  Ends in an interactive embed().
    """
    PREFIXES = makePrefixes('definition',
                            'hasRole',
                            'CHEBI',
                            'owl',
                            'skos',
                            'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    with open('chebi-subset-ids.txt', 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        # triple count per tracked id, in `ids` order
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    g = rdflib.Graph()      # cumulative graph, grows with each parse below
    cg = rdflib.Graph()     # chebislim only, the comparison baseline
    chemg = rdflib.Graph()  # NIF-Chemical only
    molg = rdflib.Graph()   # NIF-Molecule only
    g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle')
    cg.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle')
    a1 = check_chebis(g)
    g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle')
    a2 = check_chebis(g)
    g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle')
    chemg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle')
    a3 = check_chebis(g)
    g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle')
    molg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle')
    a4 = check_chebis(g)

    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    # NOTE(review): the comprehension variable `changed` shadows the list
    # `changed` above; the zip still iterates the outer list, so this works,
    # but the shadowing is deliberate-looking and fragile.
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches)
              if changed and m[0]]
    # stringified (subject, object) pairs — used for the actual diff below
    wat_c = [set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))])
             for u, _ in review]
    wat_a = [set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
             for u, _ in review]
    # for reasons currently lost to implementation details this returns a list
    # of empty lists if run from ipython
    wat_c_ = [set(cg.triples((u, None, None))) for u, _ in review]
    wat_a_ = [set(g.triples((u, None, None))) for u, _ in review]
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = makeGraph('chebi-bridge',
                   makePrefixes('CHEBI', 'owl', 'skos', 'dc', 'hasRole',
                                'NIFCHEM', 'NIFMOL', 'OBOANN', 'BIRNANN'))
    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # match the diffed object back to a concrete triple; skip
                # 'ub' blank-node-ish objects (restrictions)
                py = t[-1].toPython()
                if py == string and not py.startswith('ub'):
                    # ignore restrictions... this is safe because nifmol and
                    # nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(sub)  # only need to go at the end because sub is the same for each set

    cb.write()
    # re-add only the missing edges so that we can zap them from NIF-Molecule
    # and NIF-Chemical (recurse is needed...)
    embed()
def main():
    """Extract the scicrunch registry from MySQL and serialize it as a graph.

    Pulls resources and resource_columns rows (id < 16000 to dodge test
    entries), appends a handful of hand-curated pseudo-rows, converts them via
    make_records, and writes the scicrunch-registry graph with ontology
    metadata from ONTOLOGY_DEF.

    NOTE(review): depends on module-level names not visible here
    (mysql_conn_helper, create_engine, inspect, field_mapping, make_records,
    make_node, makeGraph, PREFIXES, ONTOLOGY_DEF).
    """
    DB_URI = 'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db}'
    #config = mysql_conn_helper('mysql5-stage.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    config = mysql_conn_helper('nif-mysql.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    engine = create_engine(DB_URI.format(**config))
    config = None  # all weakrefs should be gone by now?
    del(config)  # i wonder whether this actually cleans it up when using **config

    insp = inspect(engine)
    #names = [c['name'] for c in insp.get_columns('registry')]
    #resource_columns = [c['name'] for c in insp.get_columns('resource_columns')]
    #resource_data = [c['name'] for c in insp.get_columns('resource_data')]
    #resource_fields = [c['name'] for c in insp.get_columns('resource_fields')]
    #resources = [c['name'] for c in insp.get_columns('resources')]
    #conn.execute('SELECT * from registry;')
    if 1:  #with engine.connect() as conn:
        conn = engine
        tables = ('resource_columns', 'resource_data', 'resource_fields', 'resources')
        # NOTE(review): `data` and `all_fields` are built but never used below
        # — exploratory leftovers, presumably; confirm before removing.
        data = {t: ([c['name'] for c in insp.get_columns(t)],
                    conn.execute('SELECT * from %s limit 20;' % t).fetchall())
                for t in tables}
        all_fields = [n[0] for n in
                      conn.execute('SELECT distinct(name) FROM resource_fields;').fetchall()]
        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resources as r JOIN'
                             #' resource_columns as rc ON r.id=rc.rid'
                             #' WHERE rc.name IN %s limit 1000;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER THIS QUERY IS O(x^n) :x
                             #' ORDER BY r.rid limit 2000;'
        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resource_columns as rc JOIN'
                             #' resources as r ON rc.rid=r.id'
                             #' WHERE rc.name IN %s;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER why does > 2000 limit break stuff?
        #join = query.fetchall()
        #embed()
        #return
        #print('running join')
        print('running 1')
        r_query = conn.execute('SELECT id, rid, original_id, type, status FROM resources WHERE id < 16000;')  # avoid the various test entries :(
        print('fetching 1 ')
        r = r_query.fetchall()
        print('running 2')
        # NOTE(review): IN-clause is built by string interpolation from
        # field_mapping keys — not user input, but parameterized queries would
        # be safer; confirm field_mapping is trusted.
        rc_query = conn.execute('SELECT rid, name, value, version FROM resource_columns as rc WHERE rc.rid < 16000 AND rc.name IN %s;'
                                % str(tuple([n for n in field_mapping if n != 'MULTI'])))
        print('fetching 2')
        rc = rc_query.fetchall()
        #embed()
        #return

    # hand-curated pseudo-rows with negative ids so they cannot collide with
    # real database ids
    r.append( (-100, 'NIF:nlx_63400', 'nlx_63400', 'Resource', 'Curated') )
    r.append( (-101, 'NIF:nlx_152342', 'nlx_152342', 'Organization', 'Curated') )
    r.append( (-102, 'NIF:nlx_152328', 'nlx_152328', 'Organization', 'Curated') )
    r.append( (-103, 'NIF:NEMO_0569000', 'NEMO_0569000', 'Institution', 'Curated') )
    r.append( (-104, 'NIF:birnlex_2431', 'birnlex_2431', 'Institution', 'Curated') )
    r.append( (-105, 'NIF:SIO_000688', 'SIO_000688', 'Institution', 'Curated') )
    r.append( (-106, 'NIF:birnlex_2085', 'birnlex_2085', 'Institution', 'Curated') )

    rc.append( (-100, 'Resource Name', 'Resource', 1) )
    rc.append( (-101, 'Resource Name', 'Commercial Organization', 1) )
    rc.append( (-102, 'Resource Name', 'Organization', 1) )
    rc.append( (-103, 'Resource Name', 'University', 1) )
    rc.append( (-104, 'Resource Name', 'Government granting agency', 1) )
    rc.append( (-105, 'Resource Name', 'Institute', 1) )
    rc.append( (-106, 'Resource Name', 'Institution', 1) )
    rc.append( (-101, 'Supercategory', 'NIF:nlx_152328', 1) )  # TODO extract this more intelligently from remap supers please

    output = make_records(r, rc, field_mapping)
    print('Fetching and data prep done.')

    g = makeGraph('scicrunch-registry', PREFIXES)
    # ontology metadata
    ontid = ONTOLOGY_DEF['iri']
    g.add_node(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    g.add_node(ontid, rdflib.RDFS.label, ONTOLOGY_DEF['label'])
    g.add_node(ontid, rdflib.RDFS.comment, ONTOLOGY_DEF['comment'])
    g.add_node(ontid, rdflib.OWL.versionInfo, ONTOLOGY_DEF['version'])

    for id_, rec in output.items():
        for field, value in rec:
            #print(field, value)
            if not value:  # don't add empty edges  # FIXME issue with False literal
                print('caught an empty value on field', id_, field)
                continue
            if field != 'id' and str(value) in id_:
                #if field == 'alt_id' and id_[1:] == value:
                # a record's main id showing up again as e.g. an alt_id
                print('caught a mainid appearing as altid', field, value)
                continue
            g.add_node(*make_node(id_, field, value))

    g.write()
#!/usr/bin/env python3.5
"""Generate the NIFGA-Equivs mapping graph.

For every owl:equivalentClass triple in the uberon-to-nifstd bridge, emit a
parallel triple whose subject is the corresponding NIFGA-namespace URI, then
write the result as a generated ontology file.
"""
import rdflib
from utils import makePrefixes, makeGraph

PREFIXES = makePrefixes('NIFGA', 'NIFSTD', 'owl')

g = rdflib.Graph()
g.parse('http://purl.obolibrary.org/obo/uberon/bridge/uberon-bridge-to-nifstd.owl',
        format='xml')

name = 'NIFGA-Equivs'
ng = makeGraph(name, PREFIXES)
for _, pred, obj in g.triples((None, rdflib.OWL.equivalentClass, None)):
    # rebuild the subject in the NIFGA namespace from the object's local name
    local_name = obj.rsplit('/', 1)[-1]
    nifga_subject = rdflib.URIRef(PREFIXES['NIFGA'] + local_name)
    ng.g.add((nifga_subject, pred, obj))

ng.add_ont('http://ontology.neuinfo.org/NIF/ttl/generated/' + name + '.ttl',
           'NIFGA to NIFSTD mappings')
ng.write()
def do_deprecation(replaced_by, g, additional_edges, conflated):
    """Deprecate NIFGA terms in favor of uberon replacements.

    For each (nifga, uberon) pair in `replaced_by`, mark the NIFGA term
    deprecated and either add replacedBy (live uberon term) or just an xref
    (uberon term itself deprecated), then port part-of hierarchy edges from
    SciGraph onto the uberon-bridge graph.  Returns (graph, bridge, uedges).

    NOTE(review): a second, near-identical do_deprecation (using add_trip
    instead of add_node) appears later in this file — the later definition
    wins at import time; confirm which one is intended to survive.
    """
    bmeta = OntMeta('http://ontology.neuinfo.org/NIF/ttl/bridge/',
                    'uberon-bridge',
                    'NIFSTD Uberon Bridge',
                    'UBERON Bridge',
                    ('This is the bridge file that holds local NIFSTD additions to uberon. '
                     'This is also staging for any changes that we want to push upstream.'),
                    TODAY)
    ontid = bmeta.path + bmeta.filename + '.ttl'
    bridge = makeGraph('uberon-bridge', PREFIXES)
    bridge.add_ont(ontid, *bmeta[2:])
    graph = makeGraph('NIF-GrossAnatomy', NIFPREFIXES, graph=g)
    #graph.g.namespace_manager._NamespaceManager__cache = {}
    #g.namespace_manager.bind('UBERON','http://purl.obolibrary.org/obo/UBERON_')  # this has to go in again because we reset g FIXME
    udone = set('NOREP')
    uedges = defaultdict(lambda: defaultdict(set))

    def inner(nifga, uberon):
        # check neuronames id TODO
        udepr = sgv.findById(uberon)['deprecated'] if uberon != 'NOREP' else False
        if udepr:
            # add xref to the now deprecated uberon term
            graph.add_node(nifga, 'oboInOwl:hasDbXref', uberon)
            #print('Replacement is deprecated, not replacing:', uberon)
            graph.add_node(nifga, RDFS.comment,
                           'xref %s is deprecated, so not using replacedBy:' % uberon)
        else:
            # add replaced by -> uberon
            graph.add_node(nifga, 'replacedBy:', uberon)

        # add deprecated true (ok to do twice...)
        graph.add_node(nifga, OWL.deprecated, True)

        # review nifga relations, specifically has_proper_part, proper_part_of
        # put those relations on the uberon term in the
        # if there is no uberon term raise an error so we can look into it

        #if uberon not in uedges:
            #uedges[uberon] = defaultdict(set)
        resp = sgg.getNeighbors(nifga)
        edges = resp['edges']
        if nifga in additional_edges:
            edges.append(additional_edges[nifga])
        include = False  # set this to True when running anns
        for edge in edges:  # FIXME TODO hierarchy extraction and porting
            #print(edge)
            if udepr:  # skip everything if uberon is deprecated
                include = False
                hier = False
                break
            sub = edge['sub']
            obj = edge['obj']
            pred = edge['pred']
            hier = False
            if pred == 'subClassOf':
                pred = RDFS.subClassOf
                continue
            elif pred == 'equivalentClass':
                pred = OWL.equivalentClass
                continue
            elif pred == 'isDefinedBy':
                pred = RDFS.isDefinedBy
                continue
            elif pred == 'http://www.obofoundry.org/ro/ro.owl#has_proper_part':
                hier = True
                include = True
            elif pred == 'http://www.obofoundry.org/ro/ro.owl#proper_part_of':
                hier = True
                include = True
            elif pred == 'ilx:partOf':
                hier = True
                include = True

            if sub == nifga:
                # this nifga is the parent; remap the child to its replacement
                try:
                    obj = replaced_by[obj]
                    if obj == 'NOREP':
                        hier = False
                except KeyError:
                    print('not in replaced_by', obj)
                if type(obj) == tuple:
                    continue  # TODO
                if hier:
                    if uberon not in uedges[obj][pred]:
                        uedges[obj][pred].add(uberon)
                        bridge.add_hierarchy(obj, pred, uberon)
                else:
                    #bridge.add_node(uberon, pred, obj)
                    pass
            elif obj == nifga:
                # this nifga is the child; remap the parent to its replacement
                try:
                    sub = replaced_by[sub]
                    if sub == 'NOREP':
                        hier = False
                except KeyError:
                    print('not in replaced_by', sub)
                if type(sub) == tuple:
                    continue  # TODO
                if hier:
                    if sub not in uedges[uberon][pred]:
                        uedges[uberon][pred].add(sub)
                        bridge.add_hierarchy(uberon, pred, sub)
                else:
                    #bridge.add_node(sub, pred, uberon)
                    pass

        if False and uberon not in udone and include:  # skip porting annotations and labels for now
            #udone.add(uberon)
            try:
                label = sgv.findById(uberon)['labels'][0]
            except IndexError:
                WAT = sgv.findById(uberon)
                embed()
            bridge.add_class(uberon, label=label)
            # annotations to port
            for p in anns_to_port:
                os_ = list(graph.g.objects(graph.expand(nifga), p))
                for o in os_:
                    if label.lower() != o.lower():  # we can simply capitalize labels
                        print(label.lower())
                        print(o.lower())
                        print()
                        bridge.add_node(uberon, p, o)

                if p == SKOS.prefLabel and not os_:
                    if uberon not in conflated or (uberon in conflated and nifga in preflabs):
                        l = list(graph.g.objects(graph.expand(nifga), RDFS.label))[0]
                        bridge.add_node(uberon, SKOS.prefLabel, l)  # port label to prefLabel if no prefLabel

    for nifga, uberon in replaced_by.items():
        if type(uberon) == tuple:
            # one nifga term replaced by several uberon terms
            print(uberon)
            for ub in uberon:
                print(ub)
                inner(nifga, ub)
        elif uberon == 'NOREP':
            graph.add_node(nifga, OWL.deprecated, True)  # TODO check for missing edges?
        elif uberon is None:
            continue  # BUT TODAY IS NOT THAT DAY!
        else:
            inner(nifga, uberon)

    return graph, bridge, uedges
def do_deprecation(replaced_by, g, additional_edges, conflated):
    """Deprecate NIFGA terms in favor of uberon replacements (add_trip variant).

    Same contract as the earlier do_deprecation in this file: for each
    (nifga, uberon) pair in `replaced_by`, mark the NIFGA term deprecated and
    add replacedBy/xref triples, porting part-of hierarchy edges from SciGraph
    into the uberon-bridge graph.  Returns (graph, bridge, uedges).

    NOTE(review): this duplicates the earlier definition, differing only in
    using add_trip instead of add_node — whichever is defined later in the
    module shadows the other; confirm the duplicate is intentional.
    """
    bmeta = OntMeta(
        'http://ontology.neuinfo.org/NIF/ttl/bridge/', 'uberon-bridge',
        'NIFSTD Uberon Bridge', 'UBERON Bridge',
        ('This is the bridge file that holds local NIFSTD additions to uberon. '
         'This is also staging for any changes that we want to push upstream.'),
        TODAY)
    ontid = bmeta.path + bmeta.filename + '.ttl'
    bridge = makeGraph('uberon-bridge', PREFIXES)
    bridge.add_ont(ontid, *bmeta[2:])
    graph = makeGraph('NIF-GrossAnatomy', NIFPREFIXES, graph=g)
    #graph.g.namespace_manager._NamespaceManager__cache = {}
    #g.namespace_manager.bind('UBERON','http://purl.obolibrary.org/obo/UBERON_')  # this has to go in again because we reset g FIXME
    udone = set('NOREP')
    uedges = defaultdict(lambda: defaultdict(set))

    def inner(nifga, uberon):
        # check neuronames id TODO
        udepr = sgv.findById(uberon)['deprecated'] if uberon != 'NOREP' else False
        if udepr:
            # add xref to the now deprecated uberon term
            graph.add_trip(nifga, 'oboInOwl:hasDbXref', uberon)
            #print('Replacement is deprecated, not replacing:', uberon)
            graph.add_trip(
                nifga, RDFS.comment,
                'xref %s is deprecated, so not using replacedBy:' % uberon)
        else:
            # add replaced by -> uberon
            graph.add_trip(nifga, 'replacedBy:', uberon)

        # add deprecated true (ok to do twice...)
        graph.add_trip(nifga, OWL.deprecated, True)

        # review nifga relations, specifically has_proper_part, proper_part_of
        # put those relations on the uberon term in the
        # if there is no uberon term raise an error so we can look into it

        #if uberon not in uedges:
            #uedges[uberon] = defaultdict(set)
        resp = sgg.getNeighbors(nifga)
        edges = resp['edges']
        if nifga in additional_edges:
            edges.append(additional_edges[nifga])
        include = False  # set this to True when running anns
        for edge in edges:  # FIXME TODO hierarchy extraction and porting
            #print(edge)
            if udepr:  # skip everything if uberon is deprecated
                include = False
                hier = False
                break
            sub = edge['sub']
            obj = edge['obj']
            pred = edge['pred']
            hier = False
            if pred == 'subClassOf':
                pred = RDFS.subClassOf
                continue
            elif pred == 'equivalentClass':
                pred = OWL.equivalentClass
                continue
            elif pred == 'isDefinedBy':
                pred = RDFS.isDefinedBy
                continue
            elif pred == 'http://www.obofoundry.org/ro/ro.owl#has_proper_part':
                hier = True
                include = True
            elif pred == 'http://www.obofoundry.org/ro/ro.owl#proper_part_of':
                hier = True
                include = True
            elif pred == 'ilx:partOf':
                hier = True
                include = True

            if sub == nifga:
                # this nifga is the parent; remap the child to its replacement
                try:
                    obj = replaced_by[obj]
                    if obj == 'NOREP':
                        hier = False
                except KeyError:
                    print('not in replaced_by', obj)
                if type(obj) == tuple:
                    continue  # TODO
                if hier:
                    if uberon not in uedges[obj][pred]:
                        uedges[obj][pred].add(uberon)
                        bridge.add_hierarchy(obj, pred, uberon)
                else:
                    #bridge.add_trip(uberon, pred, obj)
                    pass
            elif obj == nifga:
                # this nifga is the child; remap the parent to its replacement
                try:
                    sub = replaced_by[sub]
                    if sub == 'NOREP':
                        hier = False
                except KeyError:
                    print('not in replaced_by', sub)
                if type(sub) == tuple:
                    continue  # TODO
                if hier:
                    if sub not in uedges[uberon][pred]:
                        uedges[uberon][pred].add(sub)
                        bridge.add_hierarchy(uberon, pred, sub)
                else:
                    #bridge.add_trip(sub, pred, uberon)
                    pass

        if False and uberon not in udone and include:  # skip porting annotations and labels for now
            #udone.add(uberon)
            try:
                label = sgv.findById(uberon)['labels'][0]
            except IndexError:
                WAT = sgv.findById(uberon)
                embed()
            bridge.add_class(uberon, label=label)
            # annotations to port
            for p in anns_to_port:
                os_ = list(graph.g.objects(graph.expand(nifga), p))
                for o in os_:
                    if label.lower() != o.lower():  # we can simply capitalize labels
                        print(label.lower())
                        print(o.lower())
                        print()
                        bridge.add_trip(uberon, p, o)

                if p == SKOS.prefLabel and not os_:
                    if uberon not in conflated or (uberon in conflated and nifga in preflabs):
                        l = list(graph.g.objects(graph.expand(nifga), RDFS.label))[0]
                        bridge.add_trip(uberon, SKOS.prefLabel, l)  # port label to prefLabel if no prefLabel

    for nifga, uberon in replaced_by.items():
        if type(uberon) == tuple:
            # one nifga term replaced by several uberon terms
            print(uberon)
            for ub in uberon:
                print(ub)
                inner(nifga, ub)
        elif uberon == 'NOREP':
            graph.add_trip(nifga, OWL.deprecated, True)  # TODO check for missing edges?
        elif uberon is None:
            continue  # BUT TODAY IS NOT THAT DAY!
        else:
            inner(nifga, uberon)

    return graph, bridge, uedges
"""Build a graph for every configured city, then tweet unless debugging.

Fix: the original read ``os.environ["DEBUG"]``, which raises KeyError whenever
the DEBUG variable is not set at all.  Use ``os.environ.get`` so an unset
variable simply skips the tweet (same outcome as DEBUG being anything other
than the literal string "False").
"""
from utils import makeGraph, sendTweet, CITIES
import os

for city in CITIES:
    makeGraph(city)

# DEBUG is an environment string, so "False" is compared literally;
# an unset DEBUG now behaves like DEBUG != "False" (no tweet) instead of crashing.
if os.environ.get("DEBUG") == "False":
    sendTweet("Berlin")
def __init__(self):
    """Wrap EXISTING_GRAPH as the 'merged' graph and collect existing bags."""
    # reuse the prefixes already bound on the shared graph
    prefix_map = {curie: str(iri) for curie, iri in EXISTING_GRAPH.namespaces()}
    self.g = makeGraph('merged', prefixes=prefix_map, graph=EXISTING_GRAPH)
    self.bag_existing()
def make_table1(syn_mappings, ilx_start, phenotypes):
    """Build the hbp-special graph from the 26451489 table 1 CSV.

    Seeds syn_mappings with marker proteins, runs table1() over the
    transposed CSV rows, backfills labels for referenced external classes via
    SciGraph, and emits one defined 'neuron type' typeclass per phenotype with
    a disjoint union over its disjoint phenotypes.  Returns the table1 result
    object `t` (whose .ilx_start carries the updated id counter).

    NOTE(review): depends on module-level names not visible here (PREFIXES,
    refile, sgv, table1, add_types, disjointUnionOf, expression_defined,
    NIFCELL_NEURON, ilx_base, infixowl).
    """
    # TODO when to explicitly subClassOf? I think we want this when the higher level phenotype bag is shared
    # it may turn out that things like the disjointness exist at a higher level while correlated properties
    # should be instantiated together as sub classes, for example if cck and
    # FIXME disagreement about nest basket cells
    # TODO hasPhenotypes needs to be function to get phenotypeOf to work via reasoner??? this seems wrong.
    #  this also works if phenotypeOf is inverseFunctional
    #  hasPhenotype shall be asymmetric, irreflexive, and intransitive
    # XXX in answer to Maryann's question about why we need the morphological phenotypes by themselves:
    #  if we don't have them we can't agregate across orthogonal phenotypes since owl correctly keeps the classes distinct
    # TODO disjointness axioms work really well on defined classes and propagate excellently
    # TODO add 'Petilla' or something like that to the phenotype definitions
    #  we want this because 'Petilla' denotes the exact ANALYSIS used to determine the phenotype
    #  there are some additional 'protocol' related restrictions on what you can apply analysis to
    #  but we don't have to model those explicitly which would be a nightmare and break the
    #  orthogonality of the cell type decomposition
    # TODO to make this explicit we need to include that phenotypes require 2 things
    #  1) a type of data (data type?) 2) a way to classify that data (analysis protocol)
    #
    # need a full type restriction... property chain?

    graph = makeGraph('hbp-special', prefixes=PREFIXES)  # XXX fix all prefixes

    with open(refile(__file__, 'resources/26451489 table 1.csv'), 'rt') as f:
        # transpose so each entry in `rows` is a column of the CSV
        rows = [list(r) for r in zip(*csv.reader(f))]

    base = 'http://ontology.neuinfo.org/NIF/ttl/'
    ontid = base + graph.name + '.ttl'
    graph.add_trip(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    graph.add_trip(ontid, rdflib.OWL.imports, base + 'NIF-Neuron-Phenotype.ttl')
    graph.add_trip(ontid, rdflib.OWL.imports, base + 'NIF-Neuron-Defined.ttl')

    def lsn(word):
        # look up a term in SciGraph and cache its expanded curie
        syn_mappings[word] = graph.expand(sgv.findByTerm(word)[0]['curie'])  # cheating
    lsn('Parvalbumin')
    lsn('neuropeptide Y')
    lsn('VIP peptides')
    lsn('somatostatin')
    syn_mappings['calbindin'] = graph.expand('PR:000004967')  # cheating
    syn_mappings['calretinin'] = graph.expand('PR:000004968')  # cheating
    t = table1(graph, rows, syn_mappings, ilx_start)
    ilx_start = t.ilx_start

    # adding fake mouse data
    #with open(refile(__file__, 'resources/26451489 table 1.csv'), 'rt') as f:  # FIXME annoying
        #rows = [list(r) for r in zip(*csv.reader(f))]
    #t2 = table1(graph, rows, syn_mappings, ilx_start, species='NCBITaxon:10090')  # FIXME double SOM+ phenos etc
    #ilx_start = t2.ilx_start

    def do_graph(d):
        # add a typed, labeled class for a SciGraph lookup result
        sgt = graph.expand(d['curie'])
        label = d['labels'][0]
        graph.add_trip(sgt, rdflib.RDF.type, rdflib.OWL.Class)
        graph.add_trip(sgt, rdflib.RDFS.label, label)

    done = set()
    for s, p, o in graph.g.triples((None, None, None)):  #(rdflib.RDFS.subClassOf,rdflib.OWL.Thing)):
        if o not in done and type(o) == rdflib.term.URIRef:
            done.add(o)
            # NOTE(review): graph.g.objects((o, rdflib.RDFS.label)) passes a
            # tuple as the single argument; rdflib's objects() signature is
            # objects(subject, predicate) — confirm this call does what was
            # intended (it may always yield nothing).
            if not [_ for _ in graph.g.objects((o, rdflib.RDFS.label))]:
                d = sgv.findById(o)
                if d:
                    if 'PR:' in d['curie']:
                        do_graph(d)
                    elif 'NIFMOL:' in d['curie']:
                        do_graph(d)
                    elif 'UBERON:' in d['curie']:
                        do_graph(d)
                    elif 'NCBITaxon:' in d['curie']:
                        do_graph(d)
                    elif 'NIFCELL:' in d['curie']:
                        do_graph(d)

    # FIXME this is a dupe with defined_class
    #graph.add_trip(defined_class_parent, rdflib.RDF.type, rdflib.OWL.Class)
    #graph.add_trip(defined_class_parent, rdflib.namespace.SKOS.description, 'Parent class For all defined class neurons')
    #graph.add_trip(defined_class_parent, rdflib.RDFS.subClassOf, NIFCELL_NEURON)
    #graph.add_trip(morpho_defined, rdflib.RDFS.subClassOf, defined_class_parent)
    #graph.add_trip(morpho_defined, rdflib.RDFS.label, 'Morphologically classified neuron')  # FIXME -- need asserted in here...
    #graph.add_trip(ephys_defined, rdflib.RDFS.subClassOf, defined_class_parent)
    #graph.add_trip(ephys_defined, rdflib.RDFS.label, 'Electrophysiologically classified neuron')

    graph.add_class(expression_defined, NIFCELL_NEURON, autogen=True)
    graph.add_class('ilx:NeuroTypeClass', NIFCELL_NEURON, label='Neuron TypeClass')

    graph.g.commit()

    phenotype_dju_dict = add_types(graph, phenotypes)
    for pheno, disjoints in phenotype_dju_dict.items():
        # human-readable name from the CamelCase curie local part
        name = ' '.join(re.findall(r'[A-Z][a-z]*', pheno.split(':')[1])[:-1])  #-1: drops Phenotype
        ilx_start += 1
        # = make_defined(graph, ilx_start, name + ' neuron type', pheno, 'ilx:hasPhenotype')
        id_ = graph.expand(ilx_base.format(ilx_start))
        typeclass = infixowl.Class(id_, graph=graph.g)
        typeclass.label = rdflib.Literal(name + ' neuron type')

        restriction = infixowl.Restriction(graph.expand('ilx:hasPhenotype'),
                                           graph=graph.g,
                                           someValuesFrom=pheno)
        #typeclass.subClassOf = [restriction, graph.expand('ilx:NeuroTypeClass')]

        ntc = graph.expand('ilx:NeuroTypeClass')
        intersection = infixowl.BooleanClass(members=(ntc, restriction),
                                             graph=graph.g)
        typeclass.equivalentClass = [intersection]

        # FIXME not clear that we should be doing typeclasses this way.... :/
        # requires more thought, on the plus side you do get better reasoning...
        disjointunion = disjointUnionOf(graph=graph.g, members=list(disjoints))
        graph.add_trip(id_, rdflib.OWL.disjointUnionOf, disjointunion)

    graph.write()
    return t
def swanson():
    """ not really a parcellation scheme

    Parse resources/swanson_aligned.txt (Swanson 2014 partonomies) into an
    ontology file plus a scigraph-style json graph of part-of edges.

    Returns (ontid, None) to match the (ontid, artifact) convention used by
    the other scheme builders in this file.

    NOTE(review): relies on module-level names GENERATED, WRITELOC, NOTICE,
    TODAY, HASPART, PARTOF, sgv, async_getter, rowParse, ILXREPLACE,
    interlex_namespace, makeGraph, makePrefixes — all defined elsewhere.
    """
    source = 'resources/swanson_aligned.txt'
    ONT_PATH = GENERATED
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = makePrefixes('', 'ilx', 'owl', 'skos', 'NIFRID', 'ILXREPLACE')
    PREFIXES.update({
        #'':ontid + '/',  # looking for better options
        'SWAN': interlex_namespace('swanson/nt/term'),
        'SWAA': interlex_namespace('swanson/nt/appendix'),
    })
    new_graph = makeGraph(filename, PREFIXES, writeloc=WRITELOC)
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source + '.' + NOTICE,
                      TODAY)
    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    # NOTE: hard-coded line index 635 is tied to this exact source file revision
    lines[635] += ' ' + lines.pop(636)
    #fix for capitalization since this header is reused
    fixed = ' or '.join([
        ' ('.join([n.capitalize() for n in _.split(' (')])
        for _ in lines[635].lower().split(' or ')
    ]).replace('human', 'HUMAN')
    lines[635] = fixed

    # parse each non-comment line into (depth, area_name, citation, next-syn flag)
    # depth is encoded in the source as runs of 5 dots per level
    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.' * 5)
            l = l.strip('.')
            if ' (' in l:
                if ') or' in l:
                    # "name1 (cite1) or name2 (cite2)" — emit name1 with a
                    # 'NEXT SYN' marker so the following row is treated as its synonym
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)

    # look up candidate UBERON mappings for every area name via scigraph
    results = async_getter(sgv.findByTerm, [(d[1], ) for d in data])
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'UBERON' in r['curie']] if _ else [] for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
        [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    class SP(rowParse):
        # Stateful row-by-row parser: rowParse dispatches each column to the
        # method named after its header, then calls _row_post per row and
        # _end when the table is exhausted.
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            # a 'NEXT SYN' marker on the previous row makes this row a synonym
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    # flush accumulated edges into the appendix we are leaving
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)
                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        # sentinel row appended to zoop; nothing more to do
                        return
                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name': apname.capitalize(),
                        'type': self.citation.capitalize() if self.citation else None}
                    return
                else:
                    # depth-0 non-appendix row: the appendix's root region,
                    # optionally carrying a "[taxon]" suffix
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()
            # nodes
            if self.synonym:
                # this row only annotates an earlier node
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    # disambiguate citation-less names by appendix number + parent label
                    self.name += str(self._appendix) + self.nodes[
                        self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)
                    # can't return here because they are their own level
                # replace with actually doing something...
                self.nodes[self._rowind]['label'] = self.name
                self.nodes[self._rowind]['citation'] = self.citation
                self.nodes[self._rowind]['uberon'] = self.uberon
            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    # NOTE(review): drops into an IPython shell on a malformed
                    # hierarchy instead of raising — debugging aid, not error handling
                    embed()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            # deduplicate: rows sharing 'name citation' collapse onto the
            # lowest row index; rewire child/parent edges accordingly
            replace = {}
            for asdf in [sorted(n) for k, n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label']
                             for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())

    # emit one class per parsed node under a common parent concept
    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes': [], 'edges': []}
    parent = ILXREPLACE('swansonBrainRegionConcept')
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl': anns['label'], 'id': 'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])  # issues arrise here...

    # per-appendix hasPart/partOf object properties plus the part hierarchy
    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilx:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = HASPART + str(appendix)
        apo = PARTOF + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_hierarchy(cid, ahp, pid)  # note hierarhcy inverts direction
                new_graph.add_hierarchy(pid, apo, cid)
                json_['edges'].append({
                    'sub': 'SWA:' + str(child),
                    'pred': apo,
                    'obj': 'SWA:' + str(parent)
                })

    new_graph.write()

    # dead debug path: tree rendering of the generated json, kept for reference
    if False:
        Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])
        mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?
        for i, n in enumerate(mapping):
            a, b = creatTree(*Query('SWA:' + str(n), 'ilx:partOf' + str(i + 1), 'INCOMING', 10), json=json_)
            print(a)

    return ontid, None
def make_phenotypes():
    """Build the NIF phenotype ontologies from the resource csv files.

    Produces three graphs: NIF-Phenotype-Core (predicates + Phenotype root),
    NIF-Phenotypes (the phenotype taxonomy), and NIF-Neuron-Defined
    (equivalent-class definitions derived from each phenotype).

    Returns (syn_mappings, pedges, ilx_start, inc, defined_graph) for use by
    downstream neuron builders.

    NOTE(review): depends on module-level PREFIXES, OntMeta, TODAY, refile,
    rowParse, ILXREPLACE, ilx_base, NIFCELL_NEURON, sgv, make_mutually_disjoint,
    get_transitive_closure — defined elsewhere in this file/package.
    """
    ilx_start = 50114
    graph = makeGraph('NIF-Phenotype-Core', prefixes=PREFIXES)
    graph2 = makeGraph('NIF-Phenotypes', prefixes=PREFIXES)
    eont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/',
                   'NIF-Neuron-Defined',
                   'NIF Neuron Defined Classes',
                   'NIFNEUDEF',
                   'This file contains defined classes derived from neuron phenotypes.',
                   TODAY)
    defined_graph = makeGraph(eont.filename, prefixes=PREFIXES)
    ontid = eont.path + eont.filename + '.ttl'
    defined_graph.add_ont(ontid, *eont[2:])

    # do edges first since we will need them for the phenotypes later
    # TODO real ilx_ids and use prefixes to manage human readability
    with open(refile(__file__, 'resources/neuron_phenotype_edges.csv'), 'rt') as f:
        rows = [r for r in csv.reader(f)]

    lookup = {
        'asymmetric': 'owl:AsymmetricProperty',
        'irreflexive': 'owl:IrreflexiveProperty',
        'functional': 'owl:FunctionalProperty',
    }
    pedges = set()

    def irn(inp):
        # NOTE(review): appears unused in this function body — TODO confirm
        return ILXREPLACE(__name__ + inp)

    # csv columns (by index): 0 name, 3 definition, 6 superproperty,
    # 7 inverse, 8 comma-separated property characteristics
    for row in rows[1:]:
        if row[0].startswith('#') or not row[0]:
            if row[0] == '#references':
                break
            print(row)
            continue
        id_ = PREFIXES['ilx'] + row[0]
        pedges.add(graph.expand('ilx:' + row[0]))
        graph.add_trip(id_, rdflib.RDFS.label, row[0])  # FIXME
        graph.add_trip(id_, rdflib.RDF.type, rdflib.OWL.ObjectProperty)
        if row[3]:
            graph.add_trip(id_, rdflib.namespace.SKOS.definition, row[3])
        if row[6]:
            graph.add_trip(id_, rdflib.RDFS.subPropertyOf, 'ilx:' + row[6])
        if row[7]:
            graph.add_trip(id_, rdflib.OWL.inverseOf, 'ilx:' + row[7])
        if row[8]:
            for t in row[8].split(','):
                t = t.strip()
                graph.add_trip(id_, rdflib.RDF.type, lookup[t])

    with open(refile(__file__, 'resources/neuron_phenotype.csv'), 'rt') as f:
        rows = [r for r in csv.reader(f) if any(r) and not r[0].startswith('#')]

    class PP(rowParse):  # FIXME use add_new in _row_post?
        # rowParse dispatches each csv column to the method of the same name,
        # then calls _row_post per row and _end at the end of the table.
        SCD = 'subClassesDisjoint'
        DJW = 'disjointWith'

        def __init__(self):
            self.ilx_start = ilx_start
            self.parent_child_map = defaultdict(set)
            self.child_parent_map = defaultdict(set)
            self.scd = set()
            super().__init__(rows)

        def ilx_id(self, value):
            self.id_ = graph2.expand(value)
            self.Class = infixowl.Class(self.id_, graph=graph2.g)
            # derive a default label from the CamelCase local name
            label = ' '.join(re.findall(r'[A-Z][a-z]*', self.id_.split(':')[1]))
            self._label = label

        def subClassOf(self, value):
            if value:
                self.parent = graph2.expand(value)
                self.parent_child_map[self.parent].add(self.id_)
                self.child_parent_map[self.id_].add(self.parent)
                self.Class.subClassOf = [self.parent]

        def label(self, value):
            if value:
                self._label = value
                self.Class.label = value
            else:
                self.Class.label = rdflib.Literal(self._label)

        def synonyms(self, value):
            if value:
                for v in value.split(','):
                    graph2.add_trip(self.id_, 'NIFRID:synonym', v)

        def rules(self, value):
            if value == PP.SCD:
                self.scd.add(self.id_)
            elif value.startswith(PP.DJW):
                # "disjointWith a b c" — assert disjointness with each listed id
                [graph2.add_trip(self.id_, rdflib.OWL.disjointWith, _)
                 for _ in value.split(' ')[1:]]

        def use_edge(self, value):
            if value:
                graph2.add_trip(self.id_, 'ilx:useObjectProperty',
                                graph.expand('ilx:' + value))

        def _row_post(self):
            # defined class
            # maps a phenotype root to the class the defined neuron is
            # intersected with (None => no defined class is generated)
            lookup = {
                graph.expand('ilx:AxonPhenotype'): rdflib.URIRef('http://axon.org'),
                graph.expand('ilx:AxonMorphologicalPhenotype'): None,
                graph.expand('ilx:DendritePhenotype'): rdflib.URIRef('http://dendrite.org'),
                graph.expand('ilx:DendriteMorphologicalPhenotype'): None,
                graph.expand('ilx:SomaPhenotype'): rdflib.URIRef('http://soma.org'),
                graph.expand('ilx:SomaMorphologicalPhenotype'): None,
                graph.expand('ilx:NeuronPhenotype'): graph.expand(NIFCELL_NEURON),
                graph.expand('ilx:CellPhenotype'): None,
                graph.expand('ilx:Phenotype'): graph.expand('ilx:Phenotype'),
            }
            if self.id_ in lookup:
                # roots themselves do not get defined classes
                return
            #elif 'Petilla' in self.id_:
                #return
            #else:
                #print(self.id_)

            # hidden label for consturctions
            graph2.add_trip(self.id_, rdflib.namespace.SKOS.hiddenLabel,
                            self._label.rsplit(' Phenotype')[0])

            self.ilx_start += 1
            id_ = defined_graph.expand(ilx_base.format(self.ilx_start))
            defined = infixowl.Class(id_, graph=defined_graph.g)
            #defined.label = rdflib.Literal(self._label.rstrip(' Phenotype') + ' neuron')  # the extra space in rstrip removes 'et ' as well WTF!
            defined.label = rdflib.Literal(self._label.rstrip('Phenotype') + 'neuron')
            #print(self._label)
            print('_row_post ilx_start', self.ilx_start, list(defined.label)[0])

            def getPhenotypeEdge(phenotype):
                print(phenotype)
                edge = 'ilx:hasPhenotype'  # TODO in neuronManager...
                return edge

            edge = getPhenotypeEdge(self.id_)
            restriction = infixowl.Restriction(graph.expand(edge),
                                               graph=defined_graph.g,
                                               someValuesFrom=self.id_)

            # walk up child_parent_map until we hit a known root from lookup
            parent = [p for p in self.child_parent_map[self.id_] if p]
            if parent:
                parent = parent[0]
                while 1:
                    if parent == defined_graph.expand('ilx:NeuronPhenotype'):
                        #defined.subClassOf = [graph.expand(defined_class_parent)]  # XXX this does not produce what we want
                        break
                    #else:
                        #print(parent, graph.expand('ilx:NeuronPhenotype'))

                    #print('xxxxxxxxxxxxxxxx', parent)
                    new_parent = [p for p in self.child_parent_map[parent] if p]
                    if new_parent:
                        parent = new_parent[0]
                    else:
                        break
                # NOTE(review): raises KeyError if the walk terminates on a
                # parent not present in lookup — presumably the csv guarantees
                # every chain ends at a root; verify against the resource file
                phenotype_equiv = lookup[parent]
            else:
                return

            intersection = infixowl.BooleanClass(members=(phenotype_equiv, restriction),
                                                 graph=defined_graph.g)
            ##intersection = infixowl.BooleanClass(members=(restriction,), graph=self.graph.g)

            defined.equivalentClass = [intersection]

        def _end(self):
            for parent in self.scd:
                make_mutually_disjoint(graph2, list(self.parent_child_map[parent]))

    pp = PP()
    ilx_start = pp.ilx_start

    to_add = {}

    def lsn(word):
        # pick the best scigraph hit for word, preferring PR > NIFMOL >
        # UBERON > NCBITaxon > NIFCELL curies (lower rank sorts first)
        rank = defaultdict(lambda: 0)
        rank['PR'] = -100
        rank['NIFMOL'] = -50
        rank['UBERON'] = -10
        rank['NCBITaxon'] = -9
        rank['NIFCELL'] = -8
        sort_rank = lambda r: rank[r['curie'].split(':')[0]]
        to_add[word] = graph2.expand(
            sorted(sgv.findByTerm(word), key=sort_rank)[0]['curie'])  # cheating

    # FIXME naming
    lsn('Parvalbumin')
    lsn('neuropeptide Y')
    lsn('VIP peptides')
    lsn('somatostatin')
    lsn('calbindin')
    lsn('calretinin')
    #for name, iri in to_add.items():  # XXX do not need, is already covered elsewhere
        #print('make_phenotypes ilx_start', ilx_start, name)
        #ilx_start = make_defined(defined_graph, ilx_start, name, iri, 'ilx:hasExpressionPhenotype', parent=expression_defined)

    #syn_mappings['calbindin'] = graph.expand('PR:000004967')  # cheating
    #syn_mappings['calretinin'] = graph.expand('PR:000004968')  # cheating

    ontid = 'http://ontology.neuinfo.org/NIF/ttl/' + graph.name + '.ttl'
    graph.add_ont(
        ontid,
        'NIF Phenotype core',
        comment=
        'This is the core set of predicates used to model phenotypes and the parent class for phenotypes.'
    )
    graph.add_class('ilx:Phenotype', label='Phenotype')
    graph.add_trip(
        'ilx:Phenotype', 'skos:definition',
        'A Phenotype is a binary property of a biological entity. Phenotypes are derived from measurements made on the subject of interest. While Phenotype is not currently placed within the BFO hierarchy, if we were to place it, it would fall under BFO:0000016 -> disposition, since these phenotypes are contingent on the experimental conditions under which measurements were made and are NOT qualities. For consideration: in theory this would mean that disjointness does not make sense, even for things that would seem to be obviously disjoint such as Accomodating and Non-Accomodating. However, this information can still be captured on a subject by subject basis by asserting that for this particular entity, coocurrance of phenotypes is not possilbe. This still leaves the question of whether the class of biological entities that correspond to the bag of phenotypes is implicitly bounded/limited only to the extrinsic and unspecified experimental contidions, some of which are not and cannot be included in a bag of phenotypes. The way to deal with this when we want to include 2 \'same time\' disjoint phenotypes, is to use a logical phenotype to wrap them with an auxillary variable that we think accounts for the difference.'
    )
    #graph.add_trip(ontid, rdflib.RDFS.comment, 'The NIF Neuron ontology holds materialized neurons that are collections of phenotypes.')
    #graph.add_trip(ontid, rdflib.OWL.versionInfo, ONTOLOGY_DEF['version'])
    #graph.g.commit()
    #get_defined_classes(graph)  # oops...
    graph.write()  # moved below to incorporate uwotm8

    ontid2 = 'http://ontology.neuinfo.org/NIF/ttl/' + graph2.name + '.ttl'
    graph2.add_ont(
        ontid2,
        'NIF Phenotypes',
        comment=
        'A taxonomy of phenotypes used to model biological types as collections of measurements.'
    )
    graph2.add_trip(ontid2, 'owl:imports', ontid)
    graph2.write()

    # map every synonym AND label string back to its subject; later entries
    # (labels) win over earlier ones (synonyms) on collision
    syn_mappings = {}
    for sub, syn in [
            _ for _ in graph.g.subject_objects(graph.expand('NIFRID:synonym'))
    ] + [_ for _ in graph.g.subject_objects(rdflib.RDFS.label)]:
        syn = syn.toPython()
        if syn in syn_mappings:
            print('ERROR duplicate synonym!', syn, sub)
        syn_mappings[syn] = sub

    phenotypes = [
        s for s, p, o in graph.g.triples((None, None, None))
        if ' Phenotype' in o
    ]
    inc = get_transitive_closure(
        graph, rdflib.RDFS.subClassOf,
        graph.expand('ilx:NeuronPhenotype'))  # FIXME not very configurable...

    return syn_mappings, pedges, ilx_start, inc, defined_graph
def chebi_imp():
    """Interactive pipeline that builds the chebi-bridge ontology.

    Loads chebislim, chebi-dead, NIF-Chemical and NIF-Molecule, normalizes
    dead/alternate CHEBI identifiers, diffs the NIF annotations against
    upstream CHEBI, applies hand-curated exclusions, writes chebi-bridge,
    strips the migrated CHEBI classes out of NIF-Chemical/NIF-Molecule, and
    drops into an IPython shell (embed()) for manual review.

    NOTE(review): hard-coded /home/tom/... paths and the trailing embed()
    mark this as a one-off curation script, not library code.
    """
    PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl',
                            'CHEBI', 'owl', 'skos', 'oboInOwl')  # NOTE(review): 'oboInOwl' listed twice — harmless duplicate?  TODO confirm
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    with open('resources/chebi-subset-ids.txt', 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        # snapshot: triple count per subset id, used to detect which ids
        # gained/lost triples between pipeline stages
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom           ion
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    # g accumulates the union of all four source graphs; a1..a4 checkpoint
    # the per-id triple counts after each merge
    g = rdflib.Graph()
    cg = rdflib.Graph()
    cd = rdflib.Graph()
    chemg = rdflib.Graph()
    molg = rdflib.Graph()

    #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle')
    cg.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle')
    cd.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle')
    chemg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle')
    molg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    # rewrite every dead id to its replacement, keeping the old id as an
    # oboInOwl:hasAlternateId literal
    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_node(r, 'oboInOwl:hasAlternateId',
                        rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove((r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('', graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId')
        ng.replace_uriref('BIRNANN:ChEBIid', 'oboInOwl:id')

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        # alternate ids must be xsd:string literals (curies), not URIRefs
        hai = ug.expand('oboInOwl:hasAlternativeId')
        i = ug.expand('oboInOwl:id')
        makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))  # amazlingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        for s, o in g.subject_objects(i):
            inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    # an id whose triple count differs across checkpoints picked up triples
    # from NIF-Chemical/NIF-Molecule that chebislim lacks — flag for review
    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches)
              if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [
        set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))])
        for u, _ in review
    ]
    wat_a = [
        set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
        for u, _ in review
    ]
    wat_c_ = [
        set(cg.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [
        set(g.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'OBOANN', 'BIRNANN'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
                 #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
        imports=(
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith('ub'):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(sub)  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        # True iff o is an (indirect) superclass of s according to chebislim
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(
            tuple(
                ug.expand(_) if type(_) is not rdflib.Literal else _
                for _ in t))
        cb.del_trip(*t)

    curateOut(
        'CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367'
    )  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut(
        'CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870'
    )  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    curateOut('CHEBI:50906', 'rdfs:label',
              rdflib.Literal('Chemical role', datatype=rdflib.XSD.string)
              )  # chebi already has a chemical role...
    curateOut(
        'CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432'
    )  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property
    curateOut('CHEBI:22720', 'rdfs:subClassOf',
              'CHEBI:27171')  # not all children are bicyclic
    curateOut(
        'CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188'
    )  # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut(
        'CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171'
    )  # not all children are bicyclic, some may be poly, therefore removing
    curateOut(
        'CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232'
    )  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf',
              'CHEBI:35338')  # removing since chebi models this with has part
    curateOut(
        'CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720'
    )  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(o) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(
            tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    # human-readable dump of the surviving subClassOf assertions for review
    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write()  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        # normalize literals to plain strings so graphs compare by value
        return set((s, str(o) if type(o) is rdflib.Literal else o)
                   for s, p, o in graph)

    # triples still unique to NIF-Chemical / NIF-Molecule after bridging
    cmc = getChebis(((((nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) -
                     nodt(intc)) - nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(((((nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) -
                     nodt(intc)) - nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()

    embed()
def clean_hbp_cell():
    """Convert the HBP cell ontology into a NIF import file.

    Reads ~/git/methodsOntology/ttl/hbp_cell_ontology.ttl, rewrites its
    triples into a new NIF-Neuron-HBP-cell-import graph (collapsing skipped
    ids, converting two ids to phenotype restrictions, following
    equivalentClass redirects, fixing labels/predicates), converts HBP_CELL
    ids to interlex ids, and returns (mg, replace_map) where replace_map
    maps original 'HBP_CELL:…' curies to their replacements.

    NOTE(review): depends on module-level PREFIXES, NEURON, makeGraph,
    expand, v (scigraph client), ilx_get_start, ilx_conv, ilx_add_ids.
    """
    #old graph
    g = rdflib.Graph()
    g.parse(os.path.expanduser('~/git/methodsOntology/ttl/hbp_cell_ontology.ttl'), format='turtle')
    g.remove((None, rdflib.OWL.imports, None))
    g.remove((None, rdflib.RDF.type, rdflib.OWL.Ontology))

    #new graph
    NAME = 'NIF-Neuron-HBP-cell-import'
    mg = makeGraph(NAME, prefixes=PREFIXES)
    ontid = 'http://ontology.neuinfo.org/NIF/ttl/generated/' + NAME + '.ttl'
    mg.add_node(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    mg.add_node(ontid, rdflib.RDFS.label, 'NIF Neuron HBP cell import')
    mg.add_node(ontid, rdflib.RDFS.comment, 'this file was automatically using pyontutils/hbp_cells.py')
    mg.add_node(ontid, rdflib.OWL.versionInfo, date.isoformat(date.today()))
    newgraph = mg.g

    # HBP_CELL local ids whose subjects are dropped and replaced wholesale
    skip = {
        '0000000':'NIFCELL:sao1813327414',  # cell
        #'0000001':NEURON,  # neuron (equiv)
        #'0000002':'NIFCELL:sao313023570',  # glia (equiv)
        #'0000021':'NIFNEURNT:nlx_neuron_nt_090804',  # glut (equiv, but phen)
        #'0000022':'NIFNEURNT:nlx_neuron_nt_090803',  # gaba (equiv, but phen)
        '0000003':NEURON,
        '0000004':NEURON,
        '0000005':NEURON,
        '0000006':NEURON,
        '0000007':NEURON,
        '0000008':NEURON,
        '0000009':NEURON,
        '0000010':NEURON,
        '0000019':NEURON,
        '0000020':NEURON,
        '0000033':NEURON,
        '0000034':NEURON,
        '0000070':NEURON,
        '0000071':NEURON,
    }
    # ids modelled as NEURON + a someValuesFrom restriction instead of a class
    to_phenotype = {
        '0000021':('ilx:hasExpressionPhenotype', 'NIFMOL:sao1744435799'),  # glut, all classes that might be here are equived out
        # NOTE(review): 'hasExperssionPhenotype' looks like a typo of
        # hasExpressionPhenotype, but it may be the curie actually registered
        # downstream — left untouched; confirm before correcting.
        '0000022':('ilx:hasExperssionPhenotype', 'NIFMOL:sao229636300'),  # gaba
    }
    lookup = {'NIFCELL', 'NIFNEURNT'}
    missing_supers = {
        'HBP_CELL:0000136',
        'HBP_CELL:0000137',
        'HBP_CELL:0000140',
    }

    replace = set()
    phen = set()
    equiv = {}
    for triple in sorted(g.triples((None, None, None))):
        id_suffix = newgraph.namespace_manager.compute_qname(triple[0].toPython())[2]
        try:
            obj_suffix = newgraph.namespace_manager.compute_qname(triple[2].toPython())[2]
        except:  # it wasn't a url
            pass
        # equiv insert for help
        if triple[1] == rdflib.OWL.equivalentClass and id_suffix not in skip and id_suffix not in to_phenotype:
            qnt = newgraph.namespace_manager.compute_qname(triple[2].toPython())
            #print(qnt)
            if qnt[0] in lookup:
                try:
                    lab = v.findById(qnt[0] + ':' + qnt[2])['labels'][0]
                    print('REMOTE', qnt[0] + ':' + qnt[2], lab)
                    #mg.add_node(triple[2], rdflib.RDFS.label, lab)
                    #mg.add_node(triple[0], PREFIXES['OBOANN'] + 'synonym', lab)  # so we can see it
                except TypeError:
                    if qnt[2].startswith('nlx'):
                        # unresolvable nlx id: rehome it under NIFSTD
                        triple = (triple[0], triple[1], expand('NIFSTD:' + qnt[2]))
                    #print('bad identifier')

        #check for equiv
        if triple[0] not in equiv:
            eq = [o for o in g.objects(triple[0], rdflib.OWL.equivalentClass)]
            if eq and id_suffix not in skip and id_suffix not in to_phenotype:
                if len(eq) > 1:
                    print(eq)
                equiv[triple[0]] = eq[0]
                continue
        elif triple[0] in equiv:
            continue

        # edge replace
        if triple[1].toPython() == 'http://www.FIXME.org/nsupper#synonym':
            edge = rdflib.URIRef('http://ontology.neuinfo.org/NIF/Backend/OBO_annotation_properties.owl#abbrev')
        elif triple[1].toPython() == 'http://www.FIXME.org/nsupper#definition':
            edge = rdflib.namespace.SKOS.definition
        else:
            edge = triple[1]

        # skip or to phenotype or equiv
        if id_suffix in skip:  # have to make a manual edit to rdflib to include 'Nd' in allowed 1st chars
            replace.add(triple[0])
            #print('MEEP MEEP')
        elif id_suffix in to_phenotype:  # have to make a manual edit to rdflib to include 'Nd' in allowed 1st chars
            phen.add(triple[0])
        elif triple[1] == rdflib.RDFS.label:  # fix labels
            if not triple[2].startswith('Hippocampus'):
                new_label = rdflib.Literal('Neocortex ' + triple[2], lang='en')
                newgraph.add((triple[0], edge, new_label))
            else:
                newgraph.add((triple[0], edge, triple[2]))
        elif triple[2] in replace:
            mg.add_node(triple[0], edge, skip[obj_suffix])
        elif triple[2] in phen:
            edge_, rst_on = to_phenotype[obj_suffix]
            edge_ = expand(edge_)
            rst_on = expand(rst_on)

            this = triple[0]
            this = infixowl.Class(this, graph=newgraph)
            this.subClassOf = [expand(NEURON)] + [c for c in this.subClassOf]

            restriction = infixowl.Restriction(edge_, graph=newgraph, someValuesFrom=rst_on)
            this.subClassOf = [restriction] + [c for c in this.subClassOf]
        elif triple[2] in equiv:
            newgraph.add((triple[0], edge, equiv[triple[2]]))
        else:
            newgraph.add((triple[0], edge, triple[2]))

    # final cleanup for forward references (since we iterate through sorted)
    tt = rdflib.URIRef(expand('HBP_CELL:0000033'))
    tf = rdflib.URIRef(expand('HBP_CELL:0000034'))
    newgraph.remove((None, None, tt))
    newgraph.remove((None, None, tf))

    # add missing subClasses
    for nosub in missing_supers:
        mg.add_node(nosub, rdflib.RDFS.subClassOf, NEURON)

    # cleanup for subClassOf
    for subject in sorted(newgraph.subjects(rdflib.RDFS.subClassOf, expand(NEURON))):
        sco = [a for a in newgraph.triples((subject, rdflib.RDFS.subClassOf, None))]
        #print('U WOT M8')
        if len(sco) > 1:
            #print('#############\n', sco)
            for s, p, o in sco:
                # NOTE(review): `and` binds tighter than `or` here, so this reads
                # "hbp_cell_ontology in o, OR (NIF-Cell in o AND o != NEURON)";
                # the parenthesization may not match the author's intent — verify.
                if 'hbp_cell_ontology' in o or 'NIF-Cell' in o and o != expand(NEURON):  #or 'sao2128417084' in o:  # neocortex pyramidal cell
                    #print(sco)
                    newgraph.remove((subject, rdflib.RDFS.subClassOf, expand(NEURON)))
                    break

    # do ilx
    ilx_start = ilx_get_start()
    #ilx_conv_mem = memoize('hbp_cell_interlex.json')(ilx_conv)  # FIXME NOPE, also need to modify the graph :/
    ilx_labels, ilx_replace = ilx_conv(graph=newgraph, prefix='HBP_CELL', ilx_start=ilx_start)
    ilx_add_ids(ilx_labels)

    with open('hbp_cell_ilx_ids.json', 'wt') as f:
        json.dump(ilx_replace, f)

    replace_map = ilx_replace
    for hbp, rep in skip.items():
        ori = 'HBP_CELL:'+hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        replace_map[ori] = rep
    for hbp, (e, rep) in to_phenotype.items():
        ori = 'HBP_CELL:'+hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        # BUG FIX: the original stored `edge, rep`, where `edge` was the stale
        # predicate left over from the triples loop above; the intended value
        # is `e`, the phenotype edge unpacked from to_phenotype itself.
        replace_map[ori] = e, rep
    for hbp_iri, rep_iri in equiv.items():
        hbp = newgraph.compute_qname(hbp_iri)[2]
        rep = newgraph.qname(rep_iri)
        ori = 'HBP_CELL:'+hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        replace_map[ori] = rep

    return mg, replace_map
#!/usr/bin/env python3.5 import rdflib from utils import makePrefixes, makeGraph PREFIXES = makePrefixes('NIFGA', 'NIFSTD', 'owl') g = rdflib.Graph() g.parse( 'http://purl.obolibrary.org/obo/uberon/bridge/uberon-bridge-to-nifstd.owl', format='xml') name = 'NIFGA-Equivs' ng = makeGraph(name, PREFIXES) [ ng.g.add(t) for t in ((rdflib.URIRef(PREFIXES['NIFGA'] + o.rsplit('/', 1)[-1]), p, o) for s, p, o in g.triples((None, rdflib.OWL.equivalentClass, None))) ] ng.add_ont('http://ontology.neuinfo.org/NIF/ttl/generated/' + name + '.ttl', 'NIFGA to NIFSTD mappings') ng.write()
def clean_hbp_cell():
    """Convert the HBP cell ontology into a NIF-importable form.

    Reads ~/git/methodsOntology/ttl/hbp_cell_ontology.ttl, rewrites its
    triples into a new graph (relabeling, replacing identifiers with SAO/NIF
    equivalents, converting two expression classes into phenotype
    restrictions), assigns ILX identifiers, and writes the replacement map
    to hbp_cell_ilx_ids.json.

    Returns:
        (mg, replace_map): the output makeGraph wrapper and a dict mapping
        'HBP_CELL:NNNNNNN' curies to their replacement identifiers (either a
        single curie or an (edge, curie) pair for phenotype conversions).

    Raises:
        KeyError: if the same HBP_CELL identifier would be replaced twice.
    """
    # old graph: load and strip ontology-level metadata
    g = rdflib.Graph()
    g.parse(
        os.path.expanduser('~/git/methodsOntology/ttl/hbp_cell_ontology.ttl'),
        format='turtle')
    g.remove((None, rdflib.OWL.imports, None))
    g.remove((None, rdflib.RDF.type, rdflib.OWL.Ontology))

    # new graph
    NAME = 'NIF-Neuron-HBP-cell-import'
    mg = makeGraph(NAME, prefixes=PREFIXES)
    ontid = 'http://ontology.neuinfo.org/NIF/ttl/generated/' + NAME + '.ttl'
    mg.add_trip(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    mg.add_trip(ontid, rdflib.RDFS.label, 'NIF Neuron HBP cell import')
    # FIX: comment text was missing the word 'generated'
    mg.add_trip(ontid, rdflib.RDFS.comment,
                'this file was automatically generated using pyontutils/hbp_cells.py')
    mg.add_trip(ontid, rdflib.OWL.versionInfo, date.isoformat(date.today()))
    newgraph = mg.g

    # HBP_CELL suffixes whose classes are replaced outright by these curies
    skip = {
        '0000000': 'SAO:1813327414',  # cell
        #'0000001':NEURON,  # neuron (equiv)
        #'0000002':'SAO:313023570',  # glia (equiv)
        #'0000021':'NLXNEURNT:090804',  # glut (equiv, but phen)
        #'0000022':'NLXNEURNT:090803',  # gaba (equiv, but phen)
        '0000003': NEURON,
        '0000004': NEURON,
        '0000005': NEURON,
        '0000006': NEURON,
        '0000007': NEURON,
        '0000008': NEURON,
        '0000009': NEURON,
        '0000010': NEURON,
        '0000019': NEURON,
        '0000020': NEURON,
        '0000033': NEURON,
        '0000034': NEURON,
        '0000070': NEURON,
        '0000071': NEURON,
    }
    # HBP_CELL suffixes converted into (phenotype edge, value) restrictions
    to_phenotype = {
        '0000021': ('ilx:hasExpressionPhenotype',
                    'SAO:1744435799'),  # glut, all classes that might be here are equived out
        # FIX: was 'ilx:hasExperssionPhenotype' (typo) on the gaba entry
        '0000022': ('ilx:hasExpressionPhenotype', 'SAO:229636300'),  # gaba
    }
    lookup = {'NIFCELL', 'NIFNEURNT'}
    missing_supers = {
        'HBP_CELL:0000136',
        'HBP_CELL:0000137',
        'HBP_CELL:0000140',
    }

    replace = set()
    phen = set()
    equiv = {}
    for triple in sorted(g.triples((None, None, None))):
        id_suffix = newgraph.namespace_manager.compute_qname(
            triple[0].toPython())[2]
        try:
            obj_suffix = newgraph.namespace_manager.compute_qname(
                triple[2].toPython())[2]
        except:  # it wasn't a url
            pass

        # equiv insert for help
        if (triple[1] == rdflib.OWL.equivalentClass
                and id_suffix not in skip and id_suffix not in to_phenotype):
            qnt = newgraph.namespace_manager.compute_qname(triple[2].toPython())
            #print(qnt)
            if qnt[0] in lookup:
                try:
                    lab = v.findById(qnt[0] + ':' + qnt[2])['labels'][0]
                    print('REMOTE', qnt[0] + ':' + qnt[2], lab)
                    #mg.add_trip(triple[2], rdflib.RDFS.label, lab)
                    #mg.add_trip(triple[0], PREFIXES['NIFRID'] + 'synonym', lab)  # so we can see it
                except TypeError:
                    if qnt[2].startswith('nlx'):
                        # NOTE(review): retargets bare nlx ids into NIFSTD
                        triple = (triple[0], triple[1],
                                  expand('NIFSTD:' + qnt[2]))
                    #print('bad identifier')

        # check for equiv: record each subject's equivalent class once and
        # drop its other triples (they are re-emitted under the equivalent)
        if triple[0] not in equiv:
            eq = [o for o in g.objects(triple[0], rdflib.OWL.equivalentClass)]
            if eq and id_suffix not in skip and id_suffix not in to_phenotype:
                if len(eq) > 1:
                    print(eq)
                equiv[triple[0]] = eq[0]
                continue
        elif triple[0] in equiv:
            continue

        # edge replace
        if triple[1].toPython() == 'http://www.FIXME.org/nsupper#synonym':
            edge = mg.expand('NIFRID:abbrev')
        elif triple[1].toPython() == 'http://www.FIXME.org/nsupper#definition':
            edge = rdflib.namespace.SKOS.definition
        else:
            edge = triple[1]

        # skip or to phenotype or equiv
        if id_suffix in skip:  # have to make a manual edit to rdflib to include 'Nd' in allowed 1st chars
            replace.add(triple[0])
            #print('MEEP MEEP')
        elif id_suffix in to_phenotype:  # have to make a manual edit to rdflib to include 'Nd' in allowed 1st chars
            phen.add(triple[0])
        elif triple[1] == rdflib.RDFS.label:  # fix labels
            if not triple[2].startswith('Hippocampus'):
                new_label = rdflib.Literal('Neocortex ' + triple[2], lang='en')
                newgraph.add((triple[0], edge, new_label))
            else:
                newgraph.add((triple[0], edge, triple[2]))
        elif triple[2] in replace:
            mg.add_trip(triple[0], edge, skip[obj_suffix])
        elif triple[2] in phen:
            # convert the object into an existential restriction on the
            # phenotype edge and attach it as a superclass
            edge_, rst_on = to_phenotype[obj_suffix]
            edge_ = expand(edge_)
            rst_on = expand(rst_on)
            this = triple[0]
            this = infixowl.Class(this, graph=newgraph)
            this.subClassOf = [expand(NEURON)] + [c for c in this.subClassOf]
            restriction = infixowl.Restriction(edge_, graph=newgraph,
                                               someValuesFrom=rst_on)
            this.subClassOf = [restriction] + [c for c in this.subClassOf]
        elif triple[2] in equiv:
            newgraph.add((triple[0], edge, equiv[triple[2]]))
        else:
            newgraph.add((triple[0], edge, triple[2]))

    # final cleanup for forward references (since we iterate through sorted)
    tt = rdflib.URIRef(expand('HBP_CELL:0000033'))
    tf = rdflib.URIRef(expand('HBP_CELL:0000034'))
    newgraph.remove((None, None, tt))
    newgraph.remove((None, None, tf))

    # add missing subClasses
    for nosub in missing_supers:
        mg.add_trip(nosub, rdflib.RDFS.subClassOf, NEURON)

    # cleanup for subClassOf: drop the generic NEURON superclass when a more
    # specific one from hbp_cell_ontology or NIF-Cell is present
    for subject in sorted(
            newgraph.subjects(rdflib.RDFS.subClassOf, expand(NEURON))):
        sco = [a for a in
               newgraph.triples((subject, rdflib.RDFS.subClassOf, None))]
        #print('U WOT M8')
        if len(sco) > 1:
            #print('#############\n', sco)
            for s, p, o in sco:
                # NOTE(review): `or` binds looser than `and`, so the
                # o != NEURON guard only applies to the NIF-Cell arm —
                # preserved as-is; confirm this precedence is intended
                if 'hbp_cell_ontology' in o or 'NIF-Cell' in o and o != expand(
                        NEURON):  #or 'sao2128417084' in o:  # neocortex pyramidal cell
                    #print(sco)
                    newgraph.remove(
                        (subject, rdflib.RDFS.subClassOf, expand(NEURON)))
                    break

    # do ilx
    ilx_start = ilx_get_start()
    #ilx_conv_mem = memoize('hbp_cell_interlex.json')(ilx_conv)  # FIXME NOPE, also need to modify the graph :/
    ilx_labels, ilx_replace = ilx_conv(graph=newgraph,
                                       prefix='HBP_CELL',
                                       ilx_start=ilx_start)
    ilx_add_ids(ilx_labels)

    with open('hbp_cell_ilx_ids.json', 'wt') as f:
        json.dump(ilx_replace, f)

    replace_map = ilx_replace
    for hbp, rep in skip.items():
        ori = 'HBP_CELL:' + hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        replace_map[ori] = rep
    for hbp, (e, rep) in to_phenotype.items():
        ori = 'HBP_CELL:' + hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        # FIX: was `replace_map[ori] = edge, rep`, storing the stale `edge`
        # left over from the triples loop instead of this entry's edge `e`
        replace_map[ori] = e, rep
    for hbp_iri, rep_iri in equiv.items():
        hbp = newgraph.compute_qname(hbp_iri)[2]
        rep = newgraph.qname(rep_iri)
        ori = 'HBP_CELL:' + hbp
        if ori in replace_map:
            raise KeyError('identifier already in!??! %s' % ori)
        replace_map[ori] = rep

    return mg, replace_map
from glob import glob from rdflib.namespace import SKOS from parcellation import OntMeta from utils import TODAY, makeGraph, makePrefixes PREFIXES = makePrefixes('SCR', 'MBA', 'NIFMOL', 'NIFNEURON', 'NIFCELL', 'NIFGA', 'UBERON', 'PR', 'NIFNEURMOR', 'skos', 'owl') ont = OntMeta( 'http://ontology.neuinfo.org/NIF/ttl/generated/', 'ksdesc-defs', 'Knolwedge Space Defs', 'KSDEFS', 'Definitions from knowledge space descriptions. Generated by pyontutils/ksdesc_bridge.py', TODAY) ontid = ont.path + ont.filename + '.ttl' g = makeGraph(ont.filename, prefixes=PREFIXES) g.add_ont(ontid, *ont[2:]) top_level = glob(os.path.expanduser('~/git/ksdesc/') + '*') for putative_dir in top_level: if os.path.isdir(putative_dir): for putative_md in glob(putative_dir + '/*.md'): ident = os.path.split(putative_dir)[-1] + ':' + os.path.splitext( os.path.split(putative_md)[-1])[0] print(ident) with open(putative_md, 'rt') as f: def_ = f.read() for test in ('Description', 'Definition'): if test in def_:
def chebi_make():
    """Build the NIF ChEBI slim (chebislim.ttl) and the deprecated-id bridge
    (chebi-dead.ttl) from the nightly full ChEBI OWL dump.

    Reads the curie list from chebi-subset-ids.txt, downloads and decompresses
    the nightly chebi.owl.gz, prunes the XML down to the wanted classes plus
    their transitive superclasses, then round-trips through rdflib to emit the
    two output graphs.

    Raises:
        ValueError: if a requested id has neither triples nor an
            hasAlternativeId replacement and is not in the known-deprecated
            whitelist.
    """
    PREFIXES = makePrefixes('definition', 'hasRole', 'CHEBI', 'owl', 'skos',
                            'oboInOwl')
    dPREFIXES = makePrefixes('CHEBI', 'replacedBy', 'owl', 'skos')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)

    IDS_FILE = 'chebi-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
    # full URIs for the same ids, for matching against the OWL XML
    ids = set((ug.expand(_.strip()).toPython() for _ in ids_raw))

    #gzed = requests.get('http://localhost:8000/chebi.owl')
    #raw = BytesIO(gzed.content)
    gzed = requests.get(
        'http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz')
    raw = BytesIO(gzip.decompress(gzed.content))
    t = etree.parse(raw)
    r = t.getroot()
    cs = r.getchildren()

    # owl:Class elements whose rdf:about (first attribute value) is wanted
    classes = [_ for _ in cs
               if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
               and _.values()[0] in ids]
    ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
    ops = t.xpath("/*[local-name()='RDF']/*[local-name()='ObjectProperty']")  # TODO

    # ids that were replaced: map old-id text -> the replacing Class element
    rpl_check = t.xpath("/*[local-name()='RDF']/*[local-name()='Class']"
                        "/*[local-name()='hasAlternativeId']")
    rpl_dict = {_.text: _.getparent() for _ in rpl_check if _.text in ids_raw}
    # we also need to have any new classes that have replaced old ids
    also_classes = list(rpl_dict.values())

    def rec(start_set, done):
        """Collect transitive superclasses (direct and via someValuesFrom
        restrictions) of start_set; returns (new Class elements, their ids)."""
        ids_ = set()
        for c in start_set:
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']")
                if _.items()
            ])
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']"
                    "/*[local-name()='Restriction']"
                    "/*[local-name()='someValuesFrom']")
                if _.items()
            ])
        supers = [_ for _ in cs
                  if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
                  and _.values()[0] in ids_ and _ not in done]
        if supers:
            msup, mids = rec(supers, done + supers)
            supers += msup
            ids_.update(mids)
        return supers, ids_

    a = ontology + ops + classes + also_classes
    more, mids = rec(a, a)
    all_ = set(a + more)
    r.clear()  # wipe all the stuff we don't need
    for c in all_:
        r.append(c)

    data = etree.tostring(r)
    g = rdflib.Graph()
    # now _this_ is stupidly slow (like 20 minutes of slow) might make more
    # sense to do the xml directly?
    g.parse(data=data)

    src_version = list(
        g.query('SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology .'
                ' ?temp owl:versionIRI ?match . }'))[0][0]

    ont = OntMeta(
        'http://ontology.neuinfo.org/NIF/ttl/generated/', 'chebislim',
        'NIF ChEBI slim', 'chebislim',
        'This file is generated by pyontutils/slimgen from the full ChEBI'
        ' nightly at versionIRI %s based on the list of terms in %s.'
        % (src_version, IDS_FILE), TODAY)
    dont = OntMeta(
        'http://ontology.neuinfo.org/NIF/ttl/generated/', 'chebi-dead',
        'NIF ChEBI deprecated', 'chebidead',
        # FIX: was 'resolvablefrom' (missing space) in the emitted description
        'This file is generated by pyontutils/slimgen to make deprecated'
        ' classes resolvable from the full ChEBI nightly at versionIRI %s'
        ' based on the list of terms in %s.' % (src_version, IDS_FILE), TODAY)

    new_graph = makeGraph(ont.filename, PREFIXES)
    ontid = ont.path + ont.filename + '.ttl'
    new_graph.add_ont(ontid, *ont[2:])
    chebi_dead = makeGraph(dont.filename, dPREFIXES)
    dontid = dont.path + dont.filename + '.ttl'
    chebi_dead.add_ont(dontid, *dont[2:])

    # known-deprecated ids with no replacement; tolerated silently
    depwor = {'CHEBI:33243': 'natural product',  # FIXME remove these?
              'CHEBI:36809': 'tricyclic antidepressant',
             }

    for id_ in sorted(set(ids_raw) |
                      set((ug.g.namespace_manager.qname(_) for _ in mids))):
        eid = ug.expand(id_)
        trips = list(g.triples((eid, None, None)))
        if not trips:
            # looks for the id_ as a literal
            alts = list(g.triples((
                None,
                rdflib.term.URIRef(
                    'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'),
                rdflib.Literal(id_, datatype=rdflib.term.URIRef(
                    'http://www.w3.org/2001/XMLSchema#string')))))
            if alts:
                replaced_by, _, __ = alts[0]
                if replaced_by.toPython() not in ids:
                    # we need to add any replacement classes to the bridge
                    print('REPLACED BY NEW CLASS', id_)
                    for t in g.triples((replaced_by, None, None)):
                        new_graph.add_recursive(t, g)
                chebi_dead.add_class(id_)
                chebi_dead.add_node(id_, 'replacedBy:', replaced_by)
                chebi_dead.add_node(id_, rdflib.OWL.deprecated, True)
            else:
                if id_ not in depwor:
                    # FIX: was `raise BaseException(...)`; ValueError is a
                    # normal, catchable exception (still caught by any
                    # existing `except BaseException` handler)
                    raise ValueError('wtf error', id_)
        else:
            for trip in trips:
                new_graph.add_recursive(trip, g)

    new_graph.write()
    chebi_dead.write()
    embed()  # drop into an IPython shell for post-hoc inspection