def test_parse_shared_bnode_context(self):
    bnode_ctx = dict()
    g = ConjunctiveGraph()
    h = ConjunctiveGraph()
    g.parse(self.data, format="nquads", bnode_context=bnode_ctx)
    self.data.seek(0)
    h.parse(self.data, format="nquads", bnode_context=bnode_ctx)
    self.assertEqual(set(h.subjects()), set(g.subjects()))
def test_parse_distinct_bnode_context(self):
    g = ConjunctiveGraph()
    g.parse(self.data, format="nquads", bnode_context=dict())
    s1 = set(g.subjects())
    self.data.seek(0)
    g.parse(self.data, format="nquads", bnode_context=dict())
    s2 = set(g.subjects())
    self.assertNotEqual(set(), s2 - s1)
def test_parse_distinct_bnode_contexts_between_graphs(self):
    g = ConjunctiveGraph()
    h = ConjunctiveGraph()
    g.parse(self.data, format="nquads")
    s1 = set(g.subjects())
    self.data.seek(0)
    h.parse(self.data, format="nquads")
    s2 = set(h.subjects())
    self.assertNotEqual(s1, s2)
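The three tests above hinge on how `bnode_context` scopes blank-node labels across parses. A minimal self-contained sketch of that behavior, assuming rdflib's N-Quads parser (the data is illustrative):

from rdflib import ConjunctiveGraph

nquads = '_:a <http://example.org/p> "v" <http://example.org/g> .\n'
shared_ctx = {}
g = ConjunctiveGraph()
g.parse(data=nquads, format="nquads", bnode_context=shared_ctx)
g.parse(data=nquads, format="nquads", bnode_context=shared_ctx)
# With a shared context the label _:a maps to the same BNode both times,
# so the graph still has exactly one distinct subject; with a fresh
# context per parse (as in the tests above) each parse mints a new BNode.
assert len(set(g.subjects())) == 1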
def verify_rdf(rdf_output):
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")
    assert len(g) == 6
    assert len(set(g.subjects())) == 2
    assert len(set(g.predicates())) == 3
    assert len(set(g.objects())) == 6
def test_null_values_with_single_string():
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.single.csv-metadata.json")
    rdf_contents = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="turtle")

    # There should be no subject NA
    all_subjects = {x for x in g.subjects()}
    assert subj_ns['null_key'] not in all_subjects
    assert subj_ns['1'] in all_subjects
    assert len(all_subjects) == 4

    # Null-valued objects should not be created
    all_objects = {x for x in g.objects()}
    assert Literal('null_key', datatype=XSD.token) not in all_objects
    assert Literal('null_sector') not in all_objects
    assert Literal('null_id', datatype=XSD.token) not in all_objects
    assert Literal('PUBLIC') in all_objects
    assert Literal('12', datatype=XSD.token) in all_objects

    # Spot-check that some triples do not exist while others from the same row do
    null_key_lit = Literal('null_id', datatype=XSD.token)
    assert len(list(g.triples((subj_ns['2'], id_uri, null_key_lit)))) == 0
    priv_lit = Literal('PRIVATE')
    assert len(list(g.triples((subj_ns['2'], sect_uri, priv_lit)))) == 1
    null_sector_lit = Literal('null_sector')
    assert len(list(g.triples((subj_ns['3'], sect_uri, null_sector_lit)))) == 0
    twelve_lit = Literal('12', datatype=XSD.token)
    assert len(list(g.triples((subj_ns['3'], id_uri, twelve_lit)))) == 1
def get_mediator_vocabs(userid):
    vocabs = {}
    if not os.path.isfile(os.path.join(ag.mediatorsdir, '%s.rdf' % userid)):
        # Cannot find the mediator file; return the empty dict.
        # (The original code had a bare string expression here, which is a no-op.)
        return vocabs
    # Get list of vocabularies created by userid
    graph = Graph()
    graph.parse(os.path.join(ag.mediatorsdir, '%s.rdf' % userid))
    for v in graph.subjects(namespaces['dcterms']['mediator'], None):
        k = v.split('/')[-1]
        svn_src = "http://damssupport.ouls.ox.ac.uk/trac/vocab/browser/trunks/internalVocabularies/%s" % k
        vocabs[k] = (v, svn_src)
    return vocabs
def get_rdf_metadata(self, uniprot_id):
    """Retrieve RDF metadata for the given UniProt accession.

    XXX Not finished. XML parsing looks to be more straightforward.
    """
    from rdflib import ConjunctiveGraph as Graph
    url_base = "%s/uniprot/%s.rdf"
    full_url = url_base % (self._server, uniprot_id)
    graph = Graph()
    with self._get_open_handle(full_url) as in_handle:
        graph.parse(in_handle)
    main_subject = [s for s in list(set(graph.subjects()))
                    if s.split('/')[-1] == uniprot_id][0]
    for sub, pred, obj in graph:
        print sub, pred, obj
def test_default():
    csvw = CSVW(csv_path='tests/virtual1.csv',
                metadata_path='tests/virtual1.default.csv-metadata.json')
    rdf_output = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")

    all_subjects = {x for x in g.subjects()}
    assert len(all_subjects) == 4

    ns = Namespace("http://example.org/")
    assert ns['sub-1'] in all_subjects
    assert ns['sub-2'] in all_subjects
    # list(...) consumes the generator; the original [g.triples(...)] wrapped
    # the generator itself in a list, so the length was always 1
    assert len(list(g.triples((ns['sub-1'], ns['obj-1'], ns['myvalue'])))) == 1
    assert len(list(g.triples((ns['sub-2'], ns['obj-2'], ns['myvalue'])))) == 1
def get_mediator_details(userid):
    # Get mediator details - firstname, lastname, department, email
    details = {}
    details['userid'] = userid
    details['uri'] = None
    details['name'] = None
    details['fname'] = None
    details['lname'] = None
    details['title'] = None
    details['email'] = None
    details['dept'] = []
    if userid.startswith('uuid'):
        userid = get_mediator_account(userid)
        details['userid'] = userid
        if not userid:
            return details
    if not os.path.isfile(os.path.join(ag.mediatorsdir, '%s.rdf' % userid)):
        return details
    graph = Graph()
    graph.parse(os.path.join(ag.mediatorsdir, '%s.rdf' % userid))
    t = ''
    f = ''
    l = ''
    for title in graph.objects(None, namespaces['foaf']['title']):
        if title.strip():
            t = title
            details['title'] = t
    for fname in graph.objects(None, namespaces['foaf']['firstName']):
        if fname.strip():
            f = fname
            details['fname'] = fname
    for lname in graph.objects(None, namespaces['foaf']['lastName']):
        if lname.strip():
            l = lname
            details['lname'] = lname
    details['name'] = "%s %s %s" % (t, f, l)
    details['name'] = details['name'].strip()
    if not details['name']:
        details['name'] = userid
    for email in graph.objects(None, namespaces['foaf']['mbox']):
        details['email'] = email
    for dept in graph.objects(None, namespaces['dcterms']['isPartOf']):
        details['dept'].append(dept)
    for uri in graph.subjects(namespaces['foaf']['account'], None):
        details['uri'] = uri
    return details
def generate(cls, n):
    graph = ConjunctiveGraph()
    load_rdf_file(STORE['utensils'], graph)
    all_uris = set(graph.subjects())
    n = min(n, len(all_uris))
    selected_uris = sample(all_uris, n)
    # Fetch the desired utensils from the graph
    selected_triples = chain(*map(graph.triples,
                                  ((uri, None, None) for uri in selected_uris)))
    # NB: relies on Python 2's eager map() for the side effect of adding triples
    map(rdfSubject.db.add, selected_triples)
    utensils = [Utensil(uri) for uri in selected_uris]
    # Fetch the actions of these utensils
    ActionGenerator.generate(utensils)
    return utensils
def verify_rdf_contents(contents, fmt):
    g = ConjunctiveGraph()
    g.parse(data=contents, format=fmt)

    books = Namespace('http://www.books.org/')
    isbn = Namespace("http://www.books.org/isbn/")

    # Check the total number of triples
    assert sum(1 for _ in g.triples((None, None, None))) == \
        NUM_SUBJECTS * NUM_TRIPLES_PER_SUBJ

    # Check the number of subjects
    subjs = set(g.subjects())
    expected_subjs = ["0062316095", "0374532508", "1610391845", "0374275637"]
    assert len(subjs) == len(expected_subjs)
    for s in expected_subjs:
        assert isbn[s] in subjs

        # Verify the ISBN number is a positiveInteger
        s_isbn = list(g.triples((isbn[s], books['isbnnumber'], None)))
        assert len(s_isbn) == 1
        s_isbn_val = s_isbn[0][2]
        assert isinstance(s_isbn_val, Literal)
        assert s_isbn_val.datatype == XSD.positiveInteger
        # Verify pages is an unsignedShort
        s_page = list(g.triples((isbn[s], books['pagecount'], None)))
        assert len(s_page) == 1
        s_page_val = s_page[0][2]
        assert isinstance(s_page_val, Literal)
        assert s_page_val.datatype == XSD.unsignedShort
        # Verify hardcover is a boolean
        s_hardcover = list(g.triples((isbn[s], books['hardcover'], None)))
        assert len(s_hardcover) == 1
        s_hardcover_val = s_hardcover[0][2]
        assert isinstance(s_hardcover_val, Literal)
        assert s_hardcover_val.datatype == XSD.boolean
        # Verify price is a decimal
        s_price = list(g.triples((isbn[s], books['price'], None)))
        assert len(s_price) == 1
        s_price_val = s_price[0][2]
        assert isinstance(s_price_val, Literal)
        assert s_price_val.datatype == XSD.decimal
def verify_rdf(rdf_output):
    ids_ns = Namespace("http://foo.example.org/CSV/People-IDs/")
    ages_ns = Namespace("http://foo.example.org/CSV/People-Ages/")
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")

    all_subjects = {x for x in g.subjects()}
    assert len(all_subjects) == 2

    bob_subj = ids_ns['1']
    joe_subj = ids_ns['2']
    assert bob_subj in all_subjects
    assert joe_subj in all_subjects

    # Bob's details (list(...) so the triple generators are actually consumed;
    # the original [g.triples(...)] always had length 1)
    assert len(list(g.triples((bob_subj, ids_ns.id, Literal(1))))) == 1
    assert len(list(g.triples((bob_subj, ids_ns.name, Literal("Bob"))))) == 1
    assert len(list(g.triples((bob_subj, ages_ns.age, Literal(34))))) == 1
    # Joe's details
    assert len(list(g.triples((joe_subj, ids_ns.id, Literal(2))))) == 1
    assert len(list(g.triples((joe_subj, ids_ns.name, Literal("Joe"))))) == 1
    assert len(list(g.triples((joe_subj, ages_ns.age, Literal(54))))) == 1
    raise

# Test 6: ontology is internally consistent with respect to domains, ranges, etc.

# Step 1: find all the classes.
rdftype = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
rdfsdomain = URIRef("http://www.w3.org/2000/01/rdf-schema#domain")
rdfsrange = URIRef("http://www.w3.org/2000/01/rdf-schema#range")
rdfsresource = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Resource")
rdfssco = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
asColl = URIRef("http://www.w3.org/ns/activitystreams#OrderedCollection")
skosConcept = URIRef("http://www.w3.org/2004/02/skos/core#Concept")
otherClasses = [asColl, skosConcept]
classes = list(g.subjects(rdftype, URIRef("http://www.w3.org/2000/01/rdf-schema#Class")))
props = list(g.subjects(rdftype, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Property")))

for p in props:
    domains = list(g.objects(p, rdfsdomain))
    for d in domains:
        assert d in classes

for p in props:
    ranges = list(g.objects(p, rdfsrange))
    for r in ranges:
        if r not in classes and \
                not str(r).startswith("http://www.w3.org/2001/XMLSchema#") and \
                r != rdfsresource:
            print "Found inconsistent property: %s has unknown range" % p

for c in classes:
class KB4ITGraph:
    """
    This class creates a RDF graph based on attributes for each doc.
    It also has convenient functions to query the graph.
    """

    def __init__(self, path=None):
        """
        If no path is passed it builds a graph in memory. Otherwise,
        it creates a persistent graph on disk.
        """
        if path is not None:
            # Create persistent Graph on disk
            self.path = path
            self.graph = ConjunctiveGraph('Sleepycat', URIRef("kb4it://"))
            graph_path = path + SEP + 'kb4it.graph'
            self.graph.store.open(graph_path)
        else:
            # Create Graph in memory
            self.graph = ConjunctiveGraph('IOMemory')

        # Assign namespaces to the Namespace Manager of this graph
        namespace_manager = NamespaceManager(ConjunctiveGraph())
        for ns in NSBINDINGS:
            namespace_manager.bind(ns, NSBINDINGS[ns])
        self.graph.namespace_manager = namespace_manager

    def __uniq_sort(self, result):
        alist = list(result)
        aset = set(alist)
        alist = list(aset)
        alist.sort()
        return alist

    def subjects(self, predicate, object):
        """
        Returns a sorted list of unique subjects given a predicate
        and an object.
        """
        return self.__uniq_sort(self.graph.subjects(predicate, object))

    def predicates(self, subject=None, object=None):
        """
        Returns a sorted list of unique predicates given a subject
        and an object.
        """
        return self.__uniq_sort(self.graph.predicates(subject, object))

    def objects(self, subject, predicate):
        """
        Returns a sorted list of unique objects given a subject
        and a predicate.
        """
        return self.__uniq_sort(self.graph.objects(subject, predicate))

    def value(self, subject=None, predicate=None, object=None, default=None, any=True):
        """
        Returns a value given the subject and the predicate.
        """
        return self.graph.value(subject, predicate, object, default, any)

    def add_document(self, doc):
        """
        Add a new document to the graph.
        """
        subject = URIRef(doc)
        predicate = RDF['type']
        object = URIRef(KB4IT['Document'])
        self.graph.add([subject, predicate, object])

    def add_document_attribute(self, doc, attribute, value):
        """
        Add a new attribute to a document.
        """
        predicate = 'has%s' % attribute
        subject = URIRef(doc)
        predicate = KB4IT[predicate]
        object = Literal(value)
        self.graph.add([subject, predicate, object])

    def get_attributes(self):
        """
        Get all predicates except RDF.type and Title.
        """
        blacklist = set()
        blacklist.add(RDF['type'])
        blacklist.add(KB4IT['hasTitle'])
        alist = list(self.graph.predicates(None, None))
        aset = set(alist) - blacklist
        alist = list(aset)
        alist.sort()
        return alist

    def serialize(self):
        """
        Serialize graph to pretty xml format.
        """
        return self.graph.serialize(format='pretty-xml')

    def close(self):
        """
        Close the graph if it is persistent.
        FIXME: check if it is open.
        """
        self.graph.store.close()
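A hypothetical usage sketch for KB4ITGraph; the document name and attribute values are invented, and KB4IT/NSBINDINGS come from the surrounding module:

kb = KB4ITGraph()  # in-memory IOMemory store
kb.add_document('example.adoc')
kb.add_document_attribute('example.adoc', 'Title', 'An example document')
kb.add_document_attribute('example.adoc', 'Category', 'HowTo')
# get_attributes() hides rdf:type and hasTitle, so only hasCategory remains
for pred in kb.get_attributes():
    print(pred)
print(kb.serialize())  # pretty-xml serialization of the whole graph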
def create_vocab_statusfile(userid, vocabprefix, vocabfile, baseuri,
                            update=False, using_uuid=False, refvocab=False):
    vocab_uri = URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)
    vocabdir = os.path.join(ag.vocabulariesdir, str(vocabprefix))
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    vocab_file_name = os.path.basename(vocabfile)
    vocabfile_uri = URIRef("http://vocab.ox.ac.uk/%s/%s" % (vocabprefix, vocab_file_name))

    # Add vocab in mediator file
    graph = Graph()
    mediatorfile = os.path.join(ag.mediatorsdir, '%s.rdf' % userid)
    graph.parse(mediatorfile)
    user_uri = []
    for uri in graph.subjects(namespaces['foaf']['account'], Literal(userid)):
        if not uri in user_uri:
            user_uri.append(uri)
    user_uri = URIRef(user_uri[0])
    graph.add((vocab_uri, namespaces['dcterms']['mediator'], URIRef(user_uri)))
    rdf_str = None
    rdf_str = graph.serialize()
    f = codecs.open(mediatorfile, 'w', 'utf-8')
    f.write(rdf_str)
    f.close()

    # Add vocab in vocab status file
    graph = Graph()
    if update and os.path.isfile(vocabstatusfile):
        graph.parse(vocabstatusfile)
    for prefix, url in namespaces.iteritems():
        graph.bind(prefix, URIRef(url))
    graph.add((vocab_uri, namespaces['dcterms']['mediator'], URIRef(user_uri)))
    graph.add((user_uri, namespaces['foaf']['account'], Literal(userid)))
    graph.add((vocab_uri, namespaces['dcterms']['hasFormat'], URIRef(vocabfile_uri)))
    graph.add((vocab_uri, namespaces['vann']['preferredNamespaceUri'], URIRef(baseuri)))
    graph.add((vocab_uri, namespaces['vann']['preferredNamespacePrefix'], Literal(vocabprefix)))
    graph.add((vocab_uri, namespaces['skos']['editorialNote'], Literal(vocab_editorial_descriptions[0])))
    if refvocab:
        add_ref_vocab(vocabprefix, refvocab)
        graph.add((vocab_uri, namespaces['dcterms']['isVersionOf'], URIRef(refvocab)))

    # Get mimetype of file
    if os.path.isfile(vocabfile):
        graph.add((vocabfile_uri, namespaces['nfo']['fileUrl'], Literal('file://%s' % vocabfile)))
        graph.add((vocabfile_uri, namespaces['nfo']['fileName'], Literal(vocab_file_name)))
        mt = None
        if check_rdf(vocabfile):
            mt = 'application/rdf+xml'
            graph.add((vocabfile_uri, namespaces['dcterms']['conformsTo'], Literal(mt)))
            graph.add((vocabfile_uri, namespaces['skos']['editorialNote'], Literal(vocab_editorial_descriptions[3])))
        elif check_n3(vocabfile):
            mt = 'text/rdf+nt'
            root, ext = os.path.splitext(vocabfile)
            if ext == '.rdf':
                rdffile = "%s_2.rdf" % root
            else:
                rdffile = "%s.rdf" % root
            converttordf = convert_n3_rdf(vocabfile, rdffile)
            if converttordf and os.path.isfile(rdffile):
                rdf_file_name = os.path.basename(rdffile)
                rdffile_uri = URIRef("http://vocab.ox.ac.uk/%s/%s" % (vocabprefix, rdf_file_name))
                graph.add((vocab_uri, namespaces['dcterms']['hasFormat'], URIRef(rdffile_uri)))
                graph.add((rdffile_uri, namespaces['nfo']['fileUrl'], Literal('file://%s' % rdffile)))
                graph.add((rdffile_uri, namespaces['nfo']['fileName'], Literal(rdf_file_name)))
                graph.add((rdffile_uri, namespaces['dcterms']['conformsTo'], Literal('application/rdf+xml')))
                graph.add((rdffile_uri, namespaces['skos']['editorialNote'], Literal(vocab_editorial_descriptions[3])))
                graph.add((rdffile_uri, namespaces['dcterms']['format'], Literal('application/rdf+xml')))
        else:
            mt1 = mimetypes.guess_type(vocabfile)
            mt2 = get_file_mimetype(vocabfile)
            if mt1[0]:
                mt = mt1[0]
            else:
                mt = mt2
            if str(mt) == 'application/rdf+xml':
                graph.add((vocabfile_uri, namespaces['skos']['editorialNote'], Literal(vocab_editorial_descriptions[2])))
            else:
                graph.add((vocab_uri, namespaces['skos']['editorialNote'], Literal(vocab_editorial_descriptions[1])))
        if mt:
            graph.add((vocabfile_uri, namespaces['dcterms']['format'], Literal(mt)))

    rdf_str = None
    rdf_str = graph.serialize()
    f = codecs.open(vocabstatusfile, 'w', 'utf-8')
    f.write(rdf_str)
    f.close()
    return True
def add_property_axioms(graph, properties):
    ontology_graph = ConjunctiveGraph()
    GH = 'https://raw.githubusercontent.com'
    OBO = 'http://purl.obolibrary.org/obo'
    ontologies = [
        OBO + '/sepio.owl',
        OBO + '/geno.owl',
        OBO + '/iao.owl',
        OBO + '/ero.owl',
        OBO + '/pco.owl',
        OBO + '/xco.owl',
        OBO + '/ro.owl',
        GH + '/jamesmalone/OBAN/master/ontology/oban_core.ttl',
    ]

    # Random timeouts can waste hours (too many redirects?).
    # There is a timeout param in urllib.request,
    # but it is not exposed by rdflib.parsing,
    # so retry once on URLError.
    for ontology in ontologies:
        LOG.info("parsing: " + ontology)
        try:
            ontology_graph.parse(ontology, format=rdflib_util.guess_format(ontology))
        except SAXParseException as e:
            LOG.error(e)
            LOG.error('Retrying as turtle: ' + ontology)
            ontology_graph.parse(ontology, format="turtle")
        except OSError as e:  # URLError:
            # simple retry
            LOG.error(e)
            LOG.error('Retrying: ' + ontology)
            ontology_graph.parse(ontology, format=rdflib_util.guess_format(ontology))

    # Get object properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['ObjectProperty']),
        graph, OWL['ObjectProperty'], properties)

    # Get annotation properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['AnnotationProperty']),
        graph, OWL['AnnotationProperty'], properties)

    # Get data properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['DatatypeProperty']),
        graph, OWL['DatatypeProperty'], properties)

    for row in graph.predicates(DCTERMS['source'], OWL['AnnotationProperty']):
        if row == RDF['type']:
            graph.remove((DCTERMS['source'], RDF['type'], OWL['AnnotationProperty']))
            graph.add((DCTERMS['source'], RDF['type'], OWL['ObjectProperty']))

    # Hardcoded properties
    graph.add((URIRef('https://monarchinitiative.org/MONARCH_cliqueLeader'),
               RDF['type'], OWL['AnnotationProperty']))
    graph.add((URIRef('https://monarchinitiative.org/MONARCH_anonymous'),
               RDF['type'], OWL['AnnotationProperty']))

    return graph
            resource_to_remove.add(URIRef(item))

    while len(resource_to_remove):
        res = resource_to_remove.pop()
        for s, p, o in g.triples((res, None, None)):
            g.remove((s, p, o))
            if type(o) is URIRef and "/br/" not in str(o):
                resource_to_remove.add(o)

full_info_dir = info_dir + args.prefix + sep

print("Generate data compliant with the OCDM.")
gs = GraphSet(base_iri, context_path)
entity_count = 1000
counter = 0
for s in g.subjects():
    if counter == entity_count:
        store_all(gs)
        counter = 0
        gs = GraphSet(base_iri, context_path)
    with open(args.done, "a") as f:
        s_string = str(s)
        if s_string not in done:
            entity = None
            if "/ar/" in s_string:
                entity = gs.add_ar(agent_name, source_agent=args.source_agent,
                                   source=args.source, res=s)
            elif "/be/" in s_string:
class BerkeleyDBTestCase(unittest.TestCase):
    def setUp(self):
        if not has_bsddb:
            self.skipTest("skipping as berkleydb is missing")
        self.store_name = "BerkeleyDB"
        self.path = mktemp()
        self.g = ConjunctiveGraph(store=self.store_name)
        self.rt = self.g.open(self.path, create=True)
        assert self.rt == VALID_STORE, "The underlying store is corrupt"
        assert (
            len(self.g) == 0
        ), "There must be zero triples in the graph just after store (file) creation"
        data = """
                PREFIX : <https://example.org/>
                :a :b :c .
                :d :e :f .
                :d :g :h .
                """
        self.g.parse(data=data, format="ttl")

    def tearDown(self):
        self.g.close()

    def test_write(self):
        assert (
            len(self.g) == 3
        ), "There must be three triples in the graph after the first data chunk parse"
        data2 = """
                PREFIX : <https://example.org/>
                :d :i :j .
                """
        self.g.parse(data=data2, format="ttl")
        assert (
            len(self.g) == 4
        ), "There must be four triples in the graph after the second data chunk parse"
        data3 = """
                PREFIX : <https://example.org/>
                :d :i :j .
                """
        self.g.parse(data=data3, format="ttl")
        assert (
            len(self.g) == 4
        ), "There must still be four triples in the graph after the third data chunk parse"

    def test_read(self):
        sx = None
        for s in self.g.subjects(
            predicate=URIRef("https://example.org/e"),
            object=URIRef("https://example.org/f"),
        ):
            sx = s
        assert sx == URIRef("https://example.org/d")

    def test_sparql_query(self):
        q = """
            PREFIX : <https://example.org/>
            SELECT (COUNT(*) AS ?c)
            WHERE { :d ?p ?o . }"""
        c = 0
        for row in self.g.query(q):
            c = int(row.c)
        assert c == 2, "SPARQL COUNT must return 2"

    def test_sparql_insert(self):
        q = """
            PREFIX : <https://example.org/>
            INSERT DATA { :x :y :z . }"""
        self.g.update(q)
        assert len(self.g) == 4, "After extra triple insert, length must be 4"

    def test_multigraph(self):
        q = """
            PREFIX : <https://example.org/>
            INSERT DATA {
                GRAPH :m { :x :y :z . }
                GRAPH :n { :x :y :z . }
            }"""
        self.g.update(q)
        q = """
            SELECT (COUNT(?g) AS ?c)
            WHERE {
                SELECT DISTINCT ?g
                WHERE { GRAPH ?g { ?s ?p ?o } }
            }
            """
        c = 0
        for row in self.g.query(q):
            c = int(row.c)
        assert c == 3, "SPARQL COUNT must return 3 (default, :m & :n)"

    def test_open_shut(self):
        assert len(self.g) == 3, "Initially we must have 3 triples from setUp"
        self.g.close()
        self.g = None

        # reopen the graph
        self.g = ConjunctiveGraph("BerkeleyDB")
        self.g.open(self.path, create=False)
        assert (
            len(self.g) == 3
        ), "After close and reopen, we should still have the 3 originally added triples"
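Outside the test harness, the open/parse/close cycle this test exercises can be sketched standalone; this assumes rdflib with the BerkeleyDB store plugin available, and the path is illustrative:

from rdflib import ConjunctiveGraph
from rdflib.store import VALID_STORE

g = ConjunctiveGraph(store="BerkeleyDB")
rt = g.open("/tmp/rdflib_bdb_demo", create=True)
assert rt == VALID_STORE
g.parse(data="PREFIX : <https://example.org/> :a :b :c .", format="ttl")
g.close()  # flushes to disk; reopen later with create=False to get the data back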
    (filmNS['directedBy'], rdfsRange, filmNS['Director']),
]

graph = ConjunctiveGraph()
for triple in schemaTriples:
    graph.add(triple)


def isSubClassOf(subClass, superClass, graph):
    if subClass == superClass:
        return True
    for parentClass in graph.objects(subClass, rdfsSubClassOf):
        if isSubClassOf(parentClass, superClass, graph):
            return True
    return False


pprint.pprint(list(graph.subjects(rdfType, owlClass)))
pprint.pprint(list(graph.subjects(rdfType, owlObjectProperty)))
pprint.pprint(list(graph.subjects(rdfType, owlDatatypeProperty)))
print(isSubClassOf(filmNS['Actor'], filmNS['Person'], graph))
print(isSubClassOf(filmNS['Film'], filmNS['Person'], graph))

owl_filename = "film_ontology.owl"
with open(owl_filename, "w") as owl_file:
    xml_string = str(graph.serialize(format="xml"), "utf-8")
    owl_file.write(xml_string)

# Define a blank node for the performance
performance = BNode('_:perf1')

filmTriples = [
global_modules_dict = dict()
global_fes_dict = dict()
entities_dict = dict()
remove_original = False
clone_g = ConjunctiveGraph()

# TODO: remove NamedIndividual for all entities
for s, p, o in original_g.triples((None, RDF.type, OWL.NamedIndividual)):
    original_g.remove((s, p, o))

for i, clone in enumerate(clones):
    if i == len(clones) - 1:
        remove_original = True
    # copy all device entities
    for dev in original_g.subjects(RDF.type, device):
        # their associated triples
        for s, p, o in original_g.triples((dev, None, None)):
            new_s = clone + '-' + unicode(s).split('#')[1]
            new_s = amberg_ns[new_s]
            if p in (RDF.type, amberg_ns['hasSkill']):
                if remove_original:
                    original_g.remove((s, p, o))
                clone_g.add((new_s, p, o))
            elif p in (hasPart, amberg_ns['connectsTo']):
                new_o = clone + '-' + unicode(o).split('#')[1]
                new_o = amberg_ns[new_o]
                if remove_original:
                    original_g.remove((s, p, o))
                clone_g.add((new_s, p, new_o))
def __load_citations_from_rdf_file(data_f_path, prov_f_path, service_name,
                                   id_type, id_shape, citation_type):
    citation_data = Graph()
    citation_data.load(data_f_path, format="nt11")

    citation_prov = ConjunctiveGraph()
    citation_prov.load(prov_f_path, format="nquads")

    for cit_ent in citation_data.subjects(RDF.type, Citation.citation):
        prov_entity = None
        snapshot = 0
        # Keep the provenance entity with the highest snapshot number
        for entity in citation_prov.subjects(Citation.specialization_of, cit_ent):
            entity_snapshot = int(sub("^.+/se/(.+)$", "\\1", entity))
            if prov_entity is None or snapshot < entity_snapshot:
                prov_entity = entity
                snapshot = entity_snapshot

        invalidated = None
        update = None
        creation_date = None
        timespan = None
        for en in citation_prov.objects(prov_entity, Citation.invalidated_at_time):
            invalidated = str(en)
        for en in citation_prov.objects(prov_entity, Citation.has_update_query):
            update = str(en)
        for en in citation_data.objects(cit_ent, Citation.has_citation_creation_date):
            creation_date = str(en)
        for en in citation_data.objects(cit_ent, Citation.has_citation_time_span):
            timespan = str(en)

        c = Citation(
            sub("^.+/ci/(.+)$", "\\1", str(cit_ent)),
            str(list(citation_data.objects(cit_ent, Citation.has_citing_entity))[0]),
            None,
            str(list(citation_data.objects(cit_ent, Citation.has_cited_entity))[0]),
            None,
            creation_date,
            timespan,
            # the selected snapshot, not the last one iterated over
            snapshot,
            str(list(citation_prov.objects(prov_entity, Citation.was_attributed_to))[0]),
            str(list(citation_prov.objects(prov_entity, Citation.had_primary_source))[0]),
            str(list(citation_prov.objects(prov_entity, Citation.generated_at_time))[0]),
            service_name, id_type, id_shape, citation_type,
            Citation.journal_self_citation in citation_data.objects(cit_ent, RDF.type),
            Citation.author_self_citation in citation_data.objects(cit_ent, RDF.type),
            invalidated,
            str(list(citation_prov.objects(prov_entity, Citation.description))[0]),
            update)
        yield c
primer.add((myNS.pat, myNS.knows, myNS.jo))
# or:
primer.add((myNS['pat'], myNS['age'], Literal(24)))

# Now, with just that, let's see how the system
# recorded *way* too many details about what
# you just asserted as fact.
from pprint import pprint
pprint(list(primer))

# just think .whatever((s, p, o))
# here we report on what we know
pprint(list(primer.subjects()))
pprint(list(primer.predicates()))
pprint(list(primer.objects()))

# and other things that make sense

# what do we know about pat?
pprint(list(primer.predicate_objects(myNS.pat)))

# who is what age?
pprint(list(primer.subject_objects(myNS.age)))

# Okay, so let's now work with a bigger
# dataset from the example, and start
# with a fresh new graph.
def add_property_axioms(graph, properties):
    ontology_graph = ConjunctiveGraph()
    GH = 'https://raw.githubusercontent.com'
    MI = '/monarch-initiative'
    ontologies = [
        GH + MI + '/SEPIO-ontology/master/src/ontology/sepio.owl',
        GH + MI + '/GENO-ontology/develop/src/ontology/geno.owl',
        GH + '/oborel/obo-relations/master/ro.owl',
        'http://purl.obolibrary.org/obo/iao.owl',
        'http://purl.obolibrary.org/obo/ero.owl',
        GH + '/jamesmalone/OBAN/master/ontology/oban_core.ttl',
        'http://purl.obolibrary.org/obo/pco.owl',
        'http://purl.obolibrary.org/obo/xco.owl'
    ]

    # Random timeouts can waste hours (too many redirects?).
    # There is a timeout param in urllib.request,
    # but it is not exposed by rdflib.parsing,
    # so retry once on URLError.
    for ontology in ontologies:
        logger.info("parsing: " + ontology)
        try:
            ontology_graph.parse(
                ontology, format=rdflib_util.guess_format(ontology))
        except SAXParseException as e:
            logger.error(e)
            logger.error('Retrying as turtle: ' + ontology)
            ontology_graph.parse(ontology, format="turtle")
        except OSError as e:  # URLError:
            # simple retry
            logger.error(e)
            logger.error('Retrying: ' + ontology)
            ontology_graph.parse(
                ontology, format=rdflib_util.guess_format(ontology))

    # Get object properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['ObjectProperty']),
        graph, OWL['ObjectProperty'], properties)

    # Get annotation properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['AnnotationProperty']),
        graph, OWL['AnnotationProperty'], properties)

    # Get data properties
    graph = GraphUtils.add_property_to_graph(
        ontology_graph.subjects(RDF['type'], OWL['DatatypeProperty']),
        graph, OWL['DatatypeProperty'], properties)

    for row in graph.predicates(DC['source'], OWL['AnnotationProperty']):
        if row == RDF['type']:
            graph.remove(
                (DC['source'], RDF['type'], OWL['AnnotationProperty']))
            graph.add((DC['source'], RDF['type'], OWL['ObjectProperty']))

    # Hardcoded properties
    graph.add((
        URIRef('https://monarchinitiative.org/MONARCH_cliqueLeader'),
        RDF['type'], OWL['AnnotationProperty']))
    graph.add((URIRef('https://monarchinitiative.org/MONARCH_anonymous'),
               RDF['type'], OWL['AnnotationProperty']))

    return graph
def test_get_history(self):
    with open(filepath('test-patch-adds-items.json')) as f:
        patch = f.read()
    with self.client as client:
        res1 = client.patch(
            '/d/',
            data=patch,
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
        patch_url = urlparse(res1.headers['Location']).path
        client.post(
            patch_url + 'merge',
            headers={'Authorization': 'Bearer '
                     + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
        res2 = client.get('/h')
        self.assertEqual(res2.status_code, http.client.OK)
        self.assertEqual(res2.headers['Content-Type'], 'application/ld+json')
        jsonld = res2.get_data(as_text=True)
        g = ConjunctiveGraph()
        g.parse(format='json-ld', data=jsonld)

        # Initial data load
        self.assertIn(  # None means any
            (PERIODO['p0h#change-1'], PROV.endedAtTime, None), g)
        self.assertIn(
            (PERIODO['p0h#change-1'], PROV.used, PERIODO['p0d?version=0']), g)
        self.assertIn(
            (PERIODO['p0d?version=0'], PROV.specializationOf, PERIODO['p0d']), g)
        self.assertIn(
            (PERIODO['p0h#change-1'], PROV.used, PERIODO['p0h#patch-1']), g)
        self.assertIn(
            (PERIODO['p0h#patch-1'], FOAF.page,
             PERIODO['p0patches/1/patch.jsonpatch']), g)
        self.assertIn(
            (PERIODO['p0h#change-1'], PROV.generated, PERIODO['p0d?version=1']), g)
        self.assertIn(
            (PERIODO['p0d?version=1'], PROV.specializationOf, PERIODO['p0d']), g)
        self.assertIn(
            (PERIODO['p0h#change-1'], PROV.generated,
             PERIODO['p0trgkv?version=1']), g)
        self.assertIn(
            (PERIODO['p0trgkv?version=1'], PROV.specializationOf,
             PERIODO['p0trgkv']), g)
        self.assertIn(
            (PERIODO['p0h#change-1'], PROV.generated,
             PERIODO['p0trgkvwbjd?version=1']), g)
        self.assertIn(
            (PERIODO['p0trgkvwbjd?version=1'], PROV.specializationOf,
             PERIODO['p0trgkvwbjd']), g)

        # Change from first submitted patch
        self.assertIn(  # None means any
            (PERIODO['p0h#change-2'], PROV.startedAtTime, None), g)
        self.assertIn(  # None means any
            (PERIODO['p0h#change-2'], PROV.endedAtTime, None), g)
        start = g.value(
            subject=PERIODO['p0h#change-2'], predicate=PROV.startedAtTime)
        self.assertEqual(start.datatype, XSD.dateTime)
        self.assertRegex(start.value.isoformat(), W3CDTF)
        end = g.value(
            subject=PERIODO['p0h#change-2'], predicate=PROV.endedAtTime)
        self.assertEqual(end.datatype, XSD.dateTime)
        self.assertRegex(end.value.isoformat(), W3CDTF)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.wasAssociatedWith,
             URIRef('http://orcid.org/1234-5678-9101-112X')), g)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.wasAssociatedWith,
             URIRef('http://orcid.org/1211-1098-7654-321X')), g)
        for association in g.subjects(
                predicate=PROV.agent,
                object=URIRef('http://orcid.org/1234-5678-9101-112X')):
            role = g.value(subject=association, predicate=PROV.hadRole)
            self.assertIn(role, (PERIODO['p0v#submitted'],
                                 PERIODO['p0v#updated']))
        merger = g.value(
            predicate=PROV.agent,
            object=URIRef('http://orcid.org/1211-1098-7654-321X'))
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.qualifiedAssociation, merger), g)
        self.assertIn(
            (merger, PROV.hadRole, PERIODO['p0v#merged']), g)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.used, PERIODO['p0d?version=1']), g)
        self.assertIn(
            (PERIODO['p0d?version=1'], PROV.specializationOf, PERIODO['p0d']), g)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.used, PERIODO['p0h#patch-2']), g)
        self.assertIn(
            (PERIODO['p0h#patch-2'], FOAF.page,
             PERIODO['p0patches/2/patch.jsonpatch']), g)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.generated, PERIODO['p0d?version=2']), g)
        self.assertIn(
            (PERIODO['p0d?version=2'], PROV.specializationOf, PERIODO['p0d']), g)
        self.assertIn(
            (PERIODO['p0h#change-2'], PROV.generated,
             PERIODO['p0trgkv?version=2']), g)
        self.assertIn(
            (PERIODO['p0trgkv?version=2'], PROV.specializationOf,
             PERIODO['p0trgkv']), g)
        self.assertIn(
            (PERIODO['p0trgkv?version=2'], PROV.wasRevisionOf,
             PERIODO['p0trgkv?version=1']), g)
        entities = 0
        for _, _, version in g.triples(
                (PERIODO['p0h#change-2'], PROV.generated, None)):
            entity = g.value(subject=version, predicate=PROV.specializationOf)
            self.assertEqual(str(entity) + '?version=2', str(version))
            entities += 1
        self.assertEqual(entities, 5)
def create_graph(filelist, output_train, output_test, pos_graphs, cv, predicate, ob):
    global relation_counter
    relation_counter = 1000000
    global entity_counter
    global local_entity_counter
    global local_entity_map
    global id_to_uri
    id_to_uri = dict()
    entity_counter = 0
    entity_map = dict()
    relation_map = dict()
    graph_labels_train = []
    graph_labels_test = []
    filelist = np.array(filelist)
    i_fold = 0
    for train_index, test_index in cross_validation.KFold(len(filelist), n_folds=cv):
        train = True
        test = True
        filelist_train = filelist[train_index]
        filelist_test = filelist[test_index]
        output_train_tmp = output_train + str(i_fold) + ".txt"
        output_test_tmp = output_test + str(i_fold) + ".txt"
        # delete train and test output files
        try:
            os.remove(output_train_tmp)
        except OSError:
            pass
        try:
            os.remove(output_test_tmp)
        except OSError:
            pass
        # First round train, then test
        while train or test:
            graph_labels_tmp = []
            filelist_tmp = None
            graph_labels_list_tmp = None
            if train:
                filelist_tmp = filelist_train
                output_tmp = output_train_tmp
                train = False
                graph_labels_list_tmp = graph_labels_train
            else:
                filelist_tmp = filelist_test
                output_tmp = output_test_tmp
                test = False
                graph_labels_list_tmp = graph_labels_test
            for f in filelist_tmp:
                num = int(f.split("_")[1])
                labels = pos_graphs[num]
                graph_labels_tmp.append(labels)
                g = ConjunctiveGraph()
                g.load(open(f, "rb"))
                operations = list(g.subjects(predicate, ob))
                with open(output_tmp, "a") as tf:
                    o = operations[0]
                    entity_set = set()
                    edge_set = []
                    # (the original reset these twice, once with a list;
                    # the dict initialization is the one actually used)
                    local_entity_counter = 0
                    local_entity_map = dict()
                    dfs_triples(entity_set, entity_map, edge_set, relation_map, g, o)
                    # id = list(g.objects(o, ID))[0]
                    tf.write("t")
                    tf.write("\n")
                    for (local_id, global_id) in sorted(entity_set, key=lambda x: x[0]):
                        tf.write("v" + " " + str(local_id) + " " + str(global_id))
                        tf.write("\n")
                    for (s, p, o) in edge_set:
                        tf.write("e" + " " + str(s) + " " + str(o) + " " + str(p))
                        tf.write("\n")
            graph_labels_list_tmp.append(graph_labels_tmp)
        i_fold += 1
    return id_to_uri, graph_labels_train, graph_labels_test
def test_get_history(self):
    with open(filepath('test-patch-adds-items.json')) as f:
        patch = f.read()
    with self.client as client:
        res1 = client.patch(
            '/d/',
            data=patch,
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
        patch_url = urlparse(res1.headers['Location']).path
        client.post(
            patch_url + 'messages',
            data='{"message": "Here is my patch"}',
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
        client.post(
            patch_url + 'messages',
            data='{"message": "Looks good to me"}',
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
        client.post(
            patch_url + 'merge',
            buffered=True,
            headers={'Authorization': 'Bearer '
                     + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
        res3 = client.get('/h', headers={'Accept': 'application/ld+json'})
        self.assertEqual(res3.status_code, http.client.SEE_OTHER)
        self.assertEqual(urlparse(res3.headers['Location']).path, '/h.jsonld')
        res4 = client.get('/history.jsonld?inline-context')
        self.assertEqual(res4.status_code, http.client.OK)
        self.assertEqual(res4.headers['Content-Type'], 'application/ld+json')
        jsonld = res4.get_data(as_text=True)
        g = ConjunctiveGraph()
        g.parse(format='json-ld', data=jsonld)

        # Initial data load
        self.assertIn(  # None means any
            (HOST['h#change-1'], PROV.endedAtTime, None), g)
        self.assertIn(
            (HOST['h#change-1'], PROV.used, HOST['d?version=0']), g)
        self.assertIn(
            (HOST['d?version=0'], PROV.specializationOf, HOST['d']), g)
        self.assertIn(
            (HOST['h#change-1'], RDFS.seeAlso, HOST['h#patch-request-1']), g)
        self.assertIn(
            (HOST['h#patch-request-1'], FOAF.page, HOST['patches/1/']), g)
        self.assertNotIn(
            (HOST['h#patch-request-1'], AS.replies,
             HOST['h#patch-request-1-comments']), g)
        self.assertIn(
            (HOST['h#change-1'], PROV.used, HOST['h#patch-1']), g)
        self.assertIn(
            (HOST['h#patch-1'], FOAF.page, HOST['patches/1/patch.jsonpatch']), g)
        self.assertIn(
            (HOST['h#change-1'], PROV.generated, HOST['d?version=1']), g)
        self.assertIn(
            (HOST['d?version=1'], PROV.specializationOf, HOST['d']), g)

        # Change from first submitted patch
        self.assertIn(  # None means any
            (HOST['h#change-2'], PROV.startedAtTime, None), g)
        self.assertIn(  # None means any
            (HOST['h#change-2'], PROV.endedAtTime, None), g)
        start = g.value(subject=HOST['h#change-2'], predicate=PROV.startedAtTime)
        self.assertEqual(start.datatype, XSD.dateTime)
        self.assertRegex(start.value.isoformat(), W3CDTF)
        end = g.value(subject=HOST['h#change-2'], predicate=PROV.endedAtTime)
        self.assertEqual(end.datatype, XSD.dateTime)
        self.assertRegex(end.value.isoformat(), W3CDTF)
        self.assertIn(
            (HOST['h#change-2'], PROV.wasAssociatedWith,
             URIRef('https://orcid.org/1234-5678-9101-112X')), g)
        self.assertIn(
            (HOST['h#change-2'], PROV.wasAssociatedWith,
             URIRef('https://orcid.org/1211-1098-7654-321X')), g)
        for association in g.subjects(
                predicate=PROV.agent,
                object=URIRef('https://orcid.org/1234-5678-9101-112X')):
            role = g.value(subject=association, predicate=PROV.hadRole)
            self.assertIn(role, (HOST['v#submitted'], HOST['v#updated']))
        merger = g.value(
            predicate=PROV.agent,
            object=URIRef('https://orcid.org/1211-1098-7654-321X'))
        self.assertIn(
            (HOST['h#change-2'], PROV.qualifiedAssociation, merger), g)
        self.assertIn(
            (merger, PROV.hadRole, HOST['v#merged']), g)
        self.assertIn(
            (HOST['h#change-2'], PROV.used, HOST['d?version=1']), g)
        self.assertIn(
            (HOST['d?version=1'], PROV.specializationOf, HOST['d']), g)
        self.assertIn(
            (HOST['h#change-2'], RDFS.seeAlso, HOST['h#patch-request-2']), g)
        self.assertIn(
            (HOST['h#patch-request-2'], FOAF.page, HOST['patches/2/']), g)
        self.assertIn(
            (HOST['h#patch-request-2'], AS.replies,
             HOST['h#patch-request-2-comments']), g)
        commentCount = g.value(
            subject=HOST['h#patch-request-2-comments'], predicate=AS.totalItems)
        self.assertEqual(commentCount.value, 2)
        self.assertIn(
            (HOST['h#patch-request-2-comments'], AS.first,
             HOST['h#patch-request-2-comment-1']), g)
        self.assertIn(
            (HOST['h#patch-request-2-comments'], AS.last,
             HOST['h#patch-request-2-comment-2']), g)
        self.assertIn(
            (HOST['h#patch-request-2-comments'], AS.items,
             HOST['h#patch-request-2-comment-1']), g)
        self.assertIn(
            (HOST['h#patch-request-2-comments'], AS.items,
             HOST['h#patch-request-2-comment-2']), g)
        self.assertIn(
            (HOST['h#patch-request-2-comment-1'], RDF.type, AS.Note), g)
        self.assertIn(
            (HOST['h#patch-request-2-comment-1'], AS.attributedTo,
             URIRef('https://orcid.org/1234-5678-9101-112X')), g)
        self.assertIn(  # None means any
            (HOST['h#patch-request-2-comment-1'], AS.published, None), g)
        comment1_media_type = g.value(
            subject=HOST['h#patch-request-2-comment-1'], predicate=AS.mediaType)
        self.assertEqual(comment1_media_type.value, 'text/plain')
        comment1_content = g.value(
            subject=HOST['h#patch-request-2-comment-1'], predicate=AS.content)
        self.assertEqual(comment1_content.value, 'Here is my patch')
        self.assertIn(
            (HOST['h#patch-request-2-comment-2'], RDF.type, AS.Note), g)
        self.assertIn(
            (HOST['h#patch-request-2-comment-2'], AS.attributedTo,
             URIRef('https://orcid.org/1211-1098-7654-321X')), g)
        self.assertIn(  # None means any
            (HOST['h#patch-request-2-comment-2'], AS.published, None), g)
        comment2_media_type = g.value(
            subject=HOST['h#patch-request-2-comment-2'], predicate=AS.mediaType)
        self.assertEqual(comment2_media_type.value, 'text/plain')
        comment2_content = g.value(
            subject=HOST['h#patch-request-2-comment-2'], predicate=AS.content)
        self.assertEqual(comment2_content.value, 'Looks good to me')
        self.assertIn(
            (HOST['h#change-2'], PROV.used, HOST['h#patch-2']), g)
        self.assertIn(
            (HOST['h#patch-2'], FOAF.page, HOST['patches/2/patch.jsonpatch']), g)
        self.assertIn(
            (HOST['h#change-2'], PROV.generated, HOST['d?version=2']), g)
        self.assertIn(
            (HOST['d?version=2'], PROV.specializationOf, HOST['d']), g)
import urllib

from rdflib import ConjunctiveGraph as Graph
import sparta

url = 'http://www.gopubmed.org/GoMeshPubMed/gomeshpubmed/Search/RDF?q=18463287&type=RdfExportAll'
gopubmed_handle = urllib.urlopen(url)
graph = Graph()
graph.parse(gopubmed_handle)
gopubmed_handle.close()

graph_subjects = list(set(graph.subjects()))
sparta_factory = sparta.ThingFactory(graph)
for subject in graph_subjects:
    sparta_graph = sparta_factory(subject)
    print subject, [unicode(i) for i in sparta_graph.dc_title][0]
rdfGraph = ConjunctiveGraph()
try:
    rdfGraph.parse("yoga-ontology.rdf", format="xml")
except Exception:
    print("Error parsing yoga-ontology.rdf")

ns = Namespace('http://webprotege.stanford.edu/')
asana = ns.RD1UGDkMwbwNp3Nh9Gy5W3M        # root element of the classification
sukhasana = ns.R8SV4CeNnDntt7K2HTjws64    # element to delete
description = ns.R7zDsGb0eQYf6uJETHG3qBx  # predicate for dataprop (description)
newAsana = ns.newElement1

# The Russian labels mean "positively affects", "negatively affects", "Spine"
positiveAffect = next(rdfGraph.subjects(RDFS.label, Literal("положительно влияет на", "ru")))
negativeAffect = next(rdfGraph.subjects(RDFS.label, Literal("отрицательно влияет на", "ru")))
backbone = next(rdfGraph.subjects(RDFS.label, Literal("Позвоночник", lang="ru")))

print(f"\nLabel of root element is {rdfGraph.label(asana)}")
print("\nFull information of root element:")
for po in rdfGraph.predicate_objects(asana):
    print(po)

print("\nGetting all instances for type of root element")
printElements(rdfGraph)
s = graph.serialize(format='n3')
# print(s)
# print("graph has %s statements." % len(graph))


def isSubClassOf(subClass, superClass, graph):
    if subClass == superClass:
        return True
    for parentClass in graph.objects(subClass, rdfsSubClassOf):
        if isSubClassOf(parentClass, superClass, graph):
            return True
    # return False only after all parent classes have been checked;
    # returning inside the loop would stop at the first non-matching parent
    return False


# print the list of all classes in the OWL file
print(list(graph.subjects(rdfType, owlClass)))
# print the list of all object properties in the OWL file
print(list(graph.subjects(rdfType, owlObjectProperty)))
# print the list of data properties
# list(graph.subjects(rdfType, owlObjectProperty))

# define a blank node
performance = BNode('_:perf1')


# query data in triple form
def TruyVan(query, graph, instances=None):
    if instances is None:
        instances = set()
    for instance in graph.subjects(rdfType, query):
        instances.add(instance)
    for subClass in graph.subjects(rdfsSubClassOf, query):
def get_vocab_base(vocabfile):
    graph = Graph()
    try:
        graph.parse(vocabfile)
    except:
        # fall back to N3 if the default (RDF/XML) parse fails
        graph = Graph()
        try:
            graph.parse(vocabfile, format="n3")
        except:
            return (None, None, None)

    identifier = None
    for v in graph.objects(None, namespaces['dc']['identifier']):
        identifier = v
    if not identifier:
        for v in graph.objects(None, namespaces['dcterms']['identifier']):
            identifier = v

    base = None
    if not base:
        for s in graph.subjects(namespaces['rdf']['type'], namespaces['owl']['Ontology']):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dc']['title'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dcterms']['title'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dc']['creator'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dcterms']['creator'], None):
            base = s
            break
    if not base:
        for v in graph.objects(None, namespaces['vann']['preferredNamespaceUri']):
            base = v
            break
    if not base:
        for v in graph.namespaces():
            if v[0] == '':
                base = v[1]
                break

    prefix = None
    vocab_prefixes = graph.objects(None, namespaces['vann']['preferredNamespacePrefix'])
    for vp in vocab_prefixes:
        prefix = vp
    if not prefix and base:
        for v in graph.namespaces():
            if str(v[1]) == str(base):
                prefix = v[0]
                break
    if not prefix and base:
        prefix = base.strip().strip('/').split('/')[-1].strip('#').strip(' ')
    if base:
        base = base.strip()
        if (base[-1] != "/" and base[-1] != "#"):
            base += "#"
    return (identifier, base, prefix)
def graph_plan(plan, fountain):
    plan_graph = ConjunctiveGraph()
    plan_graph.bind('agora', AGORA)
    prefixes = plan.get('prefixes')
    ef_plan = plan.get('plan')
    tree_lengths = {}
    s_trees = set([])
    patterns = {}

    for (prefix, u) in prefixes.items():
        plan_graph.bind(prefix, u)

    def __get_pattern_node(p):
        if p not in patterns:
            patterns[p] = BNode('tp_{}'.format(len(patterns)))
        return patterns[p]

    def __inc_tree_length(tree, l):
        if tree not in tree_lengths:
            tree_lengths[tree] = 0
        tree_lengths[tree] += l

    def __add_variable(p_node, vid, subject=True):
        sub_node = BNode(str(vid).replace('?', 'var_'))
        if subject:
            plan_graph.add((p_node, AGORA.subject, sub_node))
        else:
            plan_graph.add((p_node, AGORA.object, sub_node))
        plan_graph.set((sub_node, RDF.type, AGORA.Variable))
        plan_graph.set((sub_node, RDFS.label, Literal(str(vid), datatype=XSD.string)))

    def include_path(elm, p_seeds, p_steps):
        elm_uri = __extend_uri(prefixes, elm)
        path_g = plan_graph.get_context(elm_uri)
        b_tree = BNode(elm_uri)
        s_trees.add(b_tree)
        path_g.set((b_tree, RDF.type, AGORA.SearchTree))
        path_g.set((b_tree, AGORA.fromType, elm_uri))
        for seed in p_seeds:
            path_g.add((b_tree, AGORA.hasSeed, URIRef(seed)))
        previous_node = b_tree
        __inc_tree_length(b_tree, len(p_steps))
        for j, step in enumerate(p_steps):
            prop = step.get('property')
            b_node = BNode(previous_node.n3() + prop)
            if j < len(p_steps) - 1 or pattern[1] == RDF.type:
                path_g.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            path_g.add((b_node, AGORA.expectedType, __extend_uri(prefixes, step.get('type'))))
            path_g.add((previous_node, AGORA.next, b_node))
            previous_node = b_node
        p_node = __get_pattern_node(pattern)
        path_g.add((previous_node, AGORA.byPattern, p_node))

    for i, tp_plan in enumerate(ef_plan):
        paths = tp_plan.get('paths')
        pattern = tp_plan.get('pattern')
        hints = tp_plan.get('hints')
        context = BNode('space_{}'.format(tp_plan.get('context')))

        for path in paths:
            steps = path.get('steps')
            seeds = path.get('seeds')
            if not len(steps) and len(seeds):
                include_path(pattern[2], seeds, steps)
            elif len(steps):
                ty = steps[0].get('type')
                include_path(ty, seeds, steps)

        for t in s_trees:
            plan_graph.set((t, AGORA.length,
                            Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))

        pattern_node = __get_pattern_node(pattern)
        plan_graph.add((context, AGORA.definedBy, pattern_node))
        plan_graph.set((context, RDF.type, AGORA.SearchSpace))
        plan_graph.add((pattern_node, RDF.type, AGORA.TriplePattern))
        (sub, pred, obj) = pattern

        if isinstance(sub, BNode):
            __add_variable(pattern_node, str(sub))
        elif isinstance(sub, URIRef):
            plan_graph.add((pattern_node, AGORA.subject, sub))

        if isinstance(obj, BNode):
            __add_variable(pattern_node, str(obj), subject=False)
        elif isinstance(obj, Literal):
            node = BNode(str(obj).replace(' ', ''))
            plan_graph.add((pattern_node, AGORA.object, node))
            plan_graph.set((node, RDF.type, AGORA.Literal))
            plan_graph.set((node, AGORA.value, Literal(str(obj), datatype=XSD.string)))
        else:
            plan_graph.add((pattern_node, AGORA.object, obj))

        plan_graph.add((pattern_node, AGORA.predicate, pred))
        if pred == RDF.type:
            if 'check' in hints:
                plan_graph.add((pattern_node, AGORA.checkType,
                                Literal(hints['check'], datatype=XSD.boolean)))

    sub_expected = plan_graph.subjects(predicate=AGORA.expectedType)
    for s in sub_expected:
        expected_types = list(plan_graph.objects(s, AGORA.expectedType))
        for et in expected_types:
            plan_graph.remove((s, AGORA.expectedType, et))
        q_expected_types = [plan_graph.qname(t) for t in expected_types]
        expected_types = [d for d in expected_types
                          if not set.intersection(
                              set(fountain.get_type(plan_graph.qname(d)).get('super')),
                              set(q_expected_types))]
        for et in expected_types:
            plan_graph.add((s, AGORA.expectedType, et))

    return plan_graph
class PreProcessor(object):
    def __init__(self, kg_path):
        self.kg_path = kg_path
        self.ent_dict = dict()
        self.rel_dict = dict()
        self.g = ConjunctiveGraph()
        self.unique_msgs = self.ent_dict.copy()

    def load_knowledge_graph(self, format='xml', exclude_rels=[], clean_schema=True,
                             amberg_params=None, excluded_entities=None):
        self.g.load(self.kg_path, format=format)
        # remove triples with excluded relation
        remove_rel_triples(self.g, exclude_rels)
        # remove triples with relations between class-level constructs
        if clean_schema:
            remove_rel_triples(self.g, schema_relations)
        if excluded_entities is not None:
            remove_ent_triples(self.g, excluded_entities)
        if amberg_params:
            path_to_events = amberg_params[0]
            max_events = amberg_params[1]
            self.merged = get_merged_dataframe(path_to_events, max_events)
            self.unique_msgs, unique_vars, unique_mods, unique_fes = \
                get_unique_entities(self.merged)
            update_amberg_ontology(self.g, self.ent_dict, self.unique_msgs,
                                   unique_mods, unique_fes, unique_vars,
                                   self.merged)
        self.update_entity_relation_dictionaries()

    def update_entity_relation_dictionaries(self):
        """
        Given an existing entity dictionary, update it to *ontology*

        :param ontology:
        :param ent_dict: the existing entity dictionary
        :return:
        """
        ent_counter = 0
        fixed_ids = set([id for id in self.ent_dict.values()])
        # sorting ensures equal random splits on equal seeds
        for h in sorted(
                set(self.g.subjects(None, None)).union(
                    set(self.g.objects(None, None)))):
            uni_h = unicode(h)
            if uni_h not in self.ent_dict:
                while ent_counter in fixed_ids:
                    ent_counter += 1
                self.ent_dict.setdefault(uni_h, ent_counter)
                ent_counter += 1
        # add new relations to dict
        for r in sorted(set(self.g.predicates(None, None))):
            uni_r = unicode(r)
            if uni_r not in self.rel_dict:
                self.rel_dict.setdefault(uni_r, len(self.rel_dict))

    def load_unique_msgs_from_txt(self, path, max_events=None):
        """
        Assuming csv text files with two columns

        :param path:
        :return:
        """
        with open(path, "rb") as f:
            for line in f:
                split = line.split(',')
                try:
                    emb_id = int(split[1].strip())
                except:
                    print("Error reading id of {0} in given dictionary".format(line))
                    # skip this event entity, treat it as a common entity later on
                    continue
                self.ent_dict[split[0]] = emb_id
        # sort ascending w.r.t. embedding id, in case of later stripping
        # self.ent_dict = sorted(self.ent_dict.items(), key=operator.itemgetter(1), reverse=False)
        self.unique_msgs = self.ent_dict.copy()
        if max_events is not None:
            all_msgs = sorted(self.unique_msgs.items(),
                              key=operator.itemgetter(1), reverse=False)
            self.unique_msgs = dict(all_msgs[:max_events])
            excluded_events = dict(all_msgs[max_events:]).keys()
            return excluded_events

    def prepare_sequences(self, path_to_input, use_dict=True):
        """
        Dumps pickle for sequences and dictionary

        :param data_frame:
        :param file_name:
        :param index:
        :param classification_event:
        :return:
        """
        print("Preparing sequential data...")
        with open(path_to_input, "rb") as f:
            result = []
            for line in f:
                entities = line.split(',')
                if use_dict:
                    result.append([
                        int(e.strip()) for e in entities
                        if int(e.strip()) in self.unique_msgs.values()
                    ])
                else:
                    result.append([int(e.strip()) for e in entities])
        print("Processed {0} sequences".format(len(result)))
        return result

    def get_vocab_size(self):
        return len(self.unique_msgs)

    def get_ent_dict(self):
        return self.ent_dict

    def get_rel_dict(self):
        return self.rel_dict

    def get_kg(self):
        return self.g

    def get_unique_msgs(self):
        return self.unique_msgs

    def get_merged(self):
        return self.merged
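A hypothetical driver for PreProcessor; the file names are invented, and the helper functions it relies on (remove_rel_triples and friends) are assumed to come from the surrounding module:

pp = PreProcessor("knowledge_graph.rdf")
pp.load_knowledge_graph(format="xml", exclude_rels=[], clean_schema=True)
print("entities: %d, relations: %d" % (len(pp.get_ent_dict()), len(pp.get_rel_dict())))
# parse comma-separated event-id sequences without dictionary filtering
sequences = pp.prepare_sequences("events.csv", use_dict=False)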
def test_multiple_value_urls_in_virtual():
    csvw = CSVW(csv_path="tests/value_urls.csv",
                metadata_path="tests/value_urls.csv-metadata.json")
    rdf_contents = csvw.to_rdf(fmt="nt")
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="nt")

    # Test subjects
    all_subjects = list(g.subjects())
    s_amount = NS['amount']
    s_desc = NS['description']
    s_id = NS['id']
    assert s_amount in all_subjects
    assert s_desc in all_subjects
    assert s_id in all_subjects

    # Test descriptions
    p_def = NS['definition']
    assert len(list(g.triples(
        (s_amount, p_def, Literal("the amount paid"))))) == 1
    assert len(list(g.triples(
        (s_desc, p_def, Literal("description of the expense"))))) == 1
    assert len(list(g.triples((s_id, p_def, Literal("transaction id"))))) == 1

    # Test each is an element type
    o_element = NS['element']
    assert len(list(g.triples((s_amount, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_desc, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_id, RDF.type, o_element)))) == 1

    # Test that range is specified
    r_amount = NS['element/amount-RANGE']
    r_desc = NS['element/description-RANGE']
    r_id = NS['element/id-RANGE']
    assert len(list(g.triples((s_amount, RDFS.range, r_amount)))) == 1
    assert len(list(g.triples((s_desc, RDFS.range, r_desc)))) == 1
    assert len(list(g.triples((s_id, RDFS.range, r_id)))) == 1

    # Range is another subject
    assert r_amount in all_subjects
    assert r_desc in all_subjects
    assert r_id in all_subjects

    # Range is an OWL datatype of the specified type
    assert len(list(g.triples((r_amount, OWL.onDatatype, XSD.decimal)))) == 1
    assert len(list(g.triples((r_desc, OWL.onDatatype, XSD.string)))) == 1
    assert len(list(g.triples((r_id, OWL.onDatatype, XSD.integer)))) == 1

    # Check the restrictions for amount
    rest_amount_node = list(g.triples((r_amount, OWL.withRestrictions, None)))
    rest_amount_node = rest_amount_node[0][2]
    assert isinstance(rest_amount_node, BNode)
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.decimal)))) == 1
    rest_amount_node = next(g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.MaxLength)))) == 1
    rest_amount_node = next(g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first,
                               Literal(10, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.MinLength)))) == 1
    rest_amount_node = next(g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first,
                               Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_amount_node, RDF.rest, None)))) == 0

    # Check the restrictions for description
    rest_desc_node = list(g.triples((r_desc, OWL.withRestrictions, None)))
    rest_desc_node = rest_desc_node[0][2]
    assert isinstance(rest_desc_node, BNode)
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.string)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.MaxLength)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first,
                               Literal(100, datatype=XSD.nonNegativeInteger))))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_desc_node, RDF.rest, None)))) == 0

    # Check the restrictions for id
    rest_id_node = list(g.triples((r_id, OWL.withRestrictions, None)))
    rest_id_node = rest_id_node[0][2]
    assert isinstance(rest_id_node, BNode)
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.integer)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.MinLength)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first,
                               Literal(0, datatype=XSD.nonNegativeInteger))))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_id_node, RDF.rest, None)))) == 0

    # Check constant value for each
    const_prop = NS['another-list-value-with-constants']
    for s in [r_amount, r_id, r_desc]:
        # use the loop variable s here; the original always queried r_amount
        constant_node = list(g.triples((s, const_prop, None)))
        constant_node = constant_node[0][2]
        assert isinstance(constant_node, BNode)
        assert len(list(g.triples((constant_node, RDF.first, XSD.Length)))) == 1
        constant_node = next(g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples((constant_node, RDF.first,
                                   Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
        constant_node = next(g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples((constant_node, RDF.first, None)))) == 0
        assert len(list(g.triples((constant_node, RDF.rest, None)))) == 0

    # Verify that empty valueUrl does not end up in graph or rdf contents
    assert NS['empty-list-predicate1'] not in list(g.objects())
    assert "empty-list-predicate1" not in rdf_contents

    # Verify that empty valueUrl does not end up in graph
    assert NS['empty-list-predicate2'] not in list(g.objects())
    assert "empty-list-predicate2" not in rdf_contents

    # Test total number of lists through rdf:nils in order to verify each list
    # ends up with a nil
    test_num_lists = 3 * 3  # 3 rows and 3 virtual list-valued columns
    nil_text = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> ."
    assert rdf_contents.count(nil_text) == test_num_lists